Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 122 additions & 14 deletions crates/genie-health/src/checker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,17 +111,7 @@ impl HealthMonitor {
for (name, url) in &services {
let status = check_http(name, url).await;

// Log to SQLite.
let _ = self.db.execute(
"INSERT INTO health_log (ts_ms, service, healthy, response_ms, error) VALUES (?1, ?2, ?3, ?4, ?5)",
rusqlite::params![
ts_ms,
status.name,
status.healthy as i32,
status.response_ms,
status.error,
],
);
insert_health_log(&self.db, ts_ms, &status);

if status.healthy {
if self.failure_counts.remove(name).is_some() {
Expand All @@ -147,9 +137,7 @@ impl HealthMonitor {

// Prune logs older than 24h every ~120 checks (~1 hour at 30s interval).
let cutoff = ts_ms.saturating_sub(24 * 3600 * 1000);
let _ = self
.db
.execute("DELETE FROM health_log WHERE ts_ms < ?1", [cutoff]);
prune_health_log(&self.db, cutoff);
}

async fn send_alert(&self, status: &ServiceStatus) {
Expand Down Expand Up @@ -181,6 +169,35 @@ impl HealthMonitor {
}
}

fn insert_health_log(db: &Connection, ts_ms: u64, status: &ServiceStatus) {
if let Err(e) = db.execute(
"INSERT INTO health_log (ts_ms, service, healthy, response_ms, error) VALUES (?1, ?2, ?3, ?4, ?5)",
rusqlite::params![
ts_ms,
status.name,
status.healthy as i32,
status.response_ms,
status.error,
],
) {
tracing::error!(
service = %status.name,
error = %e,
"failed to insert health_log row"
);
}
}

fn prune_health_log(db: &Connection, cutoff_ts_ms: u64) {
if let Err(e) = db.execute("DELETE FROM health_log WHERE ts_ms < ?1", [cutoff_ts_ms]) {
tracing::error!(
cutoff_ts_ms,
error = %e,
"failed to prune health_log rows"
);
}
}

async fn check_http(name: &str, url: &str) -> ServiceStatus {
let start = std::time::Instant::now();

Expand Down Expand Up @@ -367,4 +384,95 @@ mod tests {

assert_eq!(llm_url, "http://127.0.0.1:9999/v1/health");
}

fn open_test_db(dir: &std::path::Path) -> Connection {
let db_path = dir.join("health.db");
let db = Connection::open(&db_path).unwrap();
db.execute_batch(
"
CREATE TABLE IF NOT EXISTS health_log (
ts_ms INTEGER NOT NULL,
service TEXT NOT NULL,
healthy INTEGER NOT NULL,
response_ms INTEGER NOT NULL,
error TEXT
);
",
)
.unwrap();
db
}

#[test]
fn health_log_insert_and_prune_on_writable_db() {
let dir =
std::env::temp_dir().join(format!("genie-health-writable-{}", std::process::id()));
let _ = std::fs::remove_dir_all(&dir);
std::fs::create_dir_all(&dir).unwrap();

let db = open_test_db(&dir);
let status = ServiceStatus {
name: "core".into(),
url: "http://127.0.0.1:3000/api/health".into(),
healthy: true,
response_ms: 12,
error: None,
};

insert_health_log(&db, 1_000, &status);
insert_health_log(&db, 2_000, &status);

let count: i64 = db
.query_row("SELECT COUNT(*) FROM health_log", [], |row| row.get(0))
.unwrap();
assert_eq!(count, 2);

prune_health_log(&db, 1_500);
let count: i64 = db
.query_row("SELECT COUNT(*) FROM health_log", [], |row| row.get(0))
.unwrap();
assert_eq!(count, 1);

let _ = std::fs::remove_dir_all(&dir);
}

#[test]
#[cfg(unix)]
fn health_log_write_errors_do_not_panic_on_readonly_db() {
use std::os::unix::fs::PermissionsExt;

let dir =
std::env::temp_dir().join(format!("genie-health-readonly-{}", std::process::id()));
let _ = std::fs::remove_dir_all(&dir);
std::fs::create_dir_all(&dir).unwrap();

let db_path = dir.join("health.db");
{
let db = open_test_db(&dir);
drop(db);
}

let mut perms = std::fs::metadata(&db_path).unwrap().permissions();
perms.set_mode(0o444);
std::fs::set_permissions(&db_path, perms).unwrap();

let db = Connection::open_with_flags(&db_path, rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY)
.unwrap();

let status = ServiceStatus {
name: "core".into(),
url: "http://127.0.0.1:3000/api/health".into(),
healthy: false,
response_ms: 0,
error: Some("timeout".into()),
};

insert_health_log(&db, 9_000, &status);
prune_health_log(&db, 0);

let mut perms = std::fs::metadata(&db_path).unwrap().permissions();
perms.set_mode(0o644);
std::fs::set_permissions(&db_path, perms).unwrap();
let _ = std::fs::remove_dir_all(&dir);
}
}
Loading