diff --git a/Cargo.lock b/Cargo.lock index 9349625..4a395e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2206,7 +2206,7 @@ dependencies = [ [[package]] name = "pm-agent-client" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "chrono", @@ -2223,7 +2223,7 @@ dependencies = [ [[package]] name = "pm-auth" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "argon2", @@ -2250,7 +2250,7 @@ dependencies = [ [[package]] name = "pm-ca" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "chrono", @@ -2273,7 +2273,7 @@ dependencies = [ [[package]] name = "pm-core" -version = "0.1.0" +version = "0.1.1" dependencies = [ "aes-gcm", "anyhow", @@ -2297,7 +2297,7 @@ dependencies = [ [[package]] name = "pm-reports" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "chrono", @@ -2318,7 +2318,7 @@ dependencies = [ [[package]] name = "pm-web" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "axum", @@ -2354,7 +2354,7 @@ dependencies = [ [[package]] name = "pm-worker" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "chrono", diff --git a/crates/pm-core/src/models.rs b/crates/pm-core/src/models.rs index e4e4984..876817f 100644 --- a/crates/pm-core/src/models.rs +++ b/crates/pm-core/src/models.rs @@ -135,6 +135,7 @@ pub struct HealthCheck { pub expected_body: Option, pub ignore_cert_errors: bool, pub basic_auth_user: Option, + pub target_host_id: Option, // basic_auth_pass_encrypted and nonce NOT exposed in API responses pub created_at: DateTime, pub updated_at: DateTime, @@ -168,6 +169,7 @@ pub struct CreateHealthCheckRequest { pub ignore_cert_errors: bool, pub basic_auth_user: Option, pub basic_auth_pass: Option, // plaintext in request, encrypted before storage + pub target_host_id: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -180,6 +182,7 @@ pub struct UpdateHealthCheckRequest { pub ignore_cert_errors: Option, pub basic_auth_user: Option, pub basic_auth_pass: Option, // if 
provided, re-encrypt + pub target_host_id: Option, } fn default_true() -> bool { diff --git a/crates/pm-web/src/routes/health_checks.rs b/crates/pm-web/src/routes/health_checks.rs index 2dad360..fb5d517 100644 --- a/crates/pm-web/src/routes/health_checks.rs +++ b/crates/pm-web/src/routes/health_checks.rs @@ -148,6 +148,7 @@ async fn list_health_checks( r#" SELECT id, host_id, name, check_type, enabled, service_name, url, expected_body, ignore_cert_errors, basic_auth_user, + target_host_id, created_at, updated_at FROM host_health_checks WHERE host_id = $1 @@ -256,6 +257,55 @@ async fn create_health_check( )); } + // Validate target_host_id if provided + if let Some(tid) = req.target_host_id { + let target_exists: bool = sqlx::query_scalar( + "SELECT EXISTS (SELECT 1 FROM hosts WHERE id = $1)", + ) + .bind(tid) + .fetch_one(&state.db) + .await + .map_err(|e| { + tracing::error!(error = %e, "Failed to check target host"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })), + ) + })?; + + if !target_exists { + return Err(( + StatusCode::BAD_REQUEST, + Json( + json!({ "error": { "code": "invalid_target_host", "message": "Target host does not exist" } }), + ), + )); + } + + let target_healthy: bool = sqlx::query_scalar( + "SELECT health_status = 'healthy' FROM hosts WHERE id = $1", + ) + .bind(tid) + .fetch_one(&state.db) + .await + .map_err(|e| { + tracing::error!(error = %e, "Failed to check target host health"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })), + ) + })?; + + if !target_healthy { + return Err(( + StatusCode::BAD_REQUEST, + Json( + json!({ "error": { "code": "invalid_target_host", "message": "Target host is not currently healthy" } }), + ), + )); + } + } + // Enforce max 5 per host let count: i64 = sqlx::query_scalar( "SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1", @@ -310,8 +360,9 @@ 
async fn create_health_check( INSERT INTO host_health_checks ( host_id, name, check_type, enabled, service_name, url, expected_body, ignore_cert_errors, - basic_auth_user, basic_auth_pass_encrypted, basic_auth_pass_nonce - ) VALUES ($1, $2, $3, true, $4, $5, $6, $7, $8, $9, $10) + basic_auth_user, basic_auth_pass_encrypted, basic_auth_pass_nonce, + target_host_id + ) VALUES ($1, $2, $3, true, $4, $5, $6, $7, $8, $9, $10, $11) RETURNING id "#, ) @@ -325,6 +376,7 @@ async fn create_health_check( .bind(&req.basic_auth_user) .bind(pass_encrypted) .bind(pass_nonce) + .bind(req.target_host_id) .fetch_one(&state.db) .await .map_err(|e| { @@ -347,6 +399,7 @@ async fn create_health_check( "check_id": check_id, "name": req.name, "check_type": req.check_type, + "target_host_id": req.target_host_id, }), None, None, @@ -361,6 +414,7 @@ async fn create_health_check( "name": req.name, "check_type": req.check_type, "enabled": true, + "target_host_id": req.target_host_id, })), )) } @@ -397,6 +451,7 @@ async fn get_health_check( r#" SELECT id, host_id, name, check_type, enabled, service_name, url, expected_body, ignore_cert_errors, basic_auth_user, + target_host_id, created_at, updated_at FROM host_health_checks WHERE id = $1 AND host_id = $2 @@ -495,6 +550,55 @@ async fn update_health_check( )); } + // Validate target_host_id if provided + if let Some(tid) = req.target_host_id { + let target_exists: bool = sqlx::query_scalar( + "SELECT EXISTS (SELECT 1 FROM hosts WHERE id = $1)", + ) + .bind(tid) + .fetch_one(&state.db) + .await + .map_err(|e| { + tracing::error!(error = %e, "Failed to check target host"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })), + ) + })?; + + if !target_exists { + return Err(( + StatusCode::BAD_REQUEST, + Json( + json!({ "error": { "code": "invalid_target_host", "message": "Target host does not exist" } }), + ), + )); + } + + let target_healthy: bool = sqlx::query_scalar( + 
"SELECT health_status = 'healthy' FROM hosts WHERE id = $1", + ) + .bind(tid) + .fetch_one(&state.db) + .await + .map_err(|e| { + tracing::error!(error = %e, "Failed to check target host health"); + ( + StatusCode::INTERNAL_SERVER_ERROR, + Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })), + ) + })?; + + if !target_healthy { + return Err(( + StatusCode::BAD_REQUEST, + Json( + json!({ "error": { "code": "invalid_target_host", "message": "Target host is not currently healthy" } }), + ), + )); + } + } + // Handle basic_auth_pass encryption if provided let (pass_encrypted, pass_nonce) = if let Some(ref pass) = req.basic_auth_pass { let key_path = PathBuf::from(crypto::KEY_PATH); @@ -584,9 +688,10 @@ async fn update_health_check( basic_auth_user = COALESCE($9, basic_auth_user), basic_auth_pass_encrypted = COALESCE($10, basic_auth_pass_encrypted), basic_auth_pass_nonce = COALESCE($11, basic_auth_pass_nonce), + target_host_id = COALESCE($12, target_host_id), updated_at = NOW() WHERE id = $1 AND host_id = $2 - "#, + "#, ) .bind(check_id) .bind(host_id) @@ -599,6 +704,7 @@ async fn update_health_check( .bind(&req.basic_auth_user) .bind(pass_encrypted) .bind(pass_nonce) + .bind(req.target_host_id) .execute(&state.db) .await .map_err(|e| { @@ -624,7 +730,7 @@ async fn update_health_check( None, Some("host"), Some(&host_id.to_string()), - json!({ "check_id": check_id }), + json!({ "check_id": check_id, "target_host_id": req.target_host_id }), None, None, ) @@ -731,6 +837,7 @@ async fn test_health_check( r#" SELECT id, host_id, name, check_type, enabled, service_name, url, expected_body, ignore_cert_errors, basic_auth_user, + target_host_id, created_at, updated_at FROM host_health_checks WHERE id = $1 AND host_id = $2 @@ -815,17 +922,18 @@ async fn run_service_check(check: &HealthCheck, state: &AppState) -> CheckResult }, }; - // Get host info for agent connection - let host_info: Option<(String, String)> = sqlx::query_as::<_, (String, 
String)>( - "SELECT host(ip_address)::text, fqdn FROM hosts WHERE id = $1", + // Get host info for agent connection — use target_host_id if set, otherwise own host + let effective_host_id = check.target_host_id.unwrap_or(check.host_id); + let host_info: Option<(String, i32)> = sqlx::query_as::<_, (String, i32)>( + "SELECT host(ip_address)::text, agent_port FROM hosts WHERE id = $1", ) - .bind(check.host_id) + .bind(effective_host_id) .fetch_optional(&state.db) .await .ok() .flatten(); - let (ip, fqdn) = match host_info { + let (ip, agent_port) = match host_info { Some(info) => info, None => { return CheckResult { @@ -838,8 +946,8 @@ async fn run_service_check(check: &HealthCheck, state: &AppState) -> CheckResult // Build agent URL let agent_url = format!( - "https://{}:12443/api/v1/system/services/{}", - ip, service_name + "https://{}:{}/api/v1/system/services/{}", + ip, agent_port, service_name ); let start = std::time::Instant::now(); diff --git a/crates/pm-worker/src/health_check_poller.rs b/crates/pm-worker/src/health_check_poller.rs index 361b6c3..c54775b 100644 --- a/crates/pm-worker/src/health_check_poller.rs +++ b/crates/pm-worker/src/health_check_poller.rs @@ -21,7 +21,7 @@ use pm_agent_client::{AgentClient, AgentClientError}; // ───────────────────────────────────────────────────────────────────────────── /// Row fetched for each enabled health check, joined with host connection info. 
-#[derive(Debug, FromRow)] +#[derive(FromRow)] struct HealthCheckRow { id: Uuid, host_id: Uuid, @@ -34,6 +34,7 @@ struct HealthCheckRow { basic_auth_user: Option, basic_auth_pass_encrypted: Option>, basic_auth_pass_nonce: Option>, + target_host_id: Option, ip_address: String, agent_port: i32, } @@ -99,10 +100,12 @@ pub async fn run_health_check_poller(pool: PgPool, config: Arc) { hc.basic_auth_user, hc.basic_auth_pass_encrypted, hc.basic_auth_pass_nonce, - host(h.ip_address)::text AS ip_address, - h.agent_port + hc.target_host_id, + host(COALESCE(th.ip_address, h.ip_address))::text AS ip_address, + COALESCE(th.agent_port, h.agent_port) AS agent_port FROM host_health_checks hc JOIN hosts h ON h.id = hc.host_id + LEFT JOIN hosts th ON th.id = hc.target_host_id WHERE hc.enabled = TRUE ORDER BY hc.id "#, diff --git a/frontend/src/pages/HostDetailPage.tsx b/frontend/src/pages/HostDetailPage.tsx index 03a3d10..770acf5 100644 --- a/frontend/src/pages/HostDetailPage.tsx +++ b/frontend/src/pages/HostDetailPage.tsx @@ -18,6 +18,7 @@ import { Grid, IconButton, InputLabel, + FormHelperText, MenuItem, Paper, Select, @@ -213,6 +214,7 @@ interface HealthCheckFormValues { basic_auth_user: string basic_auth_pass: string enabled: boolean + target_host_id: string } function defaultHealthCheckForm(): HealthCheckFormValues { @@ -226,6 +228,7 @@ function defaultHealthCheckForm(): HealthCheckFormValues { basic_auth_user: '', basic_auth_pass: '', enabled: true, + target_host_id: '', } } @@ -235,11 +238,13 @@ interface HealthCheckFormDialogProps { open: boolean title: string initial: HealthCheckFormValues + hosts: { id: string; display_name: string; fqdn: string }[] + currentHostId: string onClose: () => void onSubmit: (values: HealthCheckFormValues) => Promise } -function HealthCheckFormDialog({ open, title, initial, onClose, onSubmit }: HealthCheckFormDialogProps) { +function HealthCheckFormDialog({ open, title, initial, hosts, currentHostId, onClose, onSubmit }: 
HealthCheckFormDialogProps) { const [form, setForm] = useState(initial) const [saving, setSaving] = useState(false) const [err, setErr] = useState(null) @@ -276,8 +281,20 @@ function HealthCheckFormDialog({ open, title, initial, onClose, onSubmit }: Heal {form.check_type === 'service' && ( - set('service_name', e.target.value)} required fullWidth - helperText="Systemd service unit name to check" /> + <> + set('service_name', e.target.value)} required fullWidth + helperText="Systemd service unit name to check" /> + + Target Host (optional) + + Query a service on a different host's agent (for redundant services) + + )} {form.check_type === 'http' && ( <> @@ -591,6 +608,9 @@ export default function HostDetailPage() { const [reissueLoading, setReissueLoading] = useState(false) const [reissueError, setReissueError] = useState(null) + // Hosts list for target_host_id dropdown + const [hosts, setHosts] = useState<{ id: string; display_name: string; fqdn: string }[]>([]) + // ── Fetch host ──────────────────────────────────────────────────────────── useEffect(() => { if (id === 'new') { setLoading(false); return } @@ -599,6 +619,13 @@ export default function HostDetailPage() { .catch(() => setError('Host not found or access denied.')) .finally(() => setLoading(false)) }, [id]) + + // ── Fetch hosts list (for target_host_id dropdown) ────────────────────── + useEffect(() => { + hostsApi.list() + .then(res => setHosts(res.data?.hosts ?? 
[])) + .catch(() => { /* ignore */ }) + }, []) // ── Check cert existence ─────────────────────────────────────────────────── useEffect(() => { @@ -754,6 +781,7 @@ export default function HostDetailPage() { } if (values.check_type === 'service') { body.service_name = values.service_name || undefined + body.target_host_id = values.target_host_id || undefined } else { body.url = values.url || undefined body.expected_body = values.expected_body || undefined @@ -780,6 +808,7 @@ export default function HostDetailPage() { basic_auth_user: check.basic_auth_user ?? '', basic_auth_pass: '', enabled: check.enabled, + target_host_id: check.target_host_id ?? '', }) setHcEditOpen(true) } @@ -792,6 +821,7 @@ export default function HostDetailPage() { } if (values.check_type === 'service') { body.service_name = values.service_name || undefined + body.target_host_id = values.target_host_id || undefined } else { body.url = values.url || undefined body.expected_body = values.expected_body || undefined @@ -1145,6 +1175,8 @@ export default function HostDetailPage() { open={hcCreateOpen} title="Add Health Check" initial={hcCreateForm} + hosts={hosts} + currentHostId={id ?? ''} onClose={() => setHcCreateOpen(false)} onSubmit={handleHcCreateSubmit} /> @@ -1152,6 +1184,8 @@ export default function HostDetailPage() { open={hcEditOpen} title="Edit Health Check" initial={hcEditForm} + hosts={hosts} + currentHostId={id ?? 
''} onClose={() => setHcEditOpen(false)} onSubmit={handleHcEditSubmit} /> diff --git a/frontend/src/types/index.ts b/frontend/src/types/index.ts index 6d62327..bd3c31a 100644 --- a/frontend/src/types/index.ts +++ b/frontend/src/types/index.ts @@ -282,6 +282,7 @@ export interface HealthCheck { expected_body?: string ignore_cert_errors: boolean basic_auth_user?: string + target_host_id?: string | null created_at: string updated_at: string } @@ -313,6 +314,7 @@ export interface CreateHealthCheckRequest { ignore_cert_errors?: boolean basic_auth_user?: string basic_auth_pass?: string + target_host_id?: string | null } export interface UpdateHealthCheckRequest { @@ -324,4 +326,5 @@ export interface UpdateHealthCheckRequest { ignore_cert_errors?: boolean basic_auth_user?: string basic_auth_pass?: string + target_host_id?: string | null } diff --git a/migrations/011_health_check_target_host.sql b/migrations/011_health_check_target_host.sql new file mode 100644 index 0000000..25322ce --- /dev/null +++ b/migrations/011_health_check_target_host.sql @@ -0,0 +1,10 @@ +-- Add target_host_id to health checks, allowing a check on Host A +-- to query a service on Host B's agent (for redundant services). +-- NULL = check own host (backward compatible). +-- FK with ON DELETE SET NULL: if target host deleted, revert to default. + +ALTER TABLE host_health_checks + ADD COLUMN target_host_id UUID REFERENCES hosts(id) ON DELETE SET NULL; + +CREATE INDEX idx_health_checks_target_host ON host_health_checks (target_host_id) + WHERE target_host_id IS NOT NULL; diff --git a/tasks/todo.md b/tasks/todo.md index 0dac267..35373b6 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -1,253 +1,61 @@ -# Health Check Configuration for Hosts — Feature Plan +# Target Host for Service Health Checks ## Overview -Each host can have 1-5 health checks. During maintenance windows, all checks must be healthy before patch execution. Health checks are also continuously polled for dashboard status. 
+Add `target_host_id` field to service health checks, allowing a check configured on Host A to query a service on Host B's agent. Useful for redundant services running on multiple machines. -## Design Decisions (confirmed with Kelly) -- Health check config lives in manager DB, manager controls everything -- Agent is just an action endpoint (no health check logic on agent) -- Admins and operators can manage (operators need matching group) -- Per-host 1:1 (not shareable templates) -- Both continuous polling AND must be healthy before patch execution -- Service health: query agent `GET /api/v1/system/services/{name}`, check `healthy` field -- HTTP health: manager makes direct HTTP request, substring match in response body -- Retry failed checks at 5-minute intervals until maintenance window closes -- 10-second check timeout; no response = failed -- Order doesn't matter -- Basic auth password: encrypted in DB with per-install app key -- Health check poll interval: 5 minutes -- Result retention: 4 days (time-based) +**Design:** `target_host_id` is nullable. When NULL (default), behavior unchanged — check queries its own host's agent. When set, the service check queries the target host's agent instead. Only applies to service checks; HTTP checks already specify a full URL. -## linux_patch_api Agent Endpoints (Reference) +## Implementation Checklist -### GET /api/v1/system/services/{name} -Returns service status for health check Type 1 (service). Added in commit 8b6d9ed. +### 1. 
Database Migration +- [ ] Create `migrations/011_health_check_target_host.sql` +- [ ] Add `target_host_id UUID REFERENCES hosts(id) ON DELETE SET NULL` column +- [ ] Add partial index on `target_host_id` where NOT NULL -**Response:** `ApiResponse` -```json -{ - "success": true, - "data": { - "name": "nginx", - "display_name": "A high performance web server", - "active_state": "active", - "sub_state": "running", - "load_state": "loaded", - "enabled_state": "enabled", - "main_pid": 1234, - "healthy": true - } -} -``` +### 2. Backend Models (`crates/pm-core/src/models.rs`) +- [ ] Add `target_host_id: Option<Uuid>` to `HealthCheck` struct +- [ ] Add `target_host_id: Option<Uuid>` to `CreateHealthCheckRequest` +- [ ] Add `target_host_id: Option<Uuid>` to `UpdateHealthCheckRequest` +- [ ] Add `target_host_id` to all HealthCheck SELECT queries -**Health determination:** The agent `healthy` field is authoritative. Manager uses this boolean directly. +### 3. API Routes (`crates/pm-web/src/routes/health_checks.rs`) +- [ ] Create: add `target_host_id` to INSERT, validate target host exists + is healthy +- [ ] Update: add `target_host_id` to COALESCE UPDATE +- [ ] List/Get: add `target_host_id` to SELECT columns +- [ ] Test endpoint (`run_service_check`): when `target_host_id` is Some, query that host's IP/port +- [ ] Audit log: include `target_host_id` in audit JSON -**Error responses:** 400 (invalid name), 404 (not found → unhealthy), 500 (error → unhealthy) +### 4. 
Health Check Poller (`crates/pm-worker/src/health_check_poller.rs`) +- [ ] Add `target_host_id: Option<Uuid>` to `HealthCheckRow` +- [ ] Modify SQL: LEFT JOIN hosts th ON th.id = hc.target_host_id, use COALESCE(th.ip_address, h.ip_address) and COALESCE(th.agent_port, h.agent_port) +- [ ] Add `target_ip_address` and `target_agent_port` fields to HealthCheckRow +- [ ] `run_service_check`: use target host IP/port when available +- [ ] `check_host_health_checks`: no change needed (results count toward owning host) -### GET /api/v1/health -Basic agent health. Returns `{"status": "healthy"}`. Used for connectivity checks, not host health checks. +### 5. Frontend Types (`frontend/src/types/index.ts`) +- [ ] Add `target_host_id?: string` to `HealthCheck` +- [ ] Add `target_host_id?: string` to `CreateHealthCheckRequest` +- [ ] Add `target_host_id?: string` to `UpdateHealthCheckRequest` ---- +### 6. Frontend Form (`frontend/src/pages/HostDetailPage.tsx`) +- [ ] Add `target_host_id: string` to `HealthCheckFormValues` +- [ ] Add `target_host_id: ''` to `defaultHealthCheckForm` +- [ ] Add host selector dropdown in `HealthCheckFormDialog` (visible when check_type === 'service') +- [ ] Fetch hosts list for dropdown (use hostsApi.list or a dedicated endpoint) +- [ ] `handleHcCreateSubmit`: include `target_host_id: values.target_host_id || undefined` +- [ ] `handleHcEditClick`: map `check.target_host_id ?? ''` to form +- [ ] `handleHcEditSubmit`: include `target_host_id` in UpdateHealthCheckRequest +- [ ] Display target host in health checks table Target column -## Phase 1: Database Schema +### 7. 
Build, Test, Deploy +- [ ] Run `cargo fmt --all` + `cargo clippy` + `cargo test` +- [ ] Run frontend build + ESLint + tsc +- [ ] Commit and push through CI pipeline +- [ ] Tag release, build .deb, deploy to dev -### New table: `host_health_checks` -```sql -CREATE TABLE host_health_checks ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - host_id UUID NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, - name VARCHAR(100) NOT NULL, - check_type VARCHAR(20) NOT NULL CHECK (check_type IN ('service', 'http')), - enabled BOOLEAN NOT NULL DEFAULT true, - -- Service check fields - service_name VARCHAR(200), - -- HTTP check fields - url TEXT, - expected_body VARCHAR(500), - ignore_cert_errors BOOLEAN DEFAULT true, - basic_auth_user VARCHAR(100), - basic_auth_pass_encrypted BYTEA, -- AES-256-GCM encrypted with per-install key - basic_auth_pass_nonce BYTEA, -- nonce for AES-GCM - -- Metadata - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), - CONSTRAINT valid_service_check CHECK ( - (check_type = 'service' AND service_name IS NOT NULL AND url IS NULL) - OR - (check_type = 'http' AND url IS NOT NULL AND expected_body IS NOT NULL AND service_name IS NULL) - ) -); - -CREATE INDEX idx_health_checks_host ON host_health_checks (host_id); -``` - -### New table: `host_health_check_results` -```sql -CREATE TABLE host_health_check_results ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - check_id UUID NOT NULL REFERENCES host_health_checks(id) ON DELETE CASCADE, - healthy BOOLEAN NOT NULL, - detail TEXT, - latency_ms INTEGER, - checked_at TIMESTAMPTZ NOT NULL DEFAULT NOW() -); - -CREATE INDEX idx_health_results_check ON host_health_check_results (check_id, checked_at DESC); -``` - -### Encryption key storage -- Per-install app key stored at `/etc/patch-manager/keys/health-check.key` -- 256-bit random key generated on first startup if not present -- File permissions: 0600, owned by patch-manager user -- Used for AES-256-GCM encryption 
of basic_auth_pass - -- [ ] Create migration 007_health_checks.sql -- [ ] Add models to pm-core/src/models.rs -- [ ] Add encryption utility to pm-core -- [ ] Verify migration runs on dev LXC - ---- - -## Phase 2: Backend API Routes - -### Endpoints -- `GET /api/v1/hosts/{id}/health-checks` — list health checks for host (RBAC scoped) -- `POST /api/v1/hosts/{id}/health-checks` — create health check (max 5 per host) -- `PUT /api/v1/hosts/{id}/health-checks/{check_id}` — update health check -- `DELETE /api/v1/hosts/{id}/health-checks/{check_id}` — delete health check -- `POST /api/v1/hosts/{id}/health-checks/{check_id}/test` — run check immediately, return result - -### Request/Response types -```rust -struct CreateHealthCheckRequest { - name: String, - check_type: String, // "service" or "http" - service_name: Option, - url: Option, - expected_body: Option, - ignore_cert_errors: Option, - basic_auth_user: Option, - basic_auth_pass: Option, // plaintext in request, encrypted before storage -} - -struct HealthCheck { - id: Uuid, - host_id: Uuid, - name: String, - check_type: String, - enabled: bool, - service_name: Option, - url: Option, - expected_body: Option, - ignore_cert_errors: bool, - basic_auth_user: Option, - // basic_auth_pass NOT returned in responses - last_result: Option, - created_at: DateTime, - updated_at: DateTime, -} - -struct HealthCheckResult { - healthy: bool, - detail: Option, - latency_ms: Option, - checked_at: DateTime, -} -``` - -- [ ] Add routes to pm-web/src/routes/ (new health_checks.rs) -- [ ] Add CRUD operations -- [ ] Add RBAC enforcement (admin all, operator matching group) -- [ ] Add max-5-per-host validation -- [ ] Add /test endpoint that runs check immediately -- [ ] Add audit logging for create/update/delete -- [ ] Add encryption/decryption for basic_auth_pass - ---- - -## Phase 3: Worker Health Check Engine - -### Continuous Polling -- New task in pm-worker: `health_check_poller` -- Polls all enabled health checks every 5 minutes -- 
For service checks: call agent `GET /api/v1/system/services/{name}` via mTLS, check `healthy` field -- For HTTP checks: make direct HTTP(S) request from manager, check status code + substring match -- Store results in `host_health_check_results` -- Prune results older than 4 days - -### Pre-Patch Execution Gate -- When a patch job is about to execute on a host: - 1. Check if host has any enabled health checks - 2. If yes, verify all are currently healthy (from latest poll result) - 3. If any are unhealthy: - - Wait and retry at 5-minute intervals - - Continue until maintenance window closes - - If window closes with failed checks, mark job host as failed with detail - 4. If all healthy, proceed with patch execution - -### HTTP Check Implementation -- Use reqwest with: - - 10-second timeout - - Accept invalid certs (ignore_cert_errors) - - Optional basic auth header (decrypt from DB) - - Check response body contains expected_body substring -- Return healthy=true if match, false otherwise - -- [ ] Add health_check_poller module to pm-worker -- [ ] Implement service check via AgentClient -- [ ] Implement HTTP check via reqwest -- [ ] Add pre-patch execution gate to job_executor -- [ ] Add retry loop with 5-minute intervals -- [ ] Add maintenance window expiry check -- [ ] Add health check config to WorkerConfig (poll interval) -- [ ] Add result pruning (4-day retention) - ---- - -## Phase 4: Frontend UI - -### Host Detail Page -- Add "Health Checks" section below host info -- List current health checks with status indicators -- Add/Edit/Delete health check dialogs -- "Test" button to run check immediately and show result -- Visual indicator: green check = healthy, red X = unhealthy, gray = unknown - -### Hosts Page -- Add health check summary column or indicator -- Show aggregate status: all healthy / some unhealthy / no checks configured - -### Deploy Page -- Show health check status in host selection table -- Warn if any selected hosts have unhealthy checks - -### 
Job Detail -- Show health check gate status when job is waiting for healthy checks -- Display which checks are passing/failing - -- [ ] Add HealthCheck types to frontend/src/types/index.ts -- [ ] Add health check API calls to frontend/src/api/client.ts -- [ ] Add Health Checks section to HostDetailPage.tsx -- [ ] Add health check status to HostsPage.tsx -- [ ] Add health check indicators to PatchDeploymentPage.tsx -- [ ] Add health check gate status to JobsPage.tsx detail view - ---- - -## Phase 5: Integration & Testing -- [ ] Build and deploy to dev LXC -- [ ] Test service health check against dev LXC agent -- [ ] Test HTTP health check against internal services -- [ ] Test pre-patch gate: deploy with failing check, verify retry behavior -- [ ] Test maintenance window expiry with failing checks -- [ ] Test RBAC: operator can only manage checks in their group -- [ ] Test max 5 checks per host enforcement -- [ ] Test basic auth encryption/decryption -- [ ] Push to Gitea - ---- - -## Resolved Items -- ~~Basic auth password storage~~ → Encrypted in DB with per-install app key (AES-256-GCM) -- ~~Health check poll interval~~ → 5 minutes -- ~~Result retention~~ → 4 days (time-based) +## Design Decisions +- `target_host_id` is nullable — NULL = check own host (backward compatible) +- FK with ON DELETE SET NULL — if target host deleted, revert to default +- Only applies to service checks (HTTP checks already have full URL) +- Health gate: results count toward the owning host, not the target host +- No RBAC required for target host — only requirement: target host exists in manager and is currently healthy