Private
Public Access
1
0

feat: add target_host_id to service health checks
All checks were successful
CI Pipeline / Rust Format Check (push) Successful in 6s
CI Pipeline / Clippy Lints (push) Successful in 45s
CI Pipeline / Rust Unit Tests (push) Successful in 1m2s
CI Pipeline / Security Audit (push) Successful in 3s
CI Pipeline / Frontend Lint & Type Check (push) Successful in 13s
CI Pipeline / Build .deb & Release (push) Has been skipped

- Add target_host_id column to host_health_checks table (nullable UUID FK)
- Allow service checks to query a different host's agent
- Backend models, API routes, and poller updated
- Frontend: host selector dropdown for service checks
- Validation: target host must exist and be healthy
- FK uses ON DELETE SET NULL: if the target host is deleted, the check reverts to querying its own host
This commit is contained in:
2026-05-06 21:38:42 +00:00
parent 4889ab5d0a
commit 0279caf5d2
8 changed files with 234 additions and 265 deletions

14
Cargo.lock generated
View File

@ -2206,7 +2206,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-agent-client" name = "pm-agent-client"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
@ -2223,7 +2223,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-auth" name = "pm-auth"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"argon2", "argon2",
@ -2250,7 +2250,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-ca" name = "pm-ca"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
@ -2273,7 +2273,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-core" name = "pm-core"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"aes-gcm", "aes-gcm",
"anyhow", "anyhow",
@ -2297,7 +2297,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-reports" name = "pm-reports"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
@ -2318,7 +2318,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-web" name = "pm-web"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"axum", "axum",
@ -2354,7 +2354,7 @@ dependencies = [
[[package]] [[package]]
name = "pm-worker" name = "pm-worker"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",

View File

@ -135,6 +135,7 @@ pub struct HealthCheck {
pub expected_body: Option<String>, pub expected_body: Option<String>,
pub ignore_cert_errors: bool, pub ignore_cert_errors: bool,
pub basic_auth_user: Option<String>, pub basic_auth_user: Option<String>,
pub target_host_id: Option<Uuid>,
// basic_auth_pass_encrypted and nonce NOT exposed in API responses // basic_auth_pass_encrypted and nonce NOT exposed in API responses
pub created_at: DateTime<Utc>, pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>, pub updated_at: DateTime<Utc>,
@ -168,6 +169,7 @@ pub struct CreateHealthCheckRequest {
pub ignore_cert_errors: bool, pub ignore_cert_errors: bool,
pub basic_auth_user: Option<String>, pub basic_auth_user: Option<String>,
pub basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage pub basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
pub target_host_id: Option<Uuid>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -180,6 +182,7 @@ pub struct UpdateHealthCheckRequest {
pub ignore_cert_errors: Option<bool>, pub ignore_cert_errors: Option<bool>,
pub basic_auth_user: Option<String>, pub basic_auth_user: Option<String>,
pub basic_auth_pass: Option<String>, // if provided, re-encrypt pub basic_auth_pass: Option<String>, // if provided, re-encrypt
pub target_host_id: Option<Uuid>,
} }
fn default_true() -> bool { fn default_true() -> bool {

View File

@ -148,6 +148,7 @@ async fn list_health_checks(
r#" r#"
SELECT id, host_id, name, check_type, enabled, SELECT id, host_id, name, check_type, enabled,
service_name, url, expected_body, ignore_cert_errors, basic_auth_user, service_name, url, expected_body, ignore_cert_errors, basic_auth_user,
target_host_id,
created_at, updated_at created_at, updated_at
FROM host_health_checks FROM host_health_checks
WHERE host_id = $1 WHERE host_id = $1
@ -256,6 +257,55 @@ async fn create_health_check(
)); ));
} }
// Validate target_host_id if provided
if let Some(tid) = req.target_host_id {
let target_exists: bool = sqlx::query_scalar(
"SELECT EXISTS (SELECT 1 FROM hosts WHERE id = $1)",
)
.bind(tid)
.fetch_one(&state.db)
.await
.map_err(|e| {
tracing::error!(error = %e, "Failed to check target host");
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
)
})?;
if !target_exists {
return Err((
StatusCode::BAD_REQUEST,
Json(
json!({ "error": { "code": "invalid_target_host", "message": "Target host does not exist" } }),
),
));
}
let target_healthy: bool = sqlx::query_scalar(
"SELECT health_status = 'healthy' FROM hosts WHERE id = $1",
)
.bind(tid)
.fetch_one(&state.db)
.await
.map_err(|e| {
tracing::error!(error = %e, "Failed to check target host health");
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
)
})?;
if !target_healthy {
return Err((
StatusCode::BAD_REQUEST,
Json(
json!({ "error": { "code": "invalid_target_host", "message": "Target host is not currently healthy" } }),
),
));
}
}
// Enforce max 5 per host // Enforce max 5 per host
let count: i64 = sqlx::query_scalar( let count: i64 = sqlx::query_scalar(
"SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1", "SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1",
@ -310,8 +360,9 @@ async fn create_health_check(
INSERT INTO host_health_checks ( INSERT INTO host_health_checks (
host_id, name, check_type, enabled, host_id, name, check_type, enabled,
service_name, url, expected_body, ignore_cert_errors, service_name, url, expected_body, ignore_cert_errors,
basic_auth_user, basic_auth_pass_encrypted, basic_auth_pass_nonce basic_auth_user, basic_auth_pass_encrypted, basic_auth_pass_nonce,
) VALUES ($1, $2, $3, true, $4, $5, $6, $7, $8, $9, $10) target_host_id
) VALUES ($1, $2, $3, true, $4, $5, $6, $7, $8, $9, $10, $11)
RETURNING id RETURNING id
"#, "#,
) )
@ -325,6 +376,7 @@ async fn create_health_check(
.bind(&req.basic_auth_user) .bind(&req.basic_auth_user)
.bind(pass_encrypted) .bind(pass_encrypted)
.bind(pass_nonce) .bind(pass_nonce)
.bind(req.target_host_id)
.fetch_one(&state.db) .fetch_one(&state.db)
.await .await
.map_err(|e| { .map_err(|e| {
@ -347,6 +399,7 @@ async fn create_health_check(
"check_id": check_id, "check_id": check_id,
"name": req.name, "name": req.name,
"check_type": req.check_type, "check_type": req.check_type,
"target_host_id": req.target_host_id,
}), }),
None, None,
None, None,
@ -361,6 +414,7 @@ async fn create_health_check(
"name": req.name, "name": req.name,
"check_type": req.check_type, "check_type": req.check_type,
"enabled": true, "enabled": true,
"target_host_id": req.target_host_id,
})), })),
)) ))
} }
@ -397,6 +451,7 @@ async fn get_health_check(
r#" r#"
SELECT id, host_id, name, check_type, enabled, SELECT id, host_id, name, check_type, enabled,
service_name, url, expected_body, ignore_cert_errors, basic_auth_user, service_name, url, expected_body, ignore_cert_errors, basic_auth_user,
target_host_id,
created_at, updated_at created_at, updated_at
FROM host_health_checks FROM host_health_checks
WHERE id = $1 AND host_id = $2 WHERE id = $1 AND host_id = $2
@ -495,6 +550,55 @@ async fn update_health_check(
)); ));
} }
// Validate target_host_id if provided
if let Some(tid) = req.target_host_id {
let target_exists: bool = sqlx::query_scalar(
"SELECT EXISTS (SELECT 1 FROM hosts WHERE id = $1)",
)
.bind(tid)
.fetch_one(&state.db)
.await
.map_err(|e| {
tracing::error!(error = %e, "Failed to check target host");
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
)
})?;
if !target_exists {
return Err((
StatusCode::BAD_REQUEST,
Json(
json!({ "error": { "code": "invalid_target_host", "message": "Target host does not exist" } }),
),
));
}
let target_healthy: bool = sqlx::query_scalar(
"SELECT health_status = 'healthy' FROM hosts WHERE id = $1",
)
.bind(tid)
.fetch_one(&state.db)
.await
.map_err(|e| {
tracing::error!(error = %e, "Failed to check target host health");
(
StatusCode::INTERNAL_SERVER_ERROR,
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
)
})?;
if !target_healthy {
return Err((
StatusCode::BAD_REQUEST,
Json(
json!({ "error": { "code": "invalid_target_host", "message": "Target host is not currently healthy" } }),
),
));
}
}
// Handle basic_auth_pass encryption if provided // Handle basic_auth_pass encryption if provided
let (pass_encrypted, pass_nonce) = if let Some(ref pass) = req.basic_auth_pass { let (pass_encrypted, pass_nonce) = if let Some(ref pass) = req.basic_auth_pass {
let key_path = PathBuf::from(crypto::KEY_PATH); let key_path = PathBuf::from(crypto::KEY_PATH);
@ -584,9 +688,10 @@ async fn update_health_check(
basic_auth_user = COALESCE($9, basic_auth_user), basic_auth_user = COALESCE($9, basic_auth_user),
basic_auth_pass_encrypted = COALESCE($10, basic_auth_pass_encrypted), basic_auth_pass_encrypted = COALESCE($10, basic_auth_pass_encrypted),
basic_auth_pass_nonce = COALESCE($11, basic_auth_pass_nonce), basic_auth_pass_nonce = COALESCE($11, basic_auth_pass_nonce),
target_host_id = COALESCE($12, target_host_id),
updated_at = NOW() updated_at = NOW()
WHERE id = $1 AND host_id = $2 WHERE id = $1 AND host_id = $2
"#, "#,
) )
.bind(check_id) .bind(check_id)
.bind(host_id) .bind(host_id)
@ -599,6 +704,7 @@ async fn update_health_check(
.bind(&req.basic_auth_user) .bind(&req.basic_auth_user)
.bind(pass_encrypted) .bind(pass_encrypted)
.bind(pass_nonce) .bind(pass_nonce)
.bind(req.target_host_id)
.execute(&state.db) .execute(&state.db)
.await .await
.map_err(|e| { .map_err(|e| {
@ -624,7 +730,7 @@ async fn update_health_check(
None, None,
Some("host"), Some("host"),
Some(&host_id.to_string()), Some(&host_id.to_string()),
json!({ "check_id": check_id }), json!({ "check_id": check_id, "target_host_id": req.target_host_id }),
None, None,
None, None,
) )
@ -731,6 +837,7 @@ async fn test_health_check(
r#" r#"
SELECT id, host_id, name, check_type, enabled, SELECT id, host_id, name, check_type, enabled,
service_name, url, expected_body, ignore_cert_errors, basic_auth_user, service_name, url, expected_body, ignore_cert_errors, basic_auth_user,
target_host_id,
created_at, updated_at created_at, updated_at
FROM host_health_checks FROM host_health_checks
WHERE id = $1 AND host_id = $2 WHERE id = $1 AND host_id = $2
@ -815,17 +922,18 @@ async fn run_service_check(check: &HealthCheck, state: &AppState) -> CheckResult
}, },
}; };
// Get host info for agent connection // Get host info for agent connection — use target_host_id if set, otherwise own host
let host_info: Option<(String, String)> = sqlx::query_as::<_, (String, String)>( let effective_host_id = check.target_host_id.unwrap_or(check.host_id);
"SELECT host(ip_address)::text, fqdn FROM hosts WHERE id = $1", let host_info: Option<(String, i32)> = sqlx::query_as::<_, (String, i32)>(
"SELECT host(ip_address)::text, agent_port FROM hosts WHERE id = $1",
) )
.bind(check.host_id) .bind(effective_host_id)
.fetch_optional(&state.db) .fetch_optional(&state.db)
.await .await
.ok() .ok()
.flatten(); .flatten();
let (ip, fqdn) = match host_info { let (ip, agent_port) = match host_info {
Some(info) => info, Some(info) => info,
None => { None => {
return CheckResult { return CheckResult {
@ -838,8 +946,8 @@ async fn run_service_check(check: &HealthCheck, state: &AppState) -> CheckResult
// Build agent URL // Build agent URL
let agent_url = format!( let agent_url = format!(
"https://{}:12443/api/v1/system/services/{}", "https://{}:{}/api/v1/system/services/{}",
ip, service_name ip, agent_port, service_name
); );
let start = std::time::Instant::now(); let start = std::time::Instant::now();

View File

@ -21,7 +21,7 @@ use pm_agent_client::{AgentClient, AgentClientError};
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
/// Row fetched for each enabled health check, joined with host connection info. /// Row fetched for each enabled health check, joined with host connection info.
#[derive(Debug, FromRow)] #[derive(FromRow)]
struct HealthCheckRow { struct HealthCheckRow {
id: Uuid, id: Uuid,
host_id: Uuid, host_id: Uuid,
@ -34,6 +34,7 @@ struct HealthCheckRow {
basic_auth_user: Option<String>, basic_auth_user: Option<String>,
basic_auth_pass_encrypted: Option<Vec<u8>>, basic_auth_pass_encrypted: Option<Vec<u8>>,
basic_auth_pass_nonce: Option<Vec<u8>>, basic_auth_pass_nonce: Option<Vec<u8>>,
target_host_id: Option<Uuid>,
ip_address: String, ip_address: String,
agent_port: i32, agent_port: i32,
} }
@ -99,10 +100,12 @@ pub async fn run_health_check_poller(pool: PgPool, config: Arc<AppConfig>) {
hc.basic_auth_user, hc.basic_auth_user,
hc.basic_auth_pass_encrypted, hc.basic_auth_pass_encrypted,
hc.basic_auth_pass_nonce, hc.basic_auth_pass_nonce,
host(h.ip_address)::text AS ip_address, hc.target_host_id,
h.agent_port host(COALESCE(th.ip_address, h.ip_address))::text AS ip_address,
COALESCE(th.agent_port, h.agent_port) AS agent_port
FROM host_health_checks hc FROM host_health_checks hc
JOIN hosts h ON h.id = hc.host_id JOIN hosts h ON h.id = hc.host_id
LEFT JOIN hosts th ON th.id = hc.target_host_id
WHERE hc.enabled = TRUE WHERE hc.enabled = TRUE
ORDER BY hc.id ORDER BY hc.id
"#, "#,

View File

@ -18,6 +18,7 @@ import {
Grid, Grid,
IconButton, IconButton,
InputLabel, InputLabel,
FormHelperText,
MenuItem, MenuItem,
Paper, Paper,
Select, Select,
@ -213,6 +214,7 @@ interface HealthCheckFormValues {
basic_auth_user: string basic_auth_user: string
basic_auth_pass: string basic_auth_pass: string
enabled: boolean enabled: boolean
target_host_id: string
} }
function defaultHealthCheckForm(): HealthCheckFormValues { function defaultHealthCheckForm(): HealthCheckFormValues {
@ -226,6 +228,7 @@ function defaultHealthCheckForm(): HealthCheckFormValues {
basic_auth_user: '', basic_auth_user: '',
basic_auth_pass: '', basic_auth_pass: '',
enabled: true, enabled: true,
target_host_id: '',
} }
} }
@ -235,11 +238,13 @@ interface HealthCheckFormDialogProps {
open: boolean open: boolean
title: string title: string
initial: HealthCheckFormValues initial: HealthCheckFormValues
hosts: { id: string; display_name: string; fqdn: string }[]
currentHostId: string
onClose: () => void onClose: () => void
onSubmit: (values: HealthCheckFormValues) => Promise<void> onSubmit: (values: HealthCheckFormValues) => Promise<void>
} }
function HealthCheckFormDialog({ open, title, initial, onClose, onSubmit }: HealthCheckFormDialogProps) { function HealthCheckFormDialog({ open, title, initial, hosts, currentHostId, onClose, onSubmit }: HealthCheckFormDialogProps) {
const [form, setForm] = useState<HealthCheckFormValues>(initial) const [form, setForm] = useState<HealthCheckFormValues>(initial)
const [saving, setSaving] = useState(false) const [saving, setSaving] = useState(false)
const [err, setErr] = useState<string | null>(null) const [err, setErr] = useState<string | null>(null)
@ -276,8 +281,20 @@ function HealthCheckFormDialog({ open, title, initial, onClose, onSubmit }: Heal
</Select> </Select>
</FormControl> </FormControl>
{form.check_type === 'service' && ( {form.check_type === 'service' && (
<TextField label="Service Name" value={form.service_name} onChange={e => set('service_name', e.target.value)} required fullWidth <>
helperText="Systemd service unit name to check" /> <TextField label="Service Name" value={form.service_name} onChange={e => set('service_name', e.target.value)} required fullWidth
helperText="Systemd service unit name to check" />
<FormControl fullWidth>
<InputLabel>Target Host (optional)</InputLabel>
<Select label="Target Host (optional)" value={form.target_host_id} onChange={e => set('target_host_id', e.target.value)}>
<MenuItem value="">Own Host (default)</MenuItem>
{hosts.filter(h => h.id !== currentHostId).map(h => (
<MenuItem key={h.id} value={h.id}>{h.display_name || h.fqdn}</MenuItem>
))}
</Select>
<FormHelperText>Query a service on a different host's agent (for redundant services)</FormHelperText>
</FormControl>
</>
)} )}
{form.check_type === 'http' && ( {form.check_type === 'http' && (
<> <>
@ -591,6 +608,9 @@ export default function HostDetailPage() {
const [reissueLoading, setReissueLoading] = useState(false) const [reissueLoading, setReissueLoading] = useState(false)
const [reissueError, setReissueError] = useState<string | null>(null) const [reissueError, setReissueError] = useState<string | null>(null)
// Hosts list for target_host_id dropdown
const [hosts, setHosts] = useState<{ id: string; display_name: string; fqdn: string }[]>([])
// ── Fetch host ──────────────────────────────────────────────────────────── // ── Fetch host ────────────────────────────────────────────────────────────
useEffect(() => { useEffect(() => {
if (id === 'new') { setLoading(false); return } if (id === 'new') { setLoading(false); return }
@ -599,6 +619,13 @@ export default function HostDetailPage() {
.catch(() => setError('Host not found or access denied.')) .catch(() => setError('Host not found or access denied.'))
.finally(() => setLoading(false)) .finally(() => setLoading(false))
}, [id]) }, [id])
// ── Fetch hosts list (for target_host_id dropdown) ──────────────────────
useEffect(() => {
hostsApi.list()
.then(res => setHosts(res.data?.hosts ?? []))
.catch(() => { /* ignore */ })
}, [])
// ── Check cert existence ─────────────────────────────────────────────────── // ── Check cert existence ───────────────────────────────────────────────────
useEffect(() => { useEffect(() => {
@ -754,6 +781,7 @@ export default function HostDetailPage() {
} }
if (values.check_type === 'service') { if (values.check_type === 'service') {
body.service_name = values.service_name || undefined body.service_name = values.service_name || undefined
body.target_host_id = values.target_host_id || undefined
} else { } else {
body.url = values.url || undefined body.url = values.url || undefined
body.expected_body = values.expected_body || undefined body.expected_body = values.expected_body || undefined
@ -780,6 +808,7 @@ export default function HostDetailPage() {
basic_auth_user: check.basic_auth_user ?? '', basic_auth_user: check.basic_auth_user ?? '',
basic_auth_pass: '', basic_auth_pass: '',
enabled: check.enabled, enabled: check.enabled,
target_host_id: check.target_host_id ?? '',
}) })
setHcEditOpen(true) setHcEditOpen(true)
} }
@ -792,6 +821,7 @@ export default function HostDetailPage() {
} }
if (values.check_type === 'service') { if (values.check_type === 'service') {
body.service_name = values.service_name || undefined body.service_name = values.service_name || undefined
body.target_host_id = values.target_host_id || undefined
} else { } else {
body.url = values.url || undefined body.url = values.url || undefined
body.expected_body = values.expected_body || undefined body.expected_body = values.expected_body || undefined
@ -1145,6 +1175,8 @@ export default function HostDetailPage() {
open={hcCreateOpen} open={hcCreateOpen}
title="Add Health Check" title="Add Health Check"
initial={hcCreateForm} initial={hcCreateForm}
hosts={hosts}
currentHostId={id ?? ''}
onClose={() => setHcCreateOpen(false)} onClose={() => setHcCreateOpen(false)}
onSubmit={handleHcCreateSubmit} onSubmit={handleHcCreateSubmit}
/> />
@ -1152,6 +1184,8 @@ export default function HostDetailPage() {
open={hcEditOpen} open={hcEditOpen}
title="Edit Health Check" title="Edit Health Check"
initial={hcEditForm} initial={hcEditForm}
hosts={hosts}
currentHostId={id ?? ''}
onClose={() => setHcEditOpen(false)} onClose={() => setHcEditOpen(false)}
onSubmit={handleHcEditSubmit} onSubmit={handleHcEditSubmit}
/> />

View File

@ -282,6 +282,7 @@ export interface HealthCheck {
expected_body?: string expected_body?: string
ignore_cert_errors: boolean ignore_cert_errors: boolean
basic_auth_user?: string basic_auth_user?: string
target_host_id?: string | null
created_at: string created_at: string
updated_at: string updated_at: string
} }
@ -313,6 +314,7 @@ export interface CreateHealthCheckRequest {
ignore_cert_errors?: boolean ignore_cert_errors?: boolean
basic_auth_user?: string basic_auth_user?: string
basic_auth_pass?: string basic_auth_pass?: string
target_host_id?: string | null
} }
export interface UpdateHealthCheckRequest { export interface UpdateHealthCheckRequest {
@ -324,4 +326,5 @@ export interface UpdateHealthCheckRequest {
ignore_cert_errors?: boolean ignore_cert_errors?: boolean
basic_auth_user?: string basic_auth_user?: string
basic_auth_pass?: string basic_auth_pass?: string
target_host_id?: string | null
} }

View File

@ -0,0 +1,10 @@
-- Migration: add an optional target host to health checks.
--
-- A check configured on Host A can set target_host_id so that it queries a
-- service on Host B's agent instead (for redundant services running on
-- several machines).
--
-- Semantics:
--   * NULL = check the owning host's own agent. Existing rows get NULL when
--     the column is added, so their behavior is unchanged (backward compatible).
--   * ON DELETE SET NULL: deleting the target host does not delete the check;
--     the column is reset to NULL and the check reverts to its own host.
ALTER TABLE host_health_checks
ADD COLUMN target_host_id UUID REFERENCES hosts(id) ON DELETE SET NULL;
-- Partial index: only rows that actually name a target host are indexed.
-- NOTE(review): presumably intended to speed up the FK's ON DELETE SET NULL
-- lookup and "which checks target this host" queries — confirm intended use.
CREATE INDEX idx_health_checks_target_host ON host_health_checks (target_host_id)
WHERE target_host_id IS NOT NULL;

View File

@ -1,253 +1,61 @@
# Health Check Configuration for Hosts — Feature Plan # Target Host for Service Health Checks
## Overview ## Overview
Each host can have 1-5 health checks. During maintenance windows, all checks must be healthy before patch execution. Health checks are also continuously polled for dashboard status. Add `target_host_id` field to service health checks, allowing a check configured on Host A to query a service on Host B's agent. Useful for redundant services running on multiple machines.
## Design Decisions (confirmed with Kelly) **Design:** `target_host_id` is nullable. When NULL (default), behavior unchanged — check queries its own host's agent. When set, the service check queries the target host's agent instead. Only applies to service checks; HTTP checks already specify a full URL.
- Health check config lives in manager DB, manager controls everything
- Agent is just an action endpoint (no health check logic on agent)
- Admins and operators can manage (operators need matching group)
- Per-host 1:1 (not shareable templates)
- Both continuous polling AND must be healthy before patch execution
- Service health: query agent `GET /api/v1/system/services/{name}`, check `healthy` field
- HTTP health: manager makes direct HTTP request, substring match in response body
- Retry failed checks at 5-minute intervals until maintenance window closes
- 10-second check timeout; no response = failed
- Order doesn't matter
- Basic auth password: encrypted in DB with per-install app key
- Health check poll interval: 5 minutes
- Result retention: 4 days (time-based)
## linux_patch_api Agent Endpoints (Reference) ## Implementation Checklist
### GET /api/v1/system/services/{name} ### 1. Database Migration
Returns service status for health check Type 1 (service). Added in commit 8b6d9ed. - [ ] Create `migrations/011_health_check_target_host.sql`
- [ ] Add `target_host_id UUID REFERENCES hosts(id) ON DELETE SET NULL` column
- [ ] Add partial index on `target_host_id` where NOT NULL
**Response:** `ApiResponse<ServiceStatusData>` ### 2. Backend Models (`crates/pm-core/src/models.rs`)
```json - [ ] Add `target_host_id: Option<Uuid>` to `HealthCheck` struct
{ - [ ] Add `target_host_id: Option<Uuid>` to `CreateHealthCheckRequest`
"success": true, - [ ] Add `target_host_id: Option<Uuid>` to `UpdateHealthCheckRequest`
"data": { - [ ] Add `target_host_id` to all HealthCheck SELECT queries
"name": "nginx",
"display_name": "A high performance web server",
"active_state": "active",
"sub_state": "running",
"load_state": "loaded",
"enabled_state": "enabled",
"main_pid": 1234,
"healthy": true
}
}
```
**Health determination:** The agent `healthy` field is authoritative. Manager uses this boolean directly. ### 3. API Routes (`crates/pm-web/src/routes/health_checks.rs`)
- [ ] Create: add `target_host_id` to INSERT, validate target host exists + is healthy
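- [ ] Update: add `target_host_id` to COALESCE UPDATE and validate it the same way as Create (note: COALESCE keeps the stored value when the field is omitted or null, so this path alone cannot reset a check back to its own host)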
- [ ] List/Get: add `target_host_id` to SELECT columns
- [ ] Test endpoint (`run_service_check`): when `target_host_id` is Some, query that host's IP/port
- [ ] Audit log: include `target_host_id` in audit JSON
**Error responses:** 400 (invalid name), 404 (not found → unhealthy), 500 (error → unhealthy) ### 4. Health Check Poller (`crates/pm-worker/src/health_check_poller.rs`)
- [ ] Add `target_host_id: Option<Uuid>` to `HealthCheckRow`
- [ ] Modify SQL: LEFT JOIN hosts th ON th.id = hc.target_host_id, use COALESCE(th.ip_address, h.ip_address) and COALESCE(th.agent_port, h.agent_port)
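- [ ] Alternative to the COALESCE approach above: add separate `target_ip_address` and `target_agent_port` fields to HealthCheckRow (not needed when the query already COALESCEs into `ip_address` / `agent_port`)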
- [ ] `run_service_check`: use target host IP/port when available
- [ ] `check_host_health_checks`: no change needed (results count toward owning host)
### GET /api/v1/health ### 5. Frontend Types (`frontend/src/types/index.ts`)
Basic agent health. Returns `{"status": "healthy"}`. Used for connectivity checks, not host health checks. - [ ] Add `target_host_id?: string | null` to `HealthCheck`
- [ ] Add `target_host_id?: string | null` to `CreateHealthCheckRequest`
- [ ] Add `target_host_id?: string | null` to `UpdateHealthCheckRequest`
--- ### 6. Frontend Form (`frontend/src/pages/HostDetailPage.tsx`)
- [ ] Add `target_host_id: string` to `HealthCheckFormValues`
- [ ] Add `target_host_id: ''` to `defaultHealthCheckForm`
- [ ] Add host selector dropdown in `HealthCheckFormDialog` (visible when check_type === 'service')
- [ ] Fetch hosts list for dropdown (use hostsApi.list or a dedicated endpoint)
- [ ] `handleHcCreateSubmit`: include `target_host_id: values.target_host_id || undefined`
- [ ] `handleHcEditClick`: map `check.target_host_id ?? ''` to form
- [ ] `handleHcEditSubmit`: include `target_host_id` in UpdateHealthCheckRequest
- [ ] Display target host in health checks table Target column
## Phase 1: Database Schema ### 7. Build, Test, Deploy
- [ ] Run `cargo fmt --all` + `cargo clippy` + `cargo test`
- [ ] Run frontend build + ESLint + tsc
- [ ] Commit and push through CI pipeline
- [ ] Tag release, build .deb, deploy to dev
### New table: `host_health_checks` ## Design Decisions
```sql - `target_host_id` is nullable — NULL = check own host (backward compatible)
CREATE TABLE host_health_checks ( - FK with ON DELETE SET NULL — if target host deleted, revert to default
id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - Only applies to service checks (HTTP checks already have full URL)
host_id UUID NOT NULL REFERENCES hosts(id) ON DELETE CASCADE, - Health gate: results count toward the owning host, not the target host
name VARCHAR(100) NOT NULL, - No RBAC required for target host — only requirement: target host exists in manager and is currently healthy
check_type VARCHAR(20) NOT NULL CHECK (check_type IN ('service', 'http')),
enabled BOOLEAN NOT NULL DEFAULT true,
-- Service check fields
service_name VARCHAR(200),
-- HTTP check fields
url TEXT,
expected_body VARCHAR(500),
ignore_cert_errors BOOLEAN DEFAULT true,
basic_auth_user VARCHAR(100),
basic_auth_pass_encrypted BYTEA, -- AES-256-GCM encrypted with per-install key
basic_auth_pass_nonce BYTEA, -- nonce for AES-GCM
-- Metadata
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT valid_service_check CHECK (
(check_type = 'service' AND service_name IS NOT NULL AND url IS NULL)
OR
(check_type = 'http' AND url IS NOT NULL AND expected_body IS NOT NULL AND service_name IS NULL)
)
);
CREATE INDEX idx_health_checks_host ON host_health_checks (host_id);
```
### New table: `host_health_check_results`
```sql
CREATE TABLE host_health_check_results (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
check_id UUID NOT NULL REFERENCES host_health_checks(id) ON DELETE CASCADE,
healthy BOOLEAN NOT NULL,
detail TEXT,
latency_ms INTEGER,
checked_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_health_results_check ON host_health_check_results (check_id, checked_at DESC);
```
### Encryption key storage
- Per-install app key stored at `/etc/patch-manager/keys/health-check.key`
- 256-bit random key generated on first startup if not present
- File permissions: 0600, owned by patch-manager user
- Used for AES-256-GCM encryption of basic_auth_pass
- [ ] Create migration 007_health_checks.sql
- [ ] Add models to pm-core/src/models.rs
- [ ] Add encryption utility to pm-core
- [ ] Verify migration runs on dev LXC
---
## Phase 2: Backend API Routes
### Endpoints
- `GET /api/v1/hosts/{id}/health-checks` — list health checks for host (RBAC scoped)
- `POST /api/v1/hosts/{id}/health-checks` — create health check (max 5 per host)
- `PUT /api/v1/hosts/{id}/health-checks/{check_id}` — update health check
- `DELETE /api/v1/hosts/{id}/health-checks/{check_id}` — delete health check
- `POST /api/v1/hosts/{id}/health-checks/{check_id}/test` — run check immediately, return result
### Request/Response types
```rust
struct CreateHealthCheckRequest {
name: String,
check_type: String, // "service" or "http"
service_name: Option<String>,
url: Option<String>,
expected_body: Option<String>,
ignore_cert_errors: Option<bool>,
basic_auth_user: Option<String>,
basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
}
struct HealthCheck {
id: Uuid,
host_id: Uuid,
name: String,
check_type: String,
enabled: bool,
service_name: Option<String>,
url: Option<String>,
expected_body: Option<String>,
ignore_cert_errors: bool,
basic_auth_user: Option<String>,
// basic_auth_pass NOT returned in responses
last_result: Option<HealthCheckResult>,
created_at: DateTime<Utc>,
updated_at: DateTime<Utc>,
}
struct HealthCheckResult {
healthy: bool,
detail: Option<String>,
latency_ms: Option<i32>,
checked_at: DateTime<Utc>,
}
```
- [ ] Add routes to pm-web/src/routes/ (new health_checks.rs)
- [ ] Add CRUD operations
- [ ] Add RBAC enforcement (admin all, operator matching group)
- [ ] Add max-5-per-host validation
- [ ] Add /test endpoint that runs check immediately
- [ ] Add audit logging for create/update/delete
- [ ] Add encryption/decryption for basic_auth_pass
---
## Phase 3: Worker Health Check Engine
### Continuous Polling
- New task in pm-worker: `health_check_poller`
- Polls all enabled health checks every 5 minutes
- For service checks: call agent `GET /api/v1/system/services/{name}` via mTLS, check `healthy` field
- For HTTP checks: make direct HTTP(S) request from manager, check status code + substring match
- Store results in `host_health_check_results`
- Prune results older than 4 days
### Pre-Patch Execution Gate
- When a patch job is about to execute on a host:
1. Check if host has any enabled health checks
2. If yes, verify all are currently healthy (from latest poll result)
3. If any are unhealthy:
- Wait and retry at 5-minute intervals
- Continue until maintenance window closes
- If window closes with failed checks, mark job host as failed with detail
4. If all healthy, proceed with patch execution
### HTTP Check Implementation
- Use reqwest with:
- 10-second timeout
- Accept invalid certs (ignore_cert_errors)
- Optional basic auth header (decrypt from DB)
- Check response body contains expected_body substring
- Return healthy=true if match, false otherwise
- [ ] Add health_check_poller module to pm-worker
- [ ] Implement service check via AgentClient
- [ ] Implement HTTP check via reqwest
- [ ] Add pre-patch execution gate to job_executor
- [ ] Add retry loop with 5-minute intervals
- [ ] Add maintenance window expiry check
- [ ] Add health check config to WorkerConfig (poll interval)
- [ ] Add result pruning (4-day retention)
---
## Phase 4: Frontend UI
### Host Detail Page
- Add "Health Checks" section below host info
- List current health checks with status indicators
- Add/Edit/Delete health check dialogs
- "Test" button to run check immediately and show result
- Visual indicator: green check = healthy, red X = unhealthy, gray = unknown
### Hosts Page
- Add health check summary column or indicator
- Show aggregate status: all healthy / some unhealthy / no checks configured
### Deploy Page
- Show health check status in host selection table
- Warn if any selected hosts have unhealthy checks
### Job Detail
- Show health check gate status when job is waiting for healthy checks
- Display which checks are passing/failing
- [ ] Add HealthCheck types to frontend/src/types/index.ts
- [ ] Add health check API calls to frontend/src/api/client.ts
- [ ] Add Health Checks section to HostDetailPage.tsx
- [ ] Add health check status to HostsPage.tsx
- [ ] Add health check indicators to PatchDeploymentPage.tsx
- [ ] Add health check gate status to JobsPage.tsx detail view
---
## Phase 5: Integration & Testing
- [ ] Build and deploy to dev LXC
- [ ] Test service health check against dev LXC agent
- [ ] Test HTTP health check against internal services
- [ ] Test pre-patch gate: deploy with failing check, verify retry behavior
- [ ] Test maintenance window expiry with failing checks
- [ ] Test RBAC: operator can only manage checks in their group
- [ ] Test max 5 checks per host enforcement
- [ ] Test basic auth encryption/decryption
- [ ] Push to Gitea
---
## Resolved Items
- ~~Basic auth password storage~~ → Encrypted in DB with per-install app key (AES-256-GCM)
- ~~Health check poll interval~~ → 5 minutes
- ~~Result retention~~ → 4 days (time-based)