feat: add target_host_id to service health checks
All checks were successful
CI Pipeline / Rust Format Check (push) Successful in 6s
CI Pipeline / Clippy Lints (push) Successful in 45s
CI Pipeline / Rust Unit Tests (push) Successful in 1m2s
CI Pipeline / Security Audit (push) Successful in 3s
CI Pipeline / Frontend Lint & Type Check (push) Successful in 13s
CI Pipeline / Build .deb & Release (push) Has been skipped
All checks were successful
CI Pipeline / Rust Format Check (push) Successful in 6s
CI Pipeline / Clippy Lints (push) Successful in 45s
CI Pipeline / Rust Unit Tests (push) Successful in 1m2s
CI Pipeline / Security Audit (push) Successful in 3s
CI Pipeline / Frontend Lint & Type Check (push) Successful in 13s
CI Pipeline / Build .deb & Release (push) Has been skipped
- Add target_host_id column to host_health_checks table (nullable UUID FK)
- Allow service checks to query a different host's agent
- Update backend models, API routes, and poller
- Frontend: add a host selector dropdown for service checks
- Validation: target host must exist and be healthy
- FK uses ON DELETE SET NULL: a check reverts to its own host if the target host is deleted
This commit is contained in:
14
Cargo.lock
generated
14
Cargo.lock
generated
@ -2206,7 +2206,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-agent-client"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@ -2223,7 +2223,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-auth"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
@ -2250,7 +2250,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-ca"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@ -2273,7 +2273,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-core"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"aes-gcm",
|
||||
"anyhow",
|
||||
@ -2297,7 +2297,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-reports"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@ -2318,7 +2318,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-web"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"axum",
|
||||
@ -2354,7 +2354,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pm-worker"
|
||||
version = "0.1.0"
|
||||
version = "0.1.1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
|
||||
@ -135,6 +135,7 @@ pub struct HealthCheck {
|
||||
pub expected_body: Option<String>,
|
||||
pub ignore_cert_errors: bool,
|
||||
pub basic_auth_user: Option<String>,
|
||||
pub target_host_id: Option<Uuid>,
|
||||
// basic_auth_pass_encrypted and nonce NOT exposed in API responses
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
@ -168,6 +169,7 @@ pub struct CreateHealthCheckRequest {
|
||||
pub ignore_cert_errors: bool,
|
||||
pub basic_auth_user: Option<String>,
|
||||
pub basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
|
||||
pub target_host_id: Option<Uuid>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@ -180,6 +182,7 @@ pub struct UpdateHealthCheckRequest {
|
||||
pub ignore_cert_errors: Option<bool>,
|
||||
pub basic_auth_user: Option<String>,
|
||||
pub basic_auth_pass: Option<String>, // if provided, re-encrypt
|
||||
pub target_host_id: Option<Uuid>,
|
||||
}
|
||||
|
||||
fn default_true() -> bool {
|
||||
|
||||
@ -148,6 +148,7 @@ async fn list_health_checks(
|
||||
r#"
|
||||
SELECT id, host_id, name, check_type, enabled,
|
||||
service_name, url, expected_body, ignore_cert_errors, basic_auth_user,
|
||||
target_host_id,
|
||||
created_at, updated_at
|
||||
FROM host_health_checks
|
||||
WHERE host_id = $1
|
||||
@ -256,6 +257,55 @@ async fn create_health_check(
|
||||
));
|
||||
}
|
||||
|
||||
// Validate target_host_id if provided
|
||||
if let Some(tid) = req.target_host_id {
|
||||
let target_exists: bool = sqlx::query_scalar(
|
||||
"SELECT EXISTS (SELECT 1 FROM hosts WHERE id = $1)",
|
||||
)
|
||||
.bind(tid)
|
||||
.fetch_one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "Failed to check target host");
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
|
||||
)
|
||||
})?;
|
||||
|
||||
if !target_exists {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(
|
||||
json!({ "error": { "code": "invalid_target_host", "message": "Target host does not exist" } }),
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
let target_healthy: bool = sqlx::query_scalar(
|
||||
"SELECT health_status = 'healthy' FROM hosts WHERE id = $1",
|
||||
)
|
||||
.bind(tid)
|
||||
.fetch_one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "Failed to check target host health");
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
|
||||
)
|
||||
})?;
|
||||
|
||||
if !target_healthy {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(
|
||||
json!({ "error": { "code": "invalid_target_host", "message": "Target host is not currently healthy" } }),
|
||||
),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Enforce max 5 per host
|
||||
let count: i64 = sqlx::query_scalar(
|
||||
"SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1",
|
||||
@ -310,8 +360,9 @@ async fn create_health_check(
|
||||
INSERT INTO host_health_checks (
|
||||
host_id, name, check_type, enabled,
|
||||
service_name, url, expected_body, ignore_cert_errors,
|
||||
basic_auth_user, basic_auth_pass_encrypted, basic_auth_pass_nonce
|
||||
) VALUES ($1, $2, $3, true, $4, $5, $6, $7, $8, $9, $10)
|
||||
basic_auth_user, basic_auth_pass_encrypted, basic_auth_pass_nonce,
|
||||
target_host_id
|
||||
) VALUES ($1, $2, $3, true, $4, $5, $6, $7, $8, $9, $10, $11)
|
||||
RETURNING id
|
||||
"#,
|
||||
)
|
||||
@ -325,6 +376,7 @@ async fn create_health_check(
|
||||
.bind(&req.basic_auth_user)
|
||||
.bind(pass_encrypted)
|
||||
.bind(pass_nonce)
|
||||
.bind(req.target_host_id)
|
||||
.fetch_one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
@ -347,6 +399,7 @@ async fn create_health_check(
|
||||
"check_id": check_id,
|
||||
"name": req.name,
|
||||
"check_type": req.check_type,
|
||||
"target_host_id": req.target_host_id,
|
||||
}),
|
||||
None,
|
||||
None,
|
||||
@ -361,6 +414,7 @@ async fn create_health_check(
|
||||
"name": req.name,
|
||||
"check_type": req.check_type,
|
||||
"enabled": true,
|
||||
"target_host_id": req.target_host_id,
|
||||
})),
|
||||
))
|
||||
}
|
||||
@ -397,6 +451,7 @@ async fn get_health_check(
|
||||
r#"
|
||||
SELECT id, host_id, name, check_type, enabled,
|
||||
service_name, url, expected_body, ignore_cert_errors, basic_auth_user,
|
||||
target_host_id,
|
||||
created_at, updated_at
|
||||
FROM host_health_checks
|
||||
WHERE id = $1 AND host_id = $2
|
||||
@ -495,6 +550,55 @@ async fn update_health_check(
|
||||
));
|
||||
}
|
||||
|
||||
// Validate target_host_id if provided
|
||||
if let Some(tid) = req.target_host_id {
|
||||
let target_exists: bool = sqlx::query_scalar(
|
||||
"SELECT EXISTS (SELECT 1 FROM hosts WHERE id = $1)",
|
||||
)
|
||||
.bind(tid)
|
||||
.fetch_one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "Failed to check target host");
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
|
||||
)
|
||||
})?;
|
||||
|
||||
if !target_exists {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(
|
||||
json!({ "error": { "code": "invalid_target_host", "message": "Target host does not exist" } }),
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
let target_healthy: bool = sqlx::query_scalar(
|
||||
"SELECT health_status = 'healthy' FROM hosts WHERE id = $1",
|
||||
)
|
||||
.bind(tid)
|
||||
.fetch_one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "Failed to check target host health");
|
||||
(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
Json(json!({ "error": { "code": "internal_error", "message": "Database error" } })),
|
||||
)
|
||||
})?;
|
||||
|
||||
if !target_healthy {
|
||||
return Err((
|
||||
StatusCode::BAD_REQUEST,
|
||||
Json(
|
||||
json!({ "error": { "code": "invalid_target_host", "message": "Target host is not currently healthy" } }),
|
||||
),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Handle basic_auth_pass encryption if provided
|
||||
let (pass_encrypted, pass_nonce) = if let Some(ref pass) = req.basic_auth_pass {
|
||||
let key_path = PathBuf::from(crypto::KEY_PATH);
|
||||
@ -584,9 +688,10 @@ async fn update_health_check(
|
||||
basic_auth_user = COALESCE($9, basic_auth_user),
|
||||
basic_auth_pass_encrypted = COALESCE($10, basic_auth_pass_encrypted),
|
||||
basic_auth_pass_nonce = COALESCE($11, basic_auth_pass_nonce),
|
||||
target_host_id = COALESCE($12, target_host_id),
|
||||
updated_at = NOW()
|
||||
WHERE id = $1 AND host_id = $2
|
||||
"#,
|
||||
"#,
|
||||
)
|
||||
.bind(check_id)
|
||||
.bind(host_id)
|
||||
@ -599,6 +704,7 @@ async fn update_health_check(
|
||||
.bind(&req.basic_auth_user)
|
||||
.bind(pass_encrypted)
|
||||
.bind(pass_nonce)
|
||||
.bind(req.target_host_id)
|
||||
.execute(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
@ -624,7 +730,7 @@ async fn update_health_check(
|
||||
None,
|
||||
Some("host"),
|
||||
Some(&host_id.to_string()),
|
||||
json!({ "check_id": check_id }),
|
||||
json!({ "check_id": check_id, "target_host_id": req.target_host_id }),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
@ -731,6 +837,7 @@ async fn test_health_check(
|
||||
r#"
|
||||
SELECT id, host_id, name, check_type, enabled,
|
||||
service_name, url, expected_body, ignore_cert_errors, basic_auth_user,
|
||||
target_host_id,
|
||||
created_at, updated_at
|
||||
FROM host_health_checks
|
||||
WHERE id = $1 AND host_id = $2
|
||||
@ -815,17 +922,18 @@ async fn run_service_check(check: &HealthCheck, state: &AppState) -> CheckResult
|
||||
},
|
||||
};
|
||||
|
||||
// Get host info for agent connection
|
||||
let host_info: Option<(String, String)> = sqlx::query_as::<_, (String, String)>(
|
||||
"SELECT host(ip_address)::text, fqdn FROM hosts WHERE id = $1",
|
||||
// Get host info for agent connection — use target_host_id if set, otherwise own host
|
||||
let effective_host_id = check.target_host_id.unwrap_or(check.host_id);
|
||||
let host_info: Option<(String, i32)> = sqlx::query_as::<_, (String, i32)>(
|
||||
"SELECT host(ip_address)::text, agent_port FROM hosts WHERE id = $1",
|
||||
)
|
||||
.bind(check.host_id)
|
||||
.bind(effective_host_id)
|
||||
.fetch_optional(&state.db)
|
||||
.await
|
||||
.ok()
|
||||
.flatten();
|
||||
|
||||
let (ip, fqdn) = match host_info {
|
||||
let (ip, agent_port) = match host_info {
|
||||
Some(info) => info,
|
||||
None => {
|
||||
return CheckResult {
|
||||
@ -838,8 +946,8 @@ async fn run_service_check(check: &HealthCheck, state: &AppState) -> CheckResult
|
||||
|
||||
// Build agent URL
|
||||
let agent_url = format!(
|
||||
"https://{}:12443/api/v1/system/services/{}",
|
||||
ip, service_name
|
||||
"https://{}:{}/api/v1/system/services/{}",
|
||||
ip, agent_port, service_name
|
||||
);
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
@ -21,7 +21,7 @@ use pm_agent_client::{AgentClient, AgentClientError};
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Row fetched for each enabled health check, joined with host connection info.
|
||||
#[derive(Debug, FromRow)]
|
||||
#[derive(FromRow)]
|
||||
struct HealthCheckRow {
|
||||
id: Uuid,
|
||||
host_id: Uuid,
|
||||
@ -34,6 +34,7 @@ struct HealthCheckRow {
|
||||
basic_auth_user: Option<String>,
|
||||
basic_auth_pass_encrypted: Option<Vec<u8>>,
|
||||
basic_auth_pass_nonce: Option<Vec<u8>>,
|
||||
target_host_id: Option<Uuid>,
|
||||
ip_address: String,
|
||||
agent_port: i32,
|
||||
}
|
||||
@ -99,10 +100,12 @@ pub async fn run_health_check_poller(pool: PgPool, config: Arc<AppConfig>) {
|
||||
hc.basic_auth_user,
|
||||
hc.basic_auth_pass_encrypted,
|
||||
hc.basic_auth_pass_nonce,
|
||||
host(h.ip_address)::text AS ip_address,
|
||||
h.agent_port
|
||||
hc.target_host_id,
|
||||
host(COALESCE(th.ip_address, h.ip_address))::text AS ip_address,
|
||||
COALESCE(th.agent_port, h.agent_port) AS agent_port
|
||||
FROM host_health_checks hc
|
||||
JOIN hosts h ON h.id = hc.host_id
|
||||
LEFT JOIN hosts th ON th.id = hc.target_host_id
|
||||
WHERE hc.enabled = TRUE
|
||||
ORDER BY hc.id
|
||||
"#,
|
||||
|
||||
@ -18,6 +18,7 @@ import {
|
||||
Grid,
|
||||
IconButton,
|
||||
InputLabel,
|
||||
FormHelperText,
|
||||
MenuItem,
|
||||
Paper,
|
||||
Select,
|
||||
@ -213,6 +214,7 @@ interface HealthCheckFormValues {
|
||||
basic_auth_user: string
|
||||
basic_auth_pass: string
|
||||
enabled: boolean
|
||||
target_host_id: string
|
||||
}
|
||||
|
||||
function defaultHealthCheckForm(): HealthCheckFormValues {
|
||||
@ -226,6 +228,7 @@ function defaultHealthCheckForm(): HealthCheckFormValues {
|
||||
basic_auth_user: '',
|
||||
basic_auth_pass: '',
|
||||
enabled: true,
|
||||
target_host_id: '',
|
||||
}
|
||||
}
|
||||
|
||||
@ -235,11 +238,13 @@ interface HealthCheckFormDialogProps {
|
||||
open: boolean
|
||||
title: string
|
||||
initial: HealthCheckFormValues
|
||||
hosts: { id: string; display_name: string; fqdn: string }[]
|
||||
currentHostId: string
|
||||
onClose: () => void
|
||||
onSubmit: (values: HealthCheckFormValues) => Promise<void>
|
||||
}
|
||||
|
||||
function HealthCheckFormDialog({ open, title, initial, onClose, onSubmit }: HealthCheckFormDialogProps) {
|
||||
function HealthCheckFormDialog({ open, title, initial, hosts, currentHostId, onClose, onSubmit }: HealthCheckFormDialogProps) {
|
||||
const [form, setForm] = useState<HealthCheckFormValues>(initial)
|
||||
const [saving, setSaving] = useState(false)
|
||||
const [err, setErr] = useState<string | null>(null)
|
||||
@ -276,8 +281,20 @@ function HealthCheckFormDialog({ open, title, initial, onClose, onSubmit }: Heal
|
||||
</Select>
|
||||
</FormControl>
|
||||
{form.check_type === 'service' && (
|
||||
<TextField label="Service Name" value={form.service_name} onChange={e => set('service_name', e.target.value)} required fullWidth
|
||||
helperText="Systemd service unit name to check" />
|
||||
<>
|
||||
<TextField label="Service Name" value={form.service_name} onChange={e => set('service_name', e.target.value)} required fullWidth
|
||||
helperText="Systemd service unit name to check" />
|
||||
<FormControl fullWidth>
|
||||
<InputLabel>Target Host (optional)</InputLabel>
|
||||
<Select label="Target Host (optional)" value={form.target_host_id} onChange={e => set('target_host_id', e.target.value)}>
|
||||
<MenuItem value="">Own Host (default)</MenuItem>
|
||||
{hosts.filter(h => h.id !== currentHostId).map(h => (
|
||||
<MenuItem key={h.id} value={h.id}>{h.display_name || h.fqdn}</MenuItem>
|
||||
))}
|
||||
</Select>
|
||||
<FormHelperText>Query a service on a different host's agent (for redundant services)</FormHelperText>
|
||||
</FormControl>
|
||||
</>
|
||||
)}
|
||||
{form.check_type === 'http' && (
|
||||
<>
|
||||
@ -591,6 +608,9 @@ export default function HostDetailPage() {
|
||||
const [reissueLoading, setReissueLoading] = useState(false)
|
||||
const [reissueError, setReissueError] = useState<string | null>(null)
|
||||
|
||||
// Hosts list for target_host_id dropdown
|
||||
const [hosts, setHosts] = useState<{ id: string; display_name: string; fqdn: string }[]>([])
|
||||
|
||||
// ── Fetch host ────────────────────────────────────────────────────────────
|
||||
useEffect(() => {
|
||||
if (id === 'new') { setLoading(false); return }
|
||||
@ -599,6 +619,13 @@ export default function HostDetailPage() {
|
||||
.catch(() => setError('Host not found or access denied.'))
|
||||
.finally(() => setLoading(false))
|
||||
}, [id])
|
||||
|
||||
// ── Fetch hosts list (for target_host_id dropdown) ──────────────────────
|
||||
useEffect(() => {
|
||||
hostsApi.list()
|
||||
.then(res => setHosts(res.data?.hosts ?? []))
|
||||
.catch(() => { /* ignore */ })
|
||||
}, [])
|
||||
|
||||
// ── Check cert existence ───────────────────────────────────────────────────
|
||||
useEffect(() => {
|
||||
@ -754,6 +781,7 @@ export default function HostDetailPage() {
|
||||
}
|
||||
if (values.check_type === 'service') {
|
||||
body.service_name = values.service_name || undefined
|
||||
body.target_host_id = values.target_host_id || undefined
|
||||
} else {
|
||||
body.url = values.url || undefined
|
||||
body.expected_body = values.expected_body || undefined
|
||||
@ -780,6 +808,7 @@ export default function HostDetailPage() {
|
||||
basic_auth_user: check.basic_auth_user ?? '',
|
||||
basic_auth_pass: '',
|
||||
enabled: check.enabled,
|
||||
target_host_id: check.target_host_id ?? '',
|
||||
})
|
||||
setHcEditOpen(true)
|
||||
}
|
||||
@ -792,6 +821,7 @@ export default function HostDetailPage() {
|
||||
}
|
||||
if (values.check_type === 'service') {
|
||||
body.service_name = values.service_name || undefined
|
||||
body.target_host_id = values.target_host_id || undefined
|
||||
} else {
|
||||
body.url = values.url || undefined
|
||||
body.expected_body = values.expected_body || undefined
|
||||
@ -1145,6 +1175,8 @@ export default function HostDetailPage() {
|
||||
open={hcCreateOpen}
|
||||
title="Add Health Check"
|
||||
initial={hcCreateForm}
|
||||
hosts={hosts}
|
||||
currentHostId={id ?? ''}
|
||||
onClose={() => setHcCreateOpen(false)}
|
||||
onSubmit={handleHcCreateSubmit}
|
||||
/>
|
||||
@ -1152,6 +1184,8 @@ export default function HostDetailPage() {
|
||||
open={hcEditOpen}
|
||||
title="Edit Health Check"
|
||||
initial={hcEditForm}
|
||||
hosts={hosts}
|
||||
currentHostId={id ?? ''}
|
||||
onClose={() => setHcEditOpen(false)}
|
||||
onSubmit={handleHcEditSubmit}
|
||||
/>
|
||||
|
||||
@ -282,6 +282,7 @@ export interface HealthCheck {
|
||||
expected_body?: string
|
||||
ignore_cert_errors: boolean
|
||||
basic_auth_user?: string
|
||||
target_host_id?: string | null
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
@ -313,6 +314,7 @@ export interface CreateHealthCheckRequest {
|
||||
ignore_cert_errors?: boolean
|
||||
basic_auth_user?: string
|
||||
basic_auth_pass?: string
|
||||
target_host_id?: string | null
|
||||
}
|
||||
|
||||
export interface UpdateHealthCheckRequest {
|
||||
@ -324,4 +326,5 @@ export interface UpdateHealthCheckRequest {
|
||||
ignore_cert_errors?: boolean
|
||||
basic_auth_user?: string
|
||||
basic_auth_pass?: string
|
||||
target_host_id?: string | null
|
||||
}
|
||||
|
||||
10
migrations/011_health_check_target_host.sql
Normal file
10
migrations/011_health_check_target_host.sql
Normal file
@ -0,0 +1,10 @@
|
||||
-- Add target_host_id to health checks, allowing a check on Host A
|
||||
-- to query a service on Host B's agent (for redundant services).
|
||||
-- NULL = check own host (backward compatible).
|
||||
-- FK with ON DELETE SET NULL: if the target host is deleted, the check reverts to querying its own host.
|
||||
|
||||
ALTER TABLE host_health_checks
|
||||
ADD COLUMN target_host_id UUID REFERENCES hosts(id) ON DELETE SET NULL;
|
||||
|
||||
CREATE INDEX idx_health_checks_target_host ON host_health_checks (target_host_id)
|
||||
WHERE target_host_id IS NOT NULL;
|
||||
290
tasks/todo.md
290
tasks/todo.md
@ -1,253 +1,61 @@
|
||||
# Health Check Configuration for Hosts — Feature Plan
|
||||
# Target Host for Service Health Checks
|
||||
|
||||
## Overview
|
||||
Each host can have 1-5 health checks. During maintenance windows, all checks must be healthy before patch execution. Health checks are also continuously polled for dashboard status.
|
||||
Add a `target_host_id` field to service health checks, allowing a check configured on Host A to query a service on Host B's agent. This is useful for redundant services running on multiple machines.
|
||||
|
||||
## Design Decisions (confirmed with Kelly)
|
||||
- Health check config lives in manager DB, manager controls everything
|
||||
- Agent is just an action endpoint (no health check logic on agent)
|
||||
- Admins and operators can manage (operators need matching group)
|
||||
- Per-host 1:1 (not shareable templates)
|
||||
- Both continuous polling AND must be healthy before patch execution
|
||||
- Service health: query agent `GET /api/v1/system/services/{name}`, check `healthy` field
|
||||
- HTTP health: manager makes direct HTTP request, substring match in response body
|
||||
- Retry failed checks at 5-minute intervals until maintenance window closes
|
||||
- 10-second check timeout; no response = failed
|
||||
- Order doesn't matter
|
||||
- Basic auth password: encrypted in DB with per-install app key
|
||||
- Health check poll interval: 5 minutes
|
||||
- Result retention: 4 days (time-based)
|
||||
**Design:** `target_host_id` is nullable. When it is NULL (the default), behavior is unchanged: the check queries its own host's agent. When it is set, the service check queries the target host's agent instead. This only applies to service checks; HTTP checks already specify a full URL.
|
||||
|
||||
## linux_patch_api Agent Endpoints (Reference)
|
||||
## Implementation Checklist
|
||||
|
||||
### GET /api/v1/system/services/{name}
|
||||
Returns service status for health check Type 1 (service). Added in commit 8b6d9ed.
|
||||
### 1. Database Migration
|
||||
- [ ] Create `migrations/011_health_check_target_host.sql`
|
||||
- [ ] Add `target_host_id UUID REFERENCES hosts(id) ON DELETE SET NULL` column
|
||||
- [ ] Add partial index on `target_host_id` where NOT NULL
|
||||
|
||||
**Response:** `ApiResponse<ServiceStatusData>`
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"name": "nginx",
|
||||
"display_name": "A high performance web server",
|
||||
"active_state": "active",
|
||||
"sub_state": "running",
|
||||
"load_state": "loaded",
|
||||
"enabled_state": "enabled",
|
||||
"main_pid": 1234,
|
||||
"healthy": true
|
||||
}
|
||||
}
|
||||
```
|
||||
### 2. Backend Models (`crates/pm-core/src/models.rs`)
|
||||
- [ ] Add `target_host_id: Option<Uuid>` to `HealthCheck` struct
|
||||
- [ ] Add `target_host_id: Option<Uuid>` to `CreateHealthCheckRequest`
|
||||
- [ ] Add `target_host_id: Option<Uuid>` to `UpdateHealthCheckRequest`
|
||||
- [ ] Add `target_host_id` to all HealthCheck SELECT queries
|
||||
|
||||
**Health determination:** The agent `healthy` field is authoritative. Manager uses this boolean directly.
|
||||
### 3. API Routes (`crates/pm-web/src/routes/health_checks.rs`)
|
||||
- [ ] Create: add `target_host_id` to INSERT, validate target host exists + is healthy
|
||||
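- [ ] Update: add `target_host_id` to COALESCE UPDATE (note: `COALESCE($n, target_host_id)` keeps the old value when the request omits the field, so an update cannot reset the target back to NULL / "own host" — clearing needs separate handling)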
|
||||
- [ ] List/Get: add `target_host_id` to SELECT columns
|
||||
- [ ] Test endpoint (`run_service_check`): when `target_host_id` is Some, query that host's IP/port
|
||||
- [ ] Audit log: include `target_host_id` in audit JSON
|
||||
|
||||
**Error responses:** 400 (invalid name), 404 (not found → unhealthy), 500 (error → unhealthy)
|
||||
### 4. Health Check Poller (`crates/pm-worker/src/health_check_poller.rs`)
|
||||
- [ ] Add `target_host_id: Option<Uuid>` to `HealthCheckRow`
|
||||
- [ ] Modify SQL: LEFT JOIN hosts th ON th.id = hc.target_host_id, use COALESCE(th.ip_address, h.ip_address) and COALESCE(th.agent_port, h.agent_port)
|
||||
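- [ ] Add `target_ip_address` and `target_agent_port` fields to HealthCheckRow (only needed if the COALESCE columns above are not used; with COALESCE, the existing `ip_address` / `agent_port` fields already carry the effective target)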
|
||||
- [ ] `run_service_check`: use target host IP/port when available
|
||||
- [ ] `check_host_health_checks`: no change needed (results count toward owning host)
|
||||
|
||||
### GET /api/v1/health
|
||||
Basic agent health. Returns `{"status": "healthy"}`. Used for connectivity checks, not host health checks.
|
||||
### 5. Frontend Types (`frontend/src/types/index.ts`)
|
||||
- [ ] Add `target_host_id?: string | null` to `HealthCheck`
|
||||
- [ ] Add `target_host_id?: string | null` to `CreateHealthCheckRequest`
|
||||
- [ ] Add `target_host_id?: string | null` to `UpdateHealthCheckRequest`
|
||||
|
||||
---
|
||||
### 6. Frontend Form (`frontend/src/pages/HostDetailPage.tsx`)
|
||||
- [ ] Add `target_host_id: string` to `HealthCheckFormValues`
|
||||
- [ ] Add `target_host_id: ''` to `defaultHealthCheckForm`
|
||||
- [ ] Add host selector dropdown in `HealthCheckFormDialog` (visible when check_type === 'service')
|
||||
- [ ] Fetch hosts list for dropdown (use hostsApi.list or a dedicated endpoint)
|
||||
- [ ] `handleHcCreateSubmit`: include `target_host_id: values.target_host_id || undefined`
|
||||
- [ ] `handleHcEditClick`: map `check.target_host_id ?? ''` to form
|
||||
- [ ] `handleHcEditSubmit`: include `target_host_id` in UpdateHealthCheckRequest
|
||||
- [ ] Display target host in health checks table Target column
|
||||
|
||||
## Phase 1: Database Schema
|
||||
### 7. Build, Test, Deploy
|
||||
- [ ] Run `cargo fmt --all` + `cargo clippy` + `cargo test`
|
||||
- [ ] Run frontend build + ESLint + tsc
|
||||
- [ ] Commit and push through CI pipeline
|
||||
- [ ] Tag release, build .deb, deploy to dev
|
||||
|
||||
### New table: `host_health_checks`
|
||||
```sql
|
||||
CREATE TABLE host_health_checks (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
host_id UUID NOT NULL REFERENCES hosts(id) ON DELETE CASCADE,
|
||||
name VARCHAR(100) NOT NULL,
|
||||
check_type VARCHAR(20) NOT NULL CHECK (check_type IN ('service', 'http')),
|
||||
enabled BOOLEAN NOT NULL DEFAULT true,
|
||||
-- Service check fields
|
||||
service_name VARCHAR(200),
|
||||
-- HTTP check fields
|
||||
url TEXT,
|
||||
expected_body VARCHAR(500),
|
||||
ignore_cert_errors BOOLEAN DEFAULT true,
|
||||
basic_auth_user VARCHAR(100),
|
||||
basic_auth_pass_encrypted BYTEA, -- AES-256-GCM encrypted with per-install key
|
||||
basic_auth_pass_nonce BYTEA, -- nonce for AES-GCM
|
||||
-- Metadata
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
CONSTRAINT valid_service_check CHECK (
|
||||
(check_type = 'service' AND service_name IS NOT NULL AND url IS NULL)
|
||||
OR
|
||||
(check_type = 'http' AND url IS NOT NULL AND expected_body IS NOT NULL AND service_name IS NULL)
|
||||
)
|
||||
);
|
||||
|
||||
CREATE INDEX idx_health_checks_host ON host_health_checks (host_id);
|
||||
```
|
||||
|
||||
### New table: `host_health_check_results`
|
||||
```sql
|
||||
CREATE TABLE host_health_check_results (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
check_id UUID NOT NULL REFERENCES host_health_checks(id) ON DELETE CASCADE,
|
||||
healthy BOOLEAN NOT NULL,
|
||||
detail TEXT,
|
||||
latency_ms INTEGER,
|
||||
checked_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_health_results_check ON host_health_check_results (check_id, checked_at DESC);
|
||||
```
|
||||
|
||||
### Encryption key storage
|
||||
- Per-install app key stored at `/etc/patch-manager/keys/health-check.key`
|
||||
- 256-bit random key generated on first startup if not present
|
||||
- File permissions: 0600, owned by patch-manager user
|
||||
- Used for AES-256-GCM encryption of basic_auth_pass
|
||||
|
||||
- [ ] Create migration 007_health_checks.sql
|
||||
- [ ] Add models to pm-core/src/models.rs
|
||||
- [ ] Add encryption utility to pm-core
|
||||
- [ ] Verify migration runs on dev LXC
|
||||
|
||||
---
|
||||
|
||||
## Phase 2: Backend API Routes
|
||||
|
||||
### Endpoints
|
||||
- `GET /api/v1/hosts/{id}/health-checks` — list health checks for host (RBAC scoped)
|
||||
- `POST /api/v1/hosts/{id}/health-checks` — create health check (max 5 per host)
|
||||
- `PUT /api/v1/hosts/{id}/health-checks/{check_id}` — update health check
|
||||
- `DELETE /api/v1/hosts/{id}/health-checks/{check_id}` — delete health check
|
||||
- `POST /api/v1/hosts/{id}/health-checks/{check_id}/test` — run check immediately, return result
|
||||
|
||||
### Request/Response types
|
||||
```rust
|
||||
struct CreateHealthCheckRequest {
|
||||
name: String,
|
||||
check_type: String, // "service" or "http"
|
||||
service_name: Option<String>,
|
||||
url: Option<String>,
|
||||
expected_body: Option<String>,
|
||||
ignore_cert_errors: Option<bool>,
|
||||
basic_auth_user: Option<String>,
|
||||
basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
|
||||
}
|
||||
|
||||
struct HealthCheck {
|
||||
id: Uuid,
|
||||
host_id: Uuid,
|
||||
name: String,
|
||||
check_type: String,
|
||||
enabled: bool,
|
||||
service_name: Option<String>,
|
||||
url: Option<String>,
|
||||
expected_body: Option<String>,
|
||||
ignore_cert_errors: bool,
|
||||
basic_auth_user: Option<String>,
|
||||
// basic_auth_pass NOT returned in responses
|
||||
last_result: Option<HealthCheckResult>,
|
||||
created_at: DateTime<Utc>,
|
||||
updated_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
struct HealthCheckResult {
|
||||
healthy: bool,
|
||||
detail: Option<String>,
|
||||
latency_ms: Option<i32>,
|
||||
checked_at: DateTime<Utc>,
|
||||
}
|
||||
```
|
||||
|
||||
- [ ] Add routes to pm-web/src/routes/ (new health_checks.rs)
|
||||
- [ ] Add CRUD operations
|
||||
- [ ] Add RBAC enforcement (admin all, operator matching group)
|
||||
- [ ] Add max-5-per-host validation
|
||||
- [ ] Add /test endpoint that runs check immediately
|
||||
- [ ] Add audit logging for create/update/delete
|
||||
- [ ] Add encryption/decryption for basic_auth_pass
|
||||
|
||||
---
|
||||
|
||||
## Phase 3: Worker Health Check Engine
|
||||
|
||||
### Continuous Polling
|
||||
- New task in pm-worker: `health_check_poller`
|
||||
- Polls all enabled health checks every 5 minutes
|
||||
- For service checks: call agent `GET /api/v1/system/services/{name}` via mTLS, check `healthy` field
|
||||
- For HTTP checks: make direct HTTP(S) request from manager, check status code + substring match
|
||||
- Store results in `host_health_check_results`
|
||||
- Prune results older than 4 days
|
||||
|
||||
### Pre-Patch Execution Gate
|
||||
- When a patch job is about to execute on a host:
|
||||
1. Check if host has any enabled health checks
|
||||
2. If yes, verify all are currently healthy (from latest poll result)
|
||||
3. If any are unhealthy:
|
||||
- Wait and retry at 5-minute intervals
|
||||
- Continue until maintenance window closes
|
||||
- If window closes with failed checks, mark job host as failed with detail
|
||||
4. If all healthy, proceed with patch execution
|
||||
|
||||
### HTTP Check Implementation
|
||||
- Use reqwest with:
|
||||
- 10-second timeout
|
||||
- Accept invalid certs (ignore_cert_errors)
|
||||
- Optional basic auth header (decrypt from DB)
|
||||
- Check response body contains expected_body substring
|
||||
- Return healthy=true if match, false otherwise
|
||||
|
||||
- [ ] Add health_check_poller module to pm-worker
|
||||
- [ ] Implement service check via AgentClient
|
||||
- [ ] Implement HTTP check via reqwest
|
||||
- [ ] Add pre-patch execution gate to job_executor
|
||||
- [ ] Add retry loop with 5-minute intervals
|
||||
- [ ] Add maintenance window expiry check
|
||||
- [ ] Add health check config to WorkerConfig (poll interval)
|
||||
- [ ] Add result pruning (4-day retention)
|
||||
|
||||
---
|
||||
|
||||
## Phase 4: Frontend UI
|
||||
|
||||
### Host Detail Page
|
||||
- Add "Health Checks" section below host info
|
||||
- List current health checks with status indicators
|
||||
- Add/Edit/Delete health check dialogs
|
||||
- "Test" button to run check immediately and show result
|
||||
- Visual indicator: green check = healthy, red X = unhealthy, gray = unknown
|
||||
|
||||
### Hosts Page
|
||||
- Add health check summary column or indicator
|
||||
- Show aggregate status: all healthy / some unhealthy / no checks configured
|
||||
|
||||
### Deploy Page
|
||||
- Show health check status in host selection table
|
||||
- Warn if any selected hosts have unhealthy checks
|
||||
|
||||
### Job Detail
|
||||
- Show health check gate status when job is waiting for healthy checks
|
||||
- Display which checks are passing/failing
|
||||
|
||||
- [ ] Add HealthCheck types to frontend/src/types/index.ts
|
||||
- [ ] Add health check API calls to frontend/src/api/client.ts
|
||||
- [ ] Add Health Checks section to HostDetailPage.tsx
|
||||
- [ ] Add health check status to HostsPage.tsx
|
||||
- [ ] Add health check indicators to PatchDeploymentPage.tsx
|
||||
- [ ] Add health check gate status to JobsPage.tsx detail view
|
||||
|
||||
---
|
||||
|
||||
## Phase 5: Integration & Testing
|
||||
- [ ] Build and deploy to dev LXC
|
||||
- [ ] Test service health check against dev LXC agent
|
||||
- [ ] Test HTTP health check against internal services
|
||||
- [ ] Test pre-patch gate: deploy with failing check, verify retry behavior
|
||||
- [ ] Test maintenance window expiry with failing checks
|
||||
- [ ] Test RBAC: operator can only manage checks in their group
|
||||
- [ ] Test max 5 checks per host enforcement
|
||||
- [ ] Test basic auth encryption/decryption
|
||||
- [ ] Push to Gitea
|
||||
|
||||
---
|
||||
|
||||
## Resolved Items
|
||||
- ~~Basic auth password storage~~ → Encrypted in DB with per-install app key (AES-256-GCM)
|
||||
- ~~Health check poll interval~~ → 5 minutes
|
||||
- ~~Result retention~~ → 4 days (time-based)
|
||||
## Design Decisions
|
||||
- `target_host_id` is nullable — NULL = check own host (backward compatible)
|
||||
- FK with ON DELETE SET NULL — if target host deleted, revert to default
|
||||
- Only applies to service checks (HTTP checks already have full URL)
|
||||
- Health gate: results count toward the owning host, not the target host
|
||||
- No RBAC check is required on the target host; the only requirements are that the target host exists in the manager and is currently healthy
|
||||
|
||||
Reference in New Issue
Block a user