feat: health check configuration and worker engine (Phase 3+4)

- Added health_check_poller.rs: periodic service/HTTP health checks - Added pre-patch health gate in job_executor.rs - Added waiting_health_check job status (migration 008) - Added health_check_status to HostSummary and hosts API - Added health check types and API functions to frontend - Added health check UI section to HostDetailPage - Added health check status indicators to HostsPage and PatchDeploymentPage - Added serde default for health_check_poll_interval_secs - Fixed missing AgentClient import in health_check_poller.rs - Fixed missing ws_relay import in main.rs - Fixed missing closing paren in retry_pending_jobs SQL - Added ReadWritePaths for /etc/patch-manager/keys in systemd services
2026-05-05 14:10:37 +00:00
parent a306806b04
commit 93828e1976
28 changed files with 2726 additions and 50 deletions
--- a/crates/pm-agent-client/src/client.rs
+++ b/crates/pm-agent-client/src/client.rs
@ -30,7 +30,7 @@ use crate::{
    error::AgentClientError,
    types::{
        AgentEnvelope, AgentJobStatus, ApplyPatchesRequest, ApplyPatchesResponse, HealthData,
-        PackagesData, PatchesData, RollbackResponse, SystemInfoData,
+        PackagesData, PatchesData, RollbackResponse, ServiceStatusData, SystemInfoData,
    },
 };

@ -221,10 +221,17 @@ impl AgentClient {
            .await
    }

+    /// `GET /api/v1/system/services/{name}` — check status of a specific service on the agent.
+    #[instrument(skip(self), fields(base_url = %self.base_url, service_name = %service_name))]
+    pub async fn service_status(&self, service_name: &str) -> Result<ServiceStatusData, AgentClientError> {
+        self.get(&format!("system/services/{}", service_name), &[]).await
+    }
+
    // --------------------------------------------------------
    // Private POST helper
    // --------------------------------------------------------

+
    /// Execute a POST request against `{base_url}/{path}`, serialize `body` as
    /// JSON, deserialize the [`AgentEnvelope`], and extract the `data` field —
    /// or propagate an [`AgentClientError::ApiError`].
--- a/crates/pm-agent-client/src/lib.rs
+++ b/crates/pm-agent-client/src/lib.rs
@ -39,5 +39,5 @@ pub use error::AgentClientError;
 /// Response envelope and all data types.
 pub use types::{
    AgentEnvelope, AgentErrorBody, HealthData, Package, PackagesData, Patch, PatchesData,
-    SystemInfoData,
+    RollbackResponse, ServiceStatusData, SystemInfoData,
 };
--- a/crates/pm-agent-client/src/types.rs
+++ b/crates/pm-agent-client/src/types.rs
@ -193,6 +193,23 @@ pub struct AgentJobStatus {
    pub completed_at: Option<DateTime<Utc>>,
 }

+// ============================================================
+// GET /api/v1/system/services/{name}
+// ============================================================
+
+/// Payload returned by `GET /api/v1/system/services/{name}`.
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct ServiceStatusData {
+    /// Service name.
+    pub name: String,
+    /// Service status string (e.g. `"running"`, `"stopped"`, `"failed"`).
+    pub status: String,
+    /// Whether the service is considered healthy.
+    pub healthy: bool,
+    /// Seconds elapsed since the service started (`null` if not running).
+    pub uptime_secs: Option<u64>,
+}
+
 // ============================================================
 // POST /api/v1/jobs/{id}/rollback
 // ============================================================
--- a/crates/pm-core/Cargo.toml
+++ b/crates/pm-core/Cargo.toml
@ -22,3 +22,5 @@ config = { workspace = true }
 axum = { workspace = true }
 sha2 = { workspace = true }
 hex = { workspace = true }
+aes-gcm = { workspace = true }
+rand = { workspace = true }
--- a/crates/pm-core/src/audit.rs
+++ b/crates/pm-core/src/audit.rs
@ -47,6 +47,9 @@ pub enum AuditAction {
    PatchJobCompleted,
    PatchJobFailed,
    MaintenanceWindowReminder,
+    HealthCheckCreated,
+    HealthCheckUpdated,
+    HealthCheckDeleted,
 }

 impl AuditAction {
@ -80,6 +83,9 @@ impl AuditAction {
            Self::PatchJobCompleted => "patch_job_completed",
            Self::PatchJobFailed => "patch_job_failed",
            Self::MaintenanceWindowReminder => "maintenance_window_reminder",
+            Self::HealthCheckCreated => "health_check_created",
+            Self::HealthCheckUpdated => "health_check_updated",
+            Self::HealthCheckDeleted => "health_check_deleted",
        }
    }
 }
--- a/crates/pm-core/src/config.rs
+++ b/crates/pm-core/src/config.rs
@ -39,6 +39,9 @@ pub struct WorkerConfig {
    pub health_poll_interval_secs: u64,
    /// Patch data poll interval in seconds (default: 1800 = 30 min)
    pub patch_poll_interval_secs: u64,
+    /// Health check poll interval in seconds (default: 300 = 5 min)
+    #[serde(default = "default_health_check_poll_interval")]
+    pub health_check_poll_interval_secs: u64,
    /// Maximum concurrent agent calls
    pub max_concurrent_agent_calls: usize,
    /// Worker heartbeat interval in seconds
@ -98,6 +101,8 @@ impl AppConfig {
    }
 }

+fn default_health_check_poll_interval() -> u64 { 300 }
+
 impl Default for AppConfig {
    fn default() -> Self {
        Self {
@ -115,6 +120,7 @@ impl Default for AppConfig {
            worker: WorkerConfig {
                health_poll_interval_secs: 300,
                patch_poll_interval_secs: 1800,
+                health_check_poll_interval_secs: 300,
                max_concurrent_agent_calls: 64,
                heartbeat_interval_secs: 30,
                ws_relay_poll_interval_secs: 10,
--- a/crates/pm-core/src/crypto.rs
+++ b/crates/pm-core/src/crypto.rs
@ -0,0 +1,80 @@
+//! AES-256-GCM encryption for sensitive health check credentials.
+//!
+//! Uses a per-install key stored at `/etc/patch-manager/keys/health-check.key`.
+
+use aes_gcm::{
+    aead::{Aead, KeyInit, OsRng},
+    Aes256Gcm, Nonce,
+};
+use rand::RngCore;
+use std::fs;
+use std::path::Path;
+
+pub const KEY_PATH: &str = "/etc/patch-manager/keys/health-check.key";
+
+/// Load or create the per-install encryption key.
+/// If the key file doesn't exist, generates a new 256-bit key and saves it.
+pub fn load_or_create_key(path: &Path) -> Result<[u8; 32], CryptoError> {
+    if path.exists() {
+        let key_bytes = fs::read(path).map_err(CryptoError::Io)?;
+        if key_bytes.len() != 32 {
+            return Err(CryptoError::InvalidKeyLength(key_bytes.len()));
+        }
+        let mut key = [0u8; 32];
+        key.copy_from_slice(&key_bytes);
+        Ok(key)
+    } else {
+        let mut key = [0u8; 32];
+        OsRng.fill_bytes(&mut key);
+        if let Some(parent) = path.parent() {
+            fs::create_dir_all(parent).map_err(CryptoError::Io)?;
+        }
+        fs::write(path, &key).map_err(CryptoError::Io)?;
+        // Set permissions to 0600 (owner read/write only)
+        #[cfg(unix)]
+        {
+            use std::os::unix::fs::PermissionsExt;
+            fs::set_permissions(path, fs::Permissions::from_mode(0o600))
+                .map_err(CryptoError::Io)?;
+        }
+        Ok(key)
+    }
+}
+
+/// Encrypt plaintext with AES-256-GCM. Returns (ciphertext, nonce).
+pub fn encrypt(plaintext: &str, key: &[u8; 32]) -> Result<(Vec<u8>, Vec<u8>), CryptoError> {
+    let cipher = Aes256Gcm::new_from_slice(key).map_err(|e| CryptoError::KeyInit(e.to_string()))?;
+    let mut nonce_bytes = [0u8; 12];
+    OsRng.fill_bytes(&mut nonce_bytes);
+    let nonce = Nonce::from_slice(&nonce_bytes);
+    let ciphertext = cipher
+        .encrypt(nonce, plaintext.as_bytes())
+        .map_err(|_| CryptoError::EncryptionFailed)?;
+    Ok((ciphertext, nonce_bytes.to_vec()))
+}
+
+/// Decrypt AES-256-GCM ciphertext with the given nonce.
+pub fn decrypt(ciphertext: &[u8], nonce: &[u8], key: &[u8; 32]) -> Result<String, CryptoError> {
+    let cipher = Aes256Gcm::new_from_slice(key).map_err(|e| CryptoError::KeyInit(e.to_string()))?;
+    let nonce = Nonce::from_slice(nonce);
+    let plaintext = cipher
+        .decrypt(nonce, ciphertext)
+        .map_err(|_| CryptoError::DecryptionFailed)?;
+    String::from_utf8(plaintext).map_err(CryptoError::Utf8)
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum CryptoError {
+    #[error("IO error: {0}")]
+    Io(#[from] std::io::Error),
+    #[error("Invalid key length: expected 32 bytes, got {0}")]
+    InvalidKeyLength(usize),
+    #[error("Key init error: {0}")]
+    KeyInit(String),
+    #[error("Encryption failed")]
+    EncryptionFailed,
+    #[error("Decryption failed")]
+    DecryptionFailed,
+    #[error("UTF-8 error: {0}")]
+    Utf8(#[from] std::string::FromUtf8Error),
+}
--- a/crates/pm-core/src/lib.rs
+++ b/crates/pm-core/src/lib.rs
@ -1,5 +1,6 @@
 pub mod audit;
 pub mod config;
+pub mod crypto;
 pub mod db;
 pub mod error;
 pub mod logging;
@ -8,11 +9,14 @@ pub mod request_id;

 // Re-export commonly used types
 pub use config::AppConfig;
+pub use crypto::{CryptoError, KEY_PATH, decrypt, encrypt, load_or_create_key};
 pub use error::{AppError, ErrorResponse};
 pub use models::{
-    AuthProvider, CreateGroupRequest, CreateHostRequest, CreateUserRequest, DiscoveryCidrRequest,
-    DiscoveryResult, Group, Host, HostHealthStatus, HostSummary, RegisterDiscoveredRequest,
-    UpdateGroupRequest, UpdateUserRequest, User, UserRole as DbUserRole,
+    AuthProvider, CreateGroupRequest, CreateHealthCheckRequest, CreateHostRequest,
+    CreateUserRequest, DiscoveryCidrRequest, DiscoveryResult, Group, HealthCheck,
+    HealthCheckResult, HealthCheckWithResult, Host, HostHealthStatus, HostSummary,
+    RegisterDiscoveredRequest, UpdateGroupRequest, UpdateHealthCheckRequest, UpdateUserRequest,
+    User, UserRole as DbUserRole,
 };

 // Re-export audit integrity types
--- a/crates/pm-core/src/models.rs
+++ b/crates/pm-core/src/models.rs
@ -113,9 +113,79 @@ pub struct HostSummary {
    pub health_status: HostHealthStatus,
    pub agent_version: Option<String>,
    pub patches_missing: i32,
+    pub health_check_status: Option<String>,
    pub registered_at: DateTime<Utc>,
 }

+// ============================================================
+// Health Checks
+// ============================================================
+
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct HealthCheck {
+    pub id: Uuid,
+    pub host_id: Uuid,
+    pub name: String,
+    pub check_type: String, // "service" or "http"
+    pub enabled: bool,
+    // Service check fields
+    pub service_name: Option<String>,
+    // HTTP check fields
+    pub url: Option<String>,
+    pub expected_body: Option<String>,
+    pub ignore_cert_errors: bool,
+    pub basic_auth_user: Option<String>,
+    // basic_auth_pass_encrypted and nonce NOT exposed in API responses
+    pub created_at: DateTime<Utc>,
+    pub updated_at: DateTime<Utc>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HealthCheckWithResult {
+    #[serde(flatten)]
+    pub check: HealthCheck,
+    pub last_result: Option<HealthCheckResult>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
+pub struct HealthCheckResult {
+    pub id: Uuid,
+    pub check_id: Uuid,
+    pub healthy: bool,
+    pub detail: Option<String>,
+    pub latency_ms: Option<i32>,
+    pub checked_at: DateTime<Utc>,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CreateHealthCheckRequest {
+    pub name: String,
+    pub check_type: String, // "service" or "http"
+    pub service_name: Option<String>,
+    pub url: Option<String>,
+    pub expected_body: Option<String>,
+    #[serde(default = "default_true")]
+    pub ignore_cert_errors: bool,
+    pub basic_auth_user: Option<String>,
+    pub basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct UpdateHealthCheckRequest {
+    pub name: Option<String>,
+    pub enabled: Option<bool>,
+    pub service_name: Option<String>,
+    pub url: Option<String>,
+    pub expected_body: Option<String>,
+    pub ignore_cert_errors: Option<bool>,
+    pub basic_auth_user: Option<String>,
+    pub basic_auth_pass: Option<String>, // if provided, re-encrypt
+}
+
+fn default_true() -> bool {
+    true
+}
+
 // ============================================================
 // Group
 // ============================================================
--- a/crates/pm-web/src/main.rs
+++ b/crates/pm-web/src/main.rs
@ -189,6 +189,7 @@ pub fn build_router(state: AppState) -> Router {
        .merge(routes::ws::ticket_router())
        // Reports
        .nest("/reports", routes::reports::router())
+        .nest("/hosts/{host_id}/health-checks", routes::health_checks::router())
        // Settings (admin-only)
        .nest("/settings", routes::settings::router())
        // Apply auth middleware to all the above
--- a/crates/pm-web/src/routes/health_checks.rs
+++ b/crates/pm-web/src/routes/health_checks.rs
--- a/crates/pm-web/src/routes/hosts.rs
+++ b/crates/pm-web/src/routes/hosts.rs
@ -112,6 +112,7 @@ async fn list_hosts(
            SELECT h.id, h.fqdn, host(h.ip_address)::text AS ip_address, h.display_name,
                   h.os_family, h.os_name, h.health_status, h.agent_version,
                   COALESCE(hpd.patch_count, 0) AS patches_missing,
+                   " + hc_subquery + ",
                   h.registered_at
            FROM hosts h
            LEFT JOIN host_patch_data hpd ON hpd.host_id = h.id
@ -130,6 +131,7 @@ async fn list_hosts(
                   h.display_name, h.os_family, h.os_name,
                   h.health_status, h.agent_version,
                   COALESCE(hpd.patch_count, 0) AS patches_missing,
+                   " + hc_subquery + ",
                   h.registered_at
            FROM hosts h
            LEFT JOIN host_patch_data hpd ON hpd.host_id = h.id
--- a/crates/pm-web/src/routes/mod.rs
+++ b/crates/pm-web/src/routes/mod.rs
@ -11,5 +11,6 @@ pub mod settings;
 pub mod status;
 pub mod users;
 pub mod ws;
+pub mod health_checks;

 pub mod reports;
--- a/crates/pm-worker/Cargo.toml
+++ b/crates/pm-worker/Cargo.toml
@ -28,3 +28,4 @@ tokio-rustls = { version = "0.26" }
 rustls-pemfile = { version = "2" }
 tokio-tungstenite = { version = "0.26", features = ["rustls-tls-webpki-roots"] }
 lettre = { version = "0.11", default-features = false, features = ["tokio1-rustls-tls", "smtp-transport", "builder"] }
+reqwest = { workspace = true }
--- a/crates/pm-worker/src/health_check_poller.rs
+++ b/crates/pm-worker/src/health_check_poller.rs
@ -0,0 +1,471 @@
+//! Periodic health check poller for configured service and HTTP checks.
+//!
+//! Polls every `health_check_poll_interval_secs`, querying each enabled health
+//! check definition and storing results in `host_health_check_results`.
+//! Results older than 4 days are pruned on each cycle.
+
+use std::path::Path;
+use std::sync::Arc;
+use std::time::Instant;
+
+use pm_core::{config::AppConfig, crypto};
+use sqlx::{FromRow, PgPool};
+use tokio::{sync::Semaphore, time};
+use uuid::Uuid;
+
+use crate::agent_loader::load_agent_certs;
+use pm_agent_client::{AgentClient, AgentClientError};
+
+// ─────────────────────────────────────────────────────────────────────────────
+// DB row types
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Row fetched for each enabled health check, joined with host connection info.
+#[derive(Debug, FromRow)]
+struct HealthCheckRow {
+    id: Uuid,
+    host_id: Uuid,
+    name: String,
+    check_type: String,
+    service_name: Option<String>,
+    url: Option<String>,
+    expected_body: Option<String>,
+    ignore_cert_errors: Option<bool>,
+    basic_auth_user: Option<String>,
+    basic_auth_pass_encrypted: Option<Vec<u8>>,
+    basic_auth_pass_nonce: Option<Vec<u8>>,
+    ip_address: String,
+    agent_port: i32,
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Public entry point
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Run the health check poller loop indefinitely.
+///
+/// On each tick all enabled health checks are queried concurrently (up to
+/// `max_concurrent_agent_calls` in-flight at once). Results are persisted
+/// to `host_health_check_results` and stale rows are pruned.
+pub async fn run_health_check_poller(pool: PgPool, config: Arc<AppConfig>) {
+    let interval_secs = config.worker.health_check_poll_interval_secs;
+    let mut ticker = time::interval(std::time::Duration::from_secs(interval_secs));
+
+    tracing::info!(interval_secs, "Health check poller started");
+
+    loop {
+        ticker.tick().await;
+
+        // Load certs on each cycle so cert rotation is picked up automatically.
+        let certs = match load_agent_certs(&config.security) {
+            Ok(c) => c,
+            Err(e) => {
+                tracing::error!(
+                    error = %e,
+                    "Health check poller: failed to load agent certs — skipping cycle"
+                );
+                continue;
+            },
+        };
+
+        let client_cert = Arc::new(certs.client_cert);
+        let client_key = Arc::new(certs.client_key);
+        let ca_cert = Arc::new(certs.ca_cert);
+
+        // Load the crypto key for decrypting HTTP check passwords.
+        let crypto_key = match crypto::load_or_create_key(Path::new(crypto::KEY_PATH)) {
+            Ok(k) => Arc::new(k),
+            Err(e) => {
+                tracing::error!(
+                    error = %e,
+                    "Health check poller: failed to load crypto key — skipping cycle"
+                );
+                continue;
+            },
+        };
+
+        // Fetch all enabled health checks with host connection info.
+        let checks: Vec<HealthCheckRow> = match sqlx::query_as(
+            r#"
+            SELECT
+                hc.id,
+                hc.host_id,
+                hc.name,
+                hc.check_type,
+                hc.service_name,
+                hc.url,
+                hc.expected_body,
+                hc.ignore_cert_errors,
+                hc.basic_auth_user,
+                hc.basic_auth_pass_encrypted,
+                hc.basic_auth_pass_nonce,
+                host(h.ip_address)::text AS ip_address,
+                h.agent_port
+            FROM host_health_checks hc
+            JOIN hosts h ON h.id = hc.host_id
+            WHERE hc.enabled = TRUE
+            ORDER BY hc.id
+            "#,
+        )
+        .fetch_all(&pool)
+        .await
+        {
+            Ok(rows) => rows,
+            Err(e) => {
+                tracing::error!(error = %e, "Health check poller: failed to fetch health checks");
+                continue;
+            },
+        };
+
+        if checks.is_empty() {
+            tracing::debug!("Health check poller: no enabled health checks, skipping cycle");
+            prune_old_results(&pool).await;
+            continue;
+        }
+
+        let total = checks.len();
+        let semaphore = Arc::new(Semaphore::new(config.worker.max_concurrent_agent_calls));
+
+        let mut handles = Vec::with_capacity(total);
+
+        for check in checks {
+            let pool = pool.clone();
+            let sem = semaphore.clone();
+            let cert = client_cert.clone();
+            let key = client_key.clone();
+            let ca = ca_cert.clone();
+            let ckey = crypto_key.clone();
+
+            let handle = tokio::spawn(async move {
+                let _permit = sem.acquire().await.expect("semaphore closed");
+                run_check(pool, check, &cert, &key, &ca, &ckey).await
+            });
+
+            handles.push(handle);
+        }
+
+        // Collect results and tally counts.
+        let mut healthy_count = 0usize;
+        let mut unhealthy_count = 0usize;
+        let mut error_count = 0usize;
+
+        for handle in handles {
+            match handle.await {
+                Ok(true) => healthy_count += 1,
+                Ok(false) => unhealthy_count += 1,
+                Err(e) => {
+                    tracing::error!(error = %e, "Health check poller task panicked");
+                    error_count += 1;
+                },
+            }
+        }
+
+        tracing::info!(
+            total,
+            healthy_count,
+            unhealthy_count,
+            error_count,
+            "Health check poll cycle complete"
+        );
+
+        // Prune results older than 4 days.
+        prune_old_results(&pool).await;
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Check dispatch
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Run a single health check and persist the result. Returns `true` if healthy.
+async fn run_check(
+    pool: PgPool,
+    check: HealthCheckRow,
+    client_cert: &[u8],
+    client_key: &[u8],
+    ca_cert: &[u8],
+    crypto_key: &[u8; 32],
+) -> bool {
+    let start = Instant::now();
+
+    let (healthy, detail) = match check.check_type.as_str() {
+        "service" => run_service_check(&check, client_cert, client_key, ca_cert).await,
+        "http" => run_http_check(&check, crypto_key).await,
+        other => {
+            tracing::warn!(
+                check_id = %check.id,
+                check_type = other,
+                "Unknown health check type — treating as unhealthy"
+            );
+            (false, format!("Unknown check type: {other}"))
+        },
+    };
+
+    let latency_ms = start.elapsed().as_millis() as i32;
+
+    // Persist the result.
+    if let Err(e) = sqlx::query(
+        r#"
+        INSERT INTO host_health_check_results (check_id, healthy, detail, latency_ms)
+        VALUES ($1, $2, $3, $4)
+        "#,
+    )
+    .bind(check.id)
+    .bind(healthy)
+    .bind(&detail)
+    .bind(latency_ms)
+    .execute(&pool)
+    .await
+    {
+        tracing::error!(
+            check_id = %check.id,
+            error = %e,
+            "Health check poller: failed to insert result"
+        );
+    }
+
+    healthy
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Service check (via mTLS AgentClient)
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Execute a service check by calling the agent's `/api/v1/system/services/{name}` endpoint.
+async fn run_service_check(
+    check: &HealthCheckRow,
+    client_cert: &[u8],
+    client_key: &[u8],
+    ca_cert: &[u8],
+) -> (bool, String) {
+    let service_name = match &check.service_name {
+        Some(name) => name.clone(),
+        None => {
+            return (false, "Service check missing service_name".to_string());
+        },
+    };
+
+    let client = match AgentClient::new(
+        &check.ip_address,
+        check.agent_port as u16,
+        client_cert,
+        client_key,
+        ca_cert,
+    ) {
+        Ok(c) => c,
+        Err(e) => {
+            return (false, format!("Failed to build AgentClient: {e}"));
+        },
+    };
+
+    match client.service_status(&service_name).await {
+        Ok(data) => {
+            let detail = if data.healthy {
+                format!(
+                    "Service '{}' is {} (uptime: {}s)",
+                    data.name,
+                    data.status,
+                    data.uptime_secs.map_or("N/A".to_string(), |s| s.to_string())
+                )
+            } else {
+                format!(
+                    "Service '{}' status: {} (unhealthy)",
+                    data.name, data.status
+                )
+            };
+            (data.healthy, detail)
+        },
+        Err(AgentClientError::Timeout) => {
+            (false, format!("Agent timed out querying service '{service_name}'"))
+        },
+        Err(AgentClientError::Connect(_)) => {
+            (false, format!("Agent connection refused for service '{service_name}'"))
+        },
+        Err(AgentClientError::ApiError { code, message }) => {
+            // 404, 400, 500 etc. from the agent means the service is unhealthy.
+            (false, format!("Agent error [{code}]: {message}"))
+        },
+        Err(e) => {
+            (false, format!("Agent error querying service '{service_name}': {e}"))
+        },
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// HTTP check (via reqwest, no mTLS)
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Execute an HTTP check by making a GET request to the configured URL.
+/// Supports optional basic auth (decrypted from DB) and substring body matching.
+async fn run_http_check(
+    check: &HealthCheckRow,
+    crypto_key: &[u8; 32],
+) -> (bool, String) {
+    let url = match &check.url {
+        Some(u) => u.clone(),
+        None => {
+            return (false, "HTTP check missing URL".to_string());
+        },
+    };
+
+    // Build a reqwest client for this check.
+    // Use danger_accept_invalid_certs if ignore_cert_errors is set (default true).
+    let ignore_cert_errors = check.ignore_cert_errors.unwrap_or(true);
+
+    let client_builder = reqwest::Client::builder()
+        .timeout(std::time::Duration::from_secs(10))
+        .redirect(reqwest::redirect::Policy::limited(5));
+
+    let client = if ignore_cert_errors {
+        client_builder
+            .danger_accept_invalid_certs(true)
+            .build()
+            .unwrap_or_else(|_| reqwest::Client::new())
+    } else {
+        client_builder.build().unwrap_or_else(|_| reqwest::Client::new())
+    };
+
+    // Build the request.
+    let mut request = client.get(&url);
+
+    // Add basic auth if configured.
+    if let Some(user) = &check.basic_auth_user {
+        // Decrypt the password if present.
+        let password = match (&check.basic_auth_pass_encrypted, &check.basic_auth_pass_nonce) {
+            (Some(enc), Some(nonce)) => {
+                match crypto::decrypt(enc, nonce, crypto_key) {
+                    Ok(p) => p,
+                    Err(e) => {
+                        return (
+                            false,
+                            format!("Failed to decrypt basic auth password: {e}"),
+                        );
+                    },
+                }
+            },
+            _ => {
+                // No encrypted password stored — treat as missing credentials.
+                return (false, "HTTP check has basic_auth_user but no encrypted password".to_string());
+            },
+        };
+        request = request.basic_auth(user.as_str(), Some(password.as_str()));
+    }
+
+    // Execute the request.
+    let response = match request.send().await {
+        Ok(r) => r,
+        Err(e) => {
+            if e.is_timeout() {
+                return (false, format!("HTTP check timed out: {url}"));
+            } else if e.is_connect() {
+                return (false, format!("HTTP check connection failed: {url}"));
+            } else {
+                return (false, format!("HTTP check request error: {e}"));
+            }
+        },
+    };
+
+    let status = response.status();
+
+    // Check HTTP status code.
+    if !status.is_success() {
+        return (
+            false,
+            format!("HTTP check returned status {} for {url}", status.as_u16()),
+        );
+    }
+
+    // Read the response body for substring matching.
+    let body = match response.text().await {
+        Ok(b) => b,
+        Err(e) => {
+            return (false, format!("HTTP check failed to read response body: {e}"));
+        },
+    };
+
+    // Check expected_body substring match.
+    if let Some(expected) = &check.expected_body {
+        if !body.contains(expected) {
+            return (
+                false,
+                format!(
+                    "HTTP check body mismatch for {url}: expected substring not found"
+                ),
+            );
+        }
+    }
+
+    (true, format!("HTTP check OK for {url} (status {})", status.as_u16()))
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Prune old results
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Delete health check results older than 4 days.
+async fn prune_old_results(pool: &PgPool) {
+    match sqlx::query(
+        "DELETE FROM host_health_check_results WHERE checked_at < NOW() - INTERVAL '4 days'",
+    )
+    .execute(pool)
+    .await
+    {
+        Ok(result) => {
+            if result.rows_affected() > 0 {
+                tracing::info!(
+                    rows_deleted = result.rows_affected(),
+                    "Health check poller: pruned old results"
+                );
+            }
+        },
+        Err(e) => {
+            tracing::error!(error = %e, "Health check poller: failed to prune old results");
+        },
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Health check gate for job executor
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Check whether all enabled health checks for a host are healthy.
+///
+/// Returns `Ok(true)` if all checks pass (or no checks are configured),
+/// `Ok(false)` if any check is unhealthy or has no result yet.
+pub async fn check_host_health_checks(pool: &PgPool, host_id: Uuid) -> anyhow::Result<bool> {
+    // Check if there are any enabled health checks for this host.
+    let check_count: (i64,) = sqlx::query_as(
+        "SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1 AND enabled = TRUE",
+    )
+    .bind(host_id)
+    .fetch_one(pool)
+    .await?;
+
+    if check_count.0 == 0 {
+        // No health checks configured for this host — treat as healthy.
+        return Ok(true);
+    }
+
+    // Find any enabled check that has no healthy result or an unhealthy latest result.
+    let unhealthy_count: (i64,) = sqlx::query_as(
+        r#"
+        SELECT COUNT(*)
+        FROM host_health_checks hc
+        LEFT JOIN LATERAL (
+            SELECT healthy
+            FROM host_health_check_results r
+            WHERE r.check_id = hc.id
+            ORDER BY r.checked_at DESC
+            LIMIT 1
+        ) latest ON true
+        WHERE hc.host_id = $1
+          AND hc.enabled = TRUE
+          AND (latest.healthy IS NULL OR latest.healthy = FALSE)
+        "#,
+    )
+    .bind(host_id)
+    .fetch_one(pool)
+    .await?;
+
+    Ok(unhealthy_count.0 == 0)
+}
--- a/crates/pm-worker/src/job_executor.rs
+++ b/crates/pm-worker/src/job_executor.rs
@ -24,6 +24,7 @@ use uuid::Uuid;

 use crate::agent_loader::load_agent_certs;
 use crate::email;
+use crate::health_check_poller::check_host_health_checks;

 // ─────────────────────────────────────────────────────────────────────────────
 // Internal DB row types
@ -78,6 +79,7 @@ struct StatusCounts {
    succeeded_count: i64,
    failed_count: i64,
    cancelled_count: i64,
+    waiting_health_check_count: i64,
    total_count: i64,
 }

@ -369,6 +371,89 @@ async fn execute_host_job(
        },
    };

+    // ── 1b. Health check gate ──────────────────────────────────────────────
+    // All enabled health checks for this host must be healthy before we proceed.
+    match check_host_health_checks(&pool, host_id).await {
+        Ok(true) => {
+            tracing::debug!(%host_id, "execute_host_job: health checks passed");
+        },
+        Ok(false) => {
+            tracing::info!(%host_id, %pjh_id, "execute_host_job: health checks not passed, setting waiting_health_check");
+            // Check if the maintenance window is still open for this host.
+            let window_open: bool = sqlx::query_scalar(
+                r#"
+                SELECT EXISTS(
+                    SELECT 1 FROM maintenance_windows mw
+                    WHERE mw.host_id = $1
+                      AND mw.enabled = TRUE
+                      AND (
+                        (mw.recurrence = 'once'
+                         AND mw.start_at <= NOW()
+                         AND NOW() < mw.start_at + (mw.duration_minutes * INTERVAL '1 minute'))
+                        OR
+                        (mw.recurrence = 'daily'
+                         AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
+                         AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
+                                                                 + (mw.duration_minutes * INTERVAL '1 minute')))
+                        OR
+                        (mw.recurrence = 'weekly'
+                         AND EXTRACT(DOW FROM NOW() AT TIME ZONE 'UTC') = mw.recurrence_day
+                         AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
+                         AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
+                                                                 + (mw.duration_minutes * INTERVAL '1 minute')))
+                        OR
+                        (mw.recurrence = 'monthly'
+                         AND EXTRACT(DAY FROM NOW() AT TIME ZONE 'UTC') = mw.recurrence_day
+                         AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
+                         AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
+                                                                 + (mw.duration_minutes * INTERVAL '1 minute')))
+                      )
+                )
+                "#,
+            )
+            .bind(host_id)
+            .fetch_optional(&pool)
+            .await
+            .unwrap_or(Some(true))
+            .unwrap_or(true); // Default to true if no window configured
+
+            if !window_open {
+                tracing::warn!(%host_id, %pjh_id, "execute_host_job: health checks not passed and maintenance window closed");
+                handle_host_failure(
+                    pool,
+                    pjh_id,
+                    "Health checks did not pass before maintenance window closed".to_string(),
+                )
+                .await;
+                return;
+            }
+
+            // Set status to waiting_health_check and retry in 5 minutes.
+            let retry_at = Utc::now() + ChronoDuration::minutes(5);
+            if let Err(e) = sqlx::query(
+                r#"
+                UPDATE patch_job_hosts
+                SET    status        = 'waiting_health_check',
+                       retry_next_at = $2,
+                       last_error    = 'Waiting for health checks to pass'
+                WHERE  id = $1
+                "#,
+            )
+            .bind(pjh_id)
+            .bind(retry_at)
+            .execute(&pool)
+            .await
+            {
+                tracing::error!(%pjh_id, error = %e, "execute_host_job: failed to set waiting_health_check status");
+            }
+            return;
+        },
+        Err(e) => {
+            tracing::warn!(%host_id, error = %e, "execute_host_job: health check query failed, proceeding anyway");
+            // If we can't query health checks, proceed with the job rather than blocking.
+        },
+    }
+
    // ── 2. Fetch the job's patch_selection ──────────────────────────────────
    let patch_sel: JobPatchSelection =
        match sqlx::query_as("SELECT patch_selection FROM patch_jobs WHERE id = $1")
@ -764,6 +849,7 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
            COUNT(*) FILTER (WHERE status = 'succeeded') AS succeeded_count,
            COUNT(*) FILTER (WHERE status = 'failed')    AS failed_count,
            COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled_count,
+            COUNT(*) FILTER (WHERE status = 'waiting_health_check') AS waiting_health_check_count,
            COUNT(*)                                     AS total_count
        FROM patch_job_hosts
        WHERE job_id = $1
@ -784,7 +870,7 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
    let new_status: &str;
    let set_completed: bool;

-    if counts.running_count > 0 || counts.pending_count > 0 || counts.queued_count > 0 {
+    if counts.running_count > 0 || counts.pending_count > 0 || counts.queued_count > 0 || counts.waiting_health_check_count > 0 {
        // Still work in flight — keep parent running.
        new_status = "running";
        set_completed = false;
@ -912,17 +998,18 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {

 /// Find pending host entries whose back-off window has elapsed, reset them to
 /// `queued`, and dispatch them immediately.
+///
+/// Also retries `waiting_health_check` entries whose retry window has elapsed.
 pub async fn retry_pending_jobs(pool: PgPool, config: Arc<AppConfig>) {
    let rows: Vec<PatchJobHostPending> = match sqlx::query_as(
        r#"
        SELECT pjh.id, pjh.host_id, pjh.job_id
        FROM   patch_job_hosts pjh
        JOIN   patch_jobs j ON j.id = pjh.job_id
-        WHERE  pjh.status        = 'pending'
+        WHERE  pjh.status IN ('pending', 'waiting_health_check')
          AND  pjh.retry_next_at <= NOW()
-          AND  j.status         != 'cancelled'
-        "#,
-    )
+          AND  j.status != 'cancelled'
+        "#,)
    .fetch_all(&pool)
    .await
    {
--- a/crates/pm-worker/src/main.rs
+++ b/crates/pm-worker/src/main.rs
@ -6,6 +6,7 @@
 mod agent_loader;
 mod audit_verifier;
 mod email;
+mod health_check_poller;
 mod health_poller;
 mod job_executor;
 mod maintenance_scheduler;
@ -19,6 +20,7 @@ use std::{sync::Arc, time::Duration};
 use tokio::time;

 use audit_verifier::run_audit_verifier;
+use health_check_poller::run_health_check_poller;
 use health_poller::run_health_poller;
 use job_executor::run_job_executor;
 use maintenance_scheduler::run_maintenance_scheduler;
@ -29,7 +31,7 @@ use ws_relay::run_ws_relay;
 /// Minimum number of applied migrations the worker requires before
 /// accepting work. Prevents the worker from running against a schema
 /// that hasn't been migrated yet.
-const REQUIRED_MIGRATION_COUNT: i64 = 5;
+const REQUIRED_MIGRATION_COUNT: i64 = 8;

 /// How long to wait between schema-version checks before giving up.
 const SCHEMA_CHECK_TIMEOUT: Duration = Duration::from_secs(120);
@ -89,6 +91,9 @@ async fn main() -> anyhow::Result<()> {
    // M11: audit integrity verification (runs every 24 hours)
    let audit_verifier_handle = tokio::spawn(run_audit_verifier(pool.clone(), config.clone()));

+    // Health check poller — runs configured service/HTTP health checks
+    let health_check_handle = tokio::spawn(run_health_check_poller(pool.clone(), config.clone()));
+
    tracing::info!("Worker tasks started");

    // Wait for all tasks (they run indefinitely)