Private
Public Access
1
0

feat: health check configuration and worker engine (Phase 3+4)
Some checks failed
CI Pipeline / Rust Format Check (push) Failing after 4s
CI Pipeline / Clippy Lints (push) Successful in 46s
CI Pipeline / Rust Unit Tests (push) Successful in 1m1s
CI Pipeline / Security Audit (push) Successful in 4s
CI Pipeline / Frontend Lint & Type Check (push) Failing after 10s
CI Pipeline / Build .deb & Release (push) Has been skipped

- Added health_check_poller.rs: periodic service/HTTP health checks
- Added pre-patch health gate in job_executor.rs
- Added waiting_health_check job status (migration 008)
- Added health_check_status to HostSummary and hosts API
- Added health check types and API functions to frontend
- Added health check UI section to HostDetailPage
- Added health check status indicators to HostsPage and PatchDeploymentPage
- Added serde default for health_check_poll_interval_secs
- Fixed missing AgentClient import in health_check_poller.rs
- Fixed missing ws_relay import in main.rs
- Fixed missing closing paren in retry_pending_jobs SQL
- Added ReadWritePaths for /etc/patch-manager/keys in systemd services
This commit is contained in:
2026-05-05 14:10:37 +00:00
parent a306806b04
commit 93828e1976
28 changed files with 2726 additions and 50 deletions

View File

@ -30,7 +30,7 @@ use crate::{
error::AgentClientError,
types::{
AgentEnvelope, AgentJobStatus, ApplyPatchesRequest, ApplyPatchesResponse, HealthData,
PackagesData, PatchesData, RollbackResponse, SystemInfoData,
PackagesData, PatchesData, RollbackResponse, ServiceStatusData, SystemInfoData,
},
};
@ -221,10 +221,17 @@ impl AgentClient {
.await
}
/// `GET /api/v1/system/services/{name}` — check status of a specific service on the agent.
#[instrument(skip(self), fields(base_url = %self.base_url, service_name = %service_name))]
pub async fn service_status(&self, service_name: &str) -> Result<ServiceStatusData, AgentClientError> {
self.get(&format!("system/services/{}", service_name), &[]).await
}
// --------------------------------------------------------
// Private POST helper
// --------------------------------------------------------
/// Execute a POST request against `{base_url}/{path}`, serialize `body` as
/// JSON, deserialize the [`AgentEnvelope`], and extract the `data` field —
/// or propagate an [`AgentClientError::ApiError`].

View File

@ -39,5 +39,5 @@ pub use error::AgentClientError;
/// Response envelope and all data types.
pub use types::{
AgentEnvelope, AgentErrorBody, HealthData, Package, PackagesData, Patch, PatchesData,
SystemInfoData,
RollbackResponse, ServiceStatusData, SystemInfoData,
};

View File

@ -193,6 +193,23 @@ pub struct AgentJobStatus {
pub completed_at: Option<DateTime<Utc>>,
}
// ============================================================
// GET /api/v1/system/services/{name}
// ============================================================
/// Payload returned by `GET /api/v1/system/services/{name}`.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ServiceStatusData {
/// Service name.
pub name: String,
/// Service status string (e.g. `"running"`, `"stopped"`, `"failed"`).
pub status: String,
/// Whether the service is considered healthy.
pub healthy: bool,
/// Seconds elapsed since the service started (`null` if not running).
pub uptime_secs: Option<u64>,
}
// ============================================================
// POST /api/v1/jobs/{id}/rollback
// ============================================================

View File

@ -22,3 +22,5 @@ config = { workspace = true }
axum = { workspace = true }
sha2 = { workspace = true }
hex = { workspace = true }
aes-gcm = { workspace = true }
rand = { workspace = true }

View File

@ -47,6 +47,9 @@ pub enum AuditAction {
PatchJobCompleted,
PatchJobFailed,
MaintenanceWindowReminder,
HealthCheckCreated,
HealthCheckUpdated,
HealthCheckDeleted,
}
impl AuditAction {
@ -80,6 +83,9 @@ impl AuditAction {
Self::PatchJobCompleted => "patch_job_completed",
Self::PatchJobFailed => "patch_job_failed",
Self::MaintenanceWindowReminder => "maintenance_window_reminder",
Self::HealthCheckCreated => "health_check_created",
Self::HealthCheckUpdated => "health_check_updated",
Self::HealthCheckDeleted => "health_check_deleted",
}
}
}

View File

@ -39,6 +39,9 @@ pub struct WorkerConfig {
pub health_poll_interval_secs: u64,
/// Patch data poll interval in seconds (default: 1800 = 30 min)
pub patch_poll_interval_secs: u64,
/// Health check poll interval in seconds (default: 300 = 5 min)
#[serde(default = "default_health_check_poll_interval")]
pub health_check_poll_interval_secs: u64,
/// Maximum concurrent agent calls
pub max_concurrent_agent_calls: usize,
/// Worker heartbeat interval in seconds
@ -98,6 +101,8 @@ impl AppConfig {
}
}
fn default_health_check_poll_interval() -> u64 { 300 }
impl Default for AppConfig {
fn default() -> Self {
Self {
@ -115,6 +120,7 @@ impl Default for AppConfig {
worker: WorkerConfig {
health_poll_interval_secs: 300,
patch_poll_interval_secs: 1800,
health_check_poll_interval_secs: 300,
max_concurrent_agent_calls: 64,
heartbeat_interval_secs: 30,
ws_relay_poll_interval_secs: 10,

View File

@ -0,0 +1,80 @@
//! AES-256-GCM encryption for sensitive health check credentials.
//!
//! Uses a per-install key stored at `/etc/patch-manager/keys/health-check.key`.
use aes_gcm::{
aead::{Aead, KeyInit, OsRng},
Aes256Gcm, Nonce,
};
use rand::RngCore;
use std::fs;
use std::path::Path;
pub const KEY_PATH: &str = "/etc/patch-manager/keys/health-check.key";
/// Load or create the per-install encryption key.
/// If the key file doesn't exist, generates a new 256-bit key and saves it.
pub fn load_or_create_key(path: &Path) -> Result<[u8; 32], CryptoError> {
if path.exists() {
let key_bytes = fs::read(path).map_err(CryptoError::Io)?;
if key_bytes.len() != 32 {
return Err(CryptoError::InvalidKeyLength(key_bytes.len()));
}
let mut key = [0u8; 32];
key.copy_from_slice(&key_bytes);
Ok(key)
} else {
let mut key = [0u8; 32];
OsRng.fill_bytes(&mut key);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).map_err(CryptoError::Io)?;
}
fs::write(path, &key).map_err(CryptoError::Io)?;
// Set permissions to 0600 (owner read/write only)
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
fs::set_permissions(path, fs::Permissions::from_mode(0o600))
.map_err(CryptoError::Io)?;
}
Ok(key)
}
}
/// Encrypt plaintext with AES-256-GCM. Returns (ciphertext, nonce).
pub fn encrypt(plaintext: &str, key: &[u8; 32]) -> Result<(Vec<u8>, Vec<u8>), CryptoError> {
let cipher = Aes256Gcm::new_from_slice(key).map_err(|e| CryptoError::KeyInit(e.to_string()))?;
let mut nonce_bytes = [0u8; 12];
OsRng.fill_bytes(&mut nonce_bytes);
let nonce = Nonce::from_slice(&nonce_bytes);
let ciphertext = cipher
.encrypt(nonce, plaintext.as_bytes())
.map_err(|_| CryptoError::EncryptionFailed)?;
Ok((ciphertext, nonce_bytes.to_vec()))
}
/// Decrypt AES-256-GCM ciphertext with the given nonce.
pub fn decrypt(ciphertext: &[u8], nonce: &[u8], key: &[u8; 32]) -> Result<String, CryptoError> {
let cipher = Aes256Gcm::new_from_slice(key).map_err(|e| CryptoError::KeyInit(e.to_string()))?;
let nonce = Nonce::from_slice(nonce);
let plaintext = cipher
.decrypt(nonce, ciphertext)
.map_err(|_| CryptoError::DecryptionFailed)?;
String::from_utf8(plaintext).map_err(CryptoError::Utf8)
}
#[derive(Debug, thiserror::Error)]
pub enum CryptoError {
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
#[error("Invalid key length: expected 32 bytes, got {0}")]
InvalidKeyLength(usize),
#[error("Key init error: {0}")]
KeyInit(String),
#[error("Encryption failed")]
EncryptionFailed,
#[error("Decryption failed")]
DecryptionFailed,
#[error("UTF-8 error: {0}")]
Utf8(#[from] std::string::FromUtf8Error),
}

View File

@ -1,5 +1,6 @@
pub mod audit;
pub mod config;
pub mod crypto;
pub mod db;
pub mod error;
pub mod logging;
@ -8,11 +9,14 @@ pub mod request_id;
// Re-export commonly used types
pub use config::AppConfig;
pub use crypto::{CryptoError, KEY_PATH, decrypt, encrypt, load_or_create_key};
pub use error::{AppError, ErrorResponse};
pub use models::{
AuthProvider, CreateGroupRequest, CreateHostRequest, CreateUserRequest, DiscoveryCidrRequest,
DiscoveryResult, Group, Host, HostHealthStatus, HostSummary, RegisterDiscoveredRequest,
UpdateGroupRequest, UpdateUserRequest, User, UserRole as DbUserRole,
AuthProvider, CreateGroupRequest, CreateHealthCheckRequest, CreateHostRequest,
CreateUserRequest, DiscoveryCidrRequest, DiscoveryResult, Group, HealthCheck,
HealthCheckResult, HealthCheckWithResult, Host, HostHealthStatus, HostSummary,
RegisterDiscoveredRequest, UpdateGroupRequest, UpdateHealthCheckRequest, UpdateUserRequest,
User, UserRole as DbUserRole,
};
// Re-export audit integrity types

View File

@ -113,9 +113,79 @@ pub struct HostSummary {
pub health_status: HostHealthStatus,
pub agent_version: Option<String>,
pub patches_missing: i32,
pub health_check_status: Option<String>,
pub registered_at: DateTime<Utc>,
}
// ============================================================
// Health Checks
// ============================================================
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
pub struct HealthCheck {
pub id: Uuid,
pub host_id: Uuid,
pub name: String,
pub check_type: String, // "service" or "http"
pub enabled: bool,
// Service check fields
pub service_name: Option<String>,
// HTTP check fields
pub url: Option<String>,
pub expected_body: Option<String>,
pub ignore_cert_errors: bool,
pub basic_auth_user: Option<String>,
// basic_auth_pass_encrypted and nonce NOT exposed in API responses
pub created_at: DateTime<Utc>,
pub updated_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealthCheckWithResult {
#[serde(flatten)]
pub check: HealthCheck,
pub last_result: Option<HealthCheckResult>,
}
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
pub struct HealthCheckResult {
pub id: Uuid,
pub check_id: Uuid,
pub healthy: bool,
pub detail: Option<String>,
pub latency_ms: Option<i32>,
pub checked_at: DateTime<Utc>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CreateHealthCheckRequest {
pub name: String,
pub check_type: String, // "service" or "http"
pub service_name: Option<String>,
pub url: Option<String>,
pub expected_body: Option<String>,
#[serde(default = "default_true")]
pub ignore_cert_errors: bool,
pub basic_auth_user: Option<String>,
pub basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UpdateHealthCheckRequest {
pub name: Option<String>,
pub enabled: Option<bool>,
pub service_name: Option<String>,
pub url: Option<String>,
pub expected_body: Option<String>,
pub ignore_cert_errors: Option<bool>,
pub basic_auth_user: Option<String>,
pub basic_auth_pass: Option<String>, // if provided, re-encrypt
}
fn default_true() -> bool {
true
}
// ============================================================
// Group
// ============================================================

View File

@ -189,6 +189,7 @@ pub fn build_router(state: AppState) -> Router {
.merge(routes::ws::ticket_router())
// Reports
.nest("/reports", routes::reports::router())
.nest("/hosts/{host_id}/health-checks", routes::health_checks::router())
// Settings (admin-only)
.nest("/settings", routes::settings::router())
// Apply auth middleware to all the above

File diff suppressed because it is too large Load Diff

View File

@ -112,6 +112,7 @@ async fn list_hosts(
SELECT h.id, h.fqdn, host(h.ip_address)::text AS ip_address, h.display_name,
h.os_family, h.os_name, h.health_status, h.agent_version,
COALESCE(hpd.patch_count, 0) AS patches_missing,
" + hc_subquery + ",
h.registered_at
FROM hosts h
LEFT JOIN host_patch_data hpd ON hpd.host_id = h.id
@ -130,6 +131,7 @@ async fn list_hosts(
h.display_name, h.os_family, h.os_name,
h.health_status, h.agent_version,
COALESCE(hpd.patch_count, 0) AS patches_missing,
" + hc_subquery + ",
h.registered_at
FROM hosts h
LEFT JOIN host_patch_data hpd ON hpd.host_id = h.id

View File

@ -11,5 +11,6 @@ pub mod settings;
pub mod status;
pub mod users;
pub mod ws;
pub mod health_checks;
pub mod reports;

View File

@ -28,3 +28,4 @@ tokio-rustls = { version = "0.26" }
rustls-pemfile = { version = "2" }
tokio-tungstenite = { version = "0.26", features = ["rustls-tls-webpki-roots"] }
lettre = { version = "0.11", default-features = false, features = ["tokio1-rustls-tls", "smtp-transport", "builder"] }
reqwest = { workspace = true }

View File

@ -0,0 +1,471 @@
//! Periodic health check poller for configured service and HTTP checks.
//!
//! Polls every `health_check_poll_interval_secs`, querying each enabled health
//! check definition and storing results in `host_health_check_results`.
//! Results older than 4 days are pruned on each cycle.
use std::path::Path;
use std::sync::Arc;
use std::time::Instant;
use pm_core::{config::AppConfig, crypto};
use sqlx::{FromRow, PgPool};
use tokio::{sync::Semaphore, time};
use uuid::Uuid;
use crate::agent_loader::load_agent_certs;
use pm_agent_client::{AgentClient, AgentClientError};
// ─────────────────────────────────────────────────────────────────────────────
// DB row types
// ─────────────────────────────────────────────────────────────────────────────
/// Row fetched for each enabled health check, joined with host connection info.
#[derive(Debug, FromRow)]
struct HealthCheckRow {
id: Uuid,
host_id: Uuid,
name: String,
check_type: String,
service_name: Option<String>,
url: Option<String>,
expected_body: Option<String>,
ignore_cert_errors: Option<bool>,
basic_auth_user: Option<String>,
basic_auth_pass_encrypted: Option<Vec<u8>>,
basic_auth_pass_nonce: Option<Vec<u8>>,
ip_address: String,
agent_port: i32,
}
// ─────────────────────────────────────────────────────────────────────────────
// Public entry point
// ─────────────────────────────────────────────────────────────────────────────
/// Run the health check poller loop indefinitely.
///
/// On each tick all enabled health checks are queried concurrently (up to
/// `max_concurrent_agent_calls` in-flight at once). Results are persisted
/// to `host_health_check_results` and stale rows are pruned.
pub async fn run_health_check_poller(pool: PgPool, config: Arc<AppConfig>) {
let interval_secs = config.worker.health_check_poll_interval_secs;
let mut ticker = time::interval(std::time::Duration::from_secs(interval_secs));
tracing::info!(interval_secs, "Health check poller started");
loop {
ticker.tick().await;
// Load certs on each cycle so cert rotation is picked up automatically.
let certs = match load_agent_certs(&config.security) {
Ok(c) => c,
Err(e) => {
tracing::error!(
error = %e,
"Health check poller: failed to load agent certs — skipping cycle"
);
continue;
},
};
let client_cert = Arc::new(certs.client_cert);
let client_key = Arc::new(certs.client_key);
let ca_cert = Arc::new(certs.ca_cert);
// Load the crypto key for decrypting HTTP check passwords.
let crypto_key = match crypto::load_or_create_key(Path::new(crypto::KEY_PATH)) {
Ok(k) => Arc::new(k),
Err(e) => {
tracing::error!(
error = %e,
"Health check poller: failed to load crypto key — skipping cycle"
);
continue;
},
};
// Fetch all enabled health checks with host connection info.
let checks: Vec<HealthCheckRow> = match sqlx::query_as(
r#"
SELECT
hc.id,
hc.host_id,
hc.name,
hc.check_type,
hc.service_name,
hc.url,
hc.expected_body,
hc.ignore_cert_errors,
hc.basic_auth_user,
hc.basic_auth_pass_encrypted,
hc.basic_auth_pass_nonce,
host(h.ip_address)::text AS ip_address,
h.agent_port
FROM host_health_checks hc
JOIN hosts h ON h.id = hc.host_id
WHERE hc.enabled = TRUE
ORDER BY hc.id
"#,
)
.fetch_all(&pool)
.await
{
Ok(rows) => rows,
Err(e) => {
tracing::error!(error = %e, "Health check poller: failed to fetch health checks");
continue;
},
};
if checks.is_empty() {
tracing::debug!("Health check poller: no enabled health checks, skipping cycle");
prune_old_results(&pool).await;
continue;
}
let total = checks.len();
let semaphore = Arc::new(Semaphore::new(config.worker.max_concurrent_agent_calls));
let mut handles = Vec::with_capacity(total);
for check in checks {
let pool = pool.clone();
let sem = semaphore.clone();
let cert = client_cert.clone();
let key = client_key.clone();
let ca = ca_cert.clone();
let ckey = crypto_key.clone();
let handle = tokio::spawn(async move {
let _permit = sem.acquire().await.expect("semaphore closed");
run_check(pool, check, &cert, &key, &ca, &ckey).await
});
handles.push(handle);
}
// Collect results and tally counts.
let mut healthy_count = 0usize;
let mut unhealthy_count = 0usize;
let mut error_count = 0usize;
for handle in handles {
match handle.await {
Ok(true) => healthy_count += 1,
Ok(false) => unhealthy_count += 1,
Err(e) => {
tracing::error!(error = %e, "Health check poller task panicked");
error_count += 1;
},
}
}
tracing::info!(
total,
healthy_count,
unhealthy_count,
error_count,
"Health check poll cycle complete"
);
// Prune results older than 4 days.
prune_old_results(&pool).await;
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Check dispatch
// ─────────────────────────────────────────────────────────────────────────────
/// Run a single health check and persist the result. Returns `true` if healthy.
async fn run_check(
pool: PgPool,
check: HealthCheckRow,
client_cert: &[u8],
client_key: &[u8],
ca_cert: &[u8],
crypto_key: &[u8; 32],
) -> bool {
let start = Instant::now();
let (healthy, detail) = match check.check_type.as_str() {
"service" => run_service_check(&check, client_cert, client_key, ca_cert).await,
"http" => run_http_check(&check, crypto_key).await,
other => {
tracing::warn!(
check_id = %check.id,
check_type = other,
"Unknown health check type — treating as unhealthy"
);
(false, format!("Unknown check type: {other}"))
},
};
let latency_ms = start.elapsed().as_millis() as i32;
// Persist the result.
if let Err(e) = sqlx::query(
r#"
INSERT INTO host_health_check_results (check_id, healthy, detail, latency_ms)
VALUES ($1, $2, $3, $4)
"#,
)
.bind(check.id)
.bind(healthy)
.bind(&detail)
.bind(latency_ms)
.execute(&pool)
.await
{
tracing::error!(
check_id = %check.id,
error = %e,
"Health check poller: failed to insert result"
);
}
healthy
}
// ─────────────────────────────────────────────────────────────────────────────
// Service check (via mTLS AgentClient)
// ─────────────────────────────────────────────────────────────────────────────
/// Execute a service check by calling the agent's `/api/v1/system/services/{name}` endpoint.
async fn run_service_check(
check: &HealthCheckRow,
client_cert: &[u8],
client_key: &[u8],
ca_cert: &[u8],
) -> (bool, String) {
let service_name = match &check.service_name {
Some(name) => name.clone(),
None => {
return (false, "Service check missing service_name".to_string());
},
};
let client = match AgentClient::new(
&check.ip_address,
check.agent_port as u16,
client_cert,
client_key,
ca_cert,
) {
Ok(c) => c,
Err(e) => {
return (false, format!("Failed to build AgentClient: {e}"));
},
};
match client.service_status(&service_name).await {
Ok(data) => {
let detail = if data.healthy {
format!(
"Service '{}' is {} (uptime: {}s)",
data.name,
data.status,
data.uptime_secs.map_or("N/A".to_string(), |s| s.to_string())
)
} else {
format!(
"Service '{}' status: {} (unhealthy)",
data.name, data.status
)
};
(data.healthy, detail)
},
Err(AgentClientError::Timeout) => {
(false, format!("Agent timed out querying service '{service_name}'"))
},
Err(AgentClientError::Connect(_)) => {
(false, format!("Agent connection refused for service '{service_name}'"))
},
Err(AgentClientError::ApiError { code, message }) => {
// 404, 400, 500 etc. from the agent means the service is unhealthy.
(false, format!("Agent error [{code}]: {message}"))
},
Err(e) => {
(false, format!("Agent error querying service '{service_name}': {e}"))
},
}
}
// ─────────────────────────────────────────────────────────────────────────────
// HTTP check (via reqwest, no mTLS)
// ─────────────────────────────────────────────────────────────────────────────
/// Execute an HTTP check by making a GET request to the configured URL.
/// Supports optional basic auth (decrypted from DB) and substring body matching.
async fn run_http_check(
check: &HealthCheckRow,
crypto_key: &[u8; 32],
) -> (bool, String) {
let url = match &check.url {
Some(u) => u.clone(),
None => {
return (false, "HTTP check missing URL".to_string());
},
};
// Build a reqwest client for this check.
// Use danger_accept_invalid_certs if ignore_cert_errors is set (default true).
let ignore_cert_errors = check.ignore_cert_errors.unwrap_or(true);
let client_builder = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.redirect(reqwest::redirect::Policy::limited(5));
let client = if ignore_cert_errors {
client_builder
.danger_accept_invalid_certs(true)
.build()
.unwrap_or_else(|_| reqwest::Client::new())
} else {
client_builder.build().unwrap_or_else(|_| reqwest::Client::new())
};
// Build the request.
let mut request = client.get(&url);
// Add basic auth if configured.
if let Some(user) = &check.basic_auth_user {
// Decrypt the password if present.
let password = match (&check.basic_auth_pass_encrypted, &check.basic_auth_pass_nonce) {
(Some(enc), Some(nonce)) => {
match crypto::decrypt(enc, nonce, crypto_key) {
Ok(p) => p,
Err(e) => {
return (
false,
format!("Failed to decrypt basic auth password: {e}"),
);
},
}
},
_ => {
// No encrypted password stored — treat as missing credentials.
return (false, "HTTP check has basic_auth_user but no encrypted password".to_string());
},
};
request = request.basic_auth(user.as_str(), Some(password.as_str()));
}
// Execute the request.
let response = match request.send().await {
Ok(r) => r,
Err(e) => {
if e.is_timeout() {
return (false, format!("HTTP check timed out: {url}"));
} else if e.is_connect() {
return (false, format!("HTTP check connection failed: {url}"));
} else {
return (false, format!("HTTP check request error: {e}"));
}
},
};
let status = response.status();
// Check HTTP status code.
if !status.is_success() {
return (
false,
format!("HTTP check returned status {} for {url}", status.as_u16()),
);
}
// Read the response body for substring matching.
let body = match response.text().await {
Ok(b) => b,
Err(e) => {
return (false, format!("HTTP check failed to read response body: {e}"));
},
};
// Check expected_body substring match.
if let Some(expected) = &check.expected_body {
if !body.contains(expected) {
return (
false,
format!(
"HTTP check body mismatch for {url}: expected substring not found"
),
);
}
}
(true, format!("HTTP check OK for {url} (status {})", status.as_u16()))
}
// ─────────────────────────────────────────────────────────────────────────────
// Prune old results
// ─────────────────────────────────────────────────────────────────────────────
/// Delete health check results older than 4 days.
async fn prune_old_results(pool: &PgPool) {
match sqlx::query(
"DELETE FROM host_health_check_results WHERE checked_at < NOW() - INTERVAL '4 days'",
)
.execute(pool)
.await
{
Ok(result) => {
if result.rows_affected() > 0 {
tracing::info!(
rows_deleted = result.rows_affected(),
"Health check poller: pruned old results"
);
}
},
Err(e) => {
tracing::error!(error = %e, "Health check poller: failed to prune old results");
},
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Health check gate for job executor
// ─────────────────────────────────────────────────────────────────────────────
/// Check whether all enabled health checks for a host are healthy.
///
/// Returns `Ok(true)` if all checks pass (or no checks are configured),
/// `Ok(false)` if any check is unhealthy or has no result yet.
pub async fn check_host_health_checks(pool: &PgPool, host_id: Uuid) -> anyhow::Result<bool> {
// Check if there are any enabled health checks for this host.
let check_count: (i64,) = sqlx::query_as(
"SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1 AND enabled = TRUE",
)
.bind(host_id)
.fetch_one(pool)
.await?;
if check_count.0 == 0 {
// No health checks configured for this host — treat as healthy.
return Ok(true);
}
// Find any enabled check that has no healthy result or an unhealthy latest result.
let unhealthy_count: (i64,) = sqlx::query_as(
r#"
SELECT COUNT(*)
FROM host_health_checks hc
LEFT JOIN LATERAL (
SELECT healthy
FROM host_health_check_results r
WHERE r.check_id = hc.id
ORDER BY r.checked_at DESC
LIMIT 1
) latest ON true
WHERE hc.host_id = $1
AND hc.enabled = TRUE
AND (latest.healthy IS NULL OR latest.healthy = FALSE)
"#,
)
.bind(host_id)
.fetch_one(pool)
.await?;
Ok(unhealthy_count.0 == 0)
}

View File

@ -24,6 +24,7 @@ use uuid::Uuid;
use crate::agent_loader::load_agent_certs;
use crate::email;
use crate::health_check_poller::check_host_health_checks;
// ─────────────────────────────────────────────────────────────────────────────
// Internal DB row types
@ -78,6 +79,7 @@ struct StatusCounts {
succeeded_count: i64,
failed_count: i64,
cancelled_count: i64,
waiting_health_check_count: i64,
total_count: i64,
}
@ -369,6 +371,89 @@ async fn execute_host_job(
},
};
// ── 1b. Health check gate ──────────────────────────────────────────────
// All enabled health checks for this host must be healthy before we proceed.
match check_host_health_checks(&pool, host_id).await {
Ok(true) => {
tracing::debug!(%host_id, "execute_host_job: health checks passed");
},
Ok(false) => {
tracing::info!(%host_id, %pjh_id, "execute_host_job: health checks not passed, setting waiting_health_check");
// Check if the maintenance window is still open for this host.
let window_open: bool = sqlx::query_scalar(
r#"
SELECT EXISTS(
SELECT 1 FROM maintenance_windows mw
WHERE mw.host_id = $1
AND mw.enabled = TRUE
AND (
(mw.recurrence = 'once'
AND mw.start_at <= NOW()
AND NOW() < mw.start_at + (mw.duration_minutes * INTERVAL '1 minute'))
OR
(mw.recurrence = 'daily'
AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
+ (mw.duration_minutes * INTERVAL '1 minute')))
OR
(mw.recurrence = 'weekly'
AND EXTRACT(DOW FROM NOW() AT TIME ZONE 'UTC') = mw.recurrence_day
AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
+ (mw.duration_minutes * INTERVAL '1 minute')))
OR
(mw.recurrence = 'monthly'
AND EXTRACT(DAY FROM NOW() AT TIME ZONE 'UTC') = mw.recurrence_day
AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
+ (mw.duration_minutes * INTERVAL '1 minute')))
)
)
"#,
)
.bind(host_id)
.fetch_optional(&pool)
.await
.unwrap_or(Some(true))
.unwrap_or(true); // Default to true if no window configured
if !window_open {
tracing::warn!(%host_id, %pjh_id, "execute_host_job: health checks not passed and maintenance window closed");
handle_host_failure(
pool,
pjh_id,
"Health checks did not pass before maintenance window closed".to_string(),
)
.await;
return;
}
// Set status to waiting_health_check and retry in 5 minutes.
let retry_at = Utc::now() + ChronoDuration::minutes(5);
if let Err(e) = sqlx::query(
r#"
UPDATE patch_job_hosts
SET status = 'waiting_health_check',
retry_next_at = $2,
last_error = 'Waiting for health checks to pass'
WHERE id = $1
"#,
)
.bind(pjh_id)
.bind(retry_at)
.execute(&pool)
.await
{
tracing::error!(%pjh_id, error = %e, "execute_host_job: failed to set waiting_health_check status");
}
return;
},
Err(e) => {
tracing::warn!(%host_id, error = %e, "execute_host_job: health check query failed, proceeding anyway");
// If we can't query health checks, proceed with the job rather than blocking.
},
}
// ── 2. Fetch the job's patch_selection ──────────────────────────────────
let patch_sel: JobPatchSelection =
match sqlx::query_as("SELECT patch_selection FROM patch_jobs WHERE id = $1")
@ -764,6 +849,7 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
COUNT(*) FILTER (WHERE status = 'succeeded') AS succeeded_count,
COUNT(*) FILTER (WHERE status = 'failed') AS failed_count,
COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled_count,
COUNT(*) FILTER (WHERE status = 'waiting_health_check') AS waiting_health_check_count,
COUNT(*) AS total_count
FROM patch_job_hosts
WHERE job_id = $1
@ -784,7 +870,7 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
let new_status: &str;
let set_completed: bool;
if counts.running_count > 0 || counts.pending_count > 0 || counts.queued_count > 0 {
if counts.running_count > 0 || counts.pending_count > 0 || counts.queued_count > 0 || counts.waiting_health_check_count > 0 {
// Still work in flight — keep parent running.
new_status = "running";
set_completed = false;
@ -912,17 +998,18 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
/// Find pending host entries whose back-off window has elapsed, reset them to
/// `queued`, and dispatch them immediately.
///
/// Also retries `waiting_health_check` entries whose retry window has elapsed.
pub async fn retry_pending_jobs(pool: PgPool, config: Arc<AppConfig>) {
let rows: Vec<PatchJobHostPending> = match sqlx::query_as(
r#"
SELECT pjh.id, pjh.host_id, pjh.job_id
FROM patch_job_hosts pjh
JOIN patch_jobs j ON j.id = pjh.job_id
WHERE pjh.status = 'pending'
WHERE pjh.status IN ('pending', 'waiting_health_check')
AND pjh.retry_next_at <= NOW()
AND j.status != 'cancelled'
"#,
)
AND j.status != 'cancelled'
"#,)
.fetch_all(&pool)
.await
{

View File

@ -6,6 +6,7 @@
mod agent_loader;
mod audit_verifier;
mod email;
mod health_check_poller;
mod health_poller;
mod job_executor;
mod maintenance_scheduler;
@ -19,6 +20,7 @@ use std::{sync::Arc, time::Duration};
use tokio::time;
use audit_verifier::run_audit_verifier;
use health_check_poller::run_health_check_poller;
use health_poller::run_health_poller;
use job_executor::run_job_executor;
use maintenance_scheduler::run_maintenance_scheduler;
@ -29,7 +31,7 @@ use ws_relay::run_ws_relay;
/// Minimum number of applied migrations the worker requires before
/// accepting work. Prevents the worker from running against a schema
/// that hasn't been migrated yet.
const REQUIRED_MIGRATION_COUNT: i64 = 5;
const REQUIRED_MIGRATION_COUNT: i64 = 8;
/// How long to wait between schema-version checks before giving up.
const SCHEMA_CHECK_TIMEOUT: Duration = Duration::from_secs(120);
@ -89,6 +91,9 @@ async fn main() -> anyhow::Result<()> {
// M11: audit integrity verification (runs every 24 hours)
let audit_verifier_handle = tokio::spawn(run_audit_verifier(pool.clone(), config.clone()));
// Health check poller — runs configured service/HTTP health checks
let health_check_handle = tokio::spawn(run_health_check_poller(pool.clone(), config.clone()));
tracing::info!("Worker tasks started");
// Wait for all tasks (they run indefinitely)