feat: health check configuration and worker engine (Phase 3+4)
Some checks failed
CI Pipeline / Rust Format Check (push) Failing after 4s
CI Pipeline / Clippy Lints (push) Successful in 46s
CI Pipeline / Rust Unit Tests (push) Successful in 1m1s
CI Pipeline / Security Audit (push) Successful in 4s
CI Pipeline / Frontend Lint & Type Check (push) Failing after 10s
CI Pipeline / Build .deb & Release (push) Has been skipped
Some checks failed
CI Pipeline / Rust Format Check (push) Failing after 4s
CI Pipeline / Clippy Lints (push) Successful in 46s
CI Pipeline / Rust Unit Tests (push) Successful in 1m1s
CI Pipeline / Security Audit (push) Successful in 4s
CI Pipeline / Frontend Lint & Type Check (push) Failing after 10s
CI Pipeline / Build .deb & Release (push) Has been skipped
- Added health_check_poller.rs: periodic service/HTTP health checks - Added pre-patch health gate in job_executor.rs - Added waiting_health_check job status (migration 008) - Added health_check_status to HostSummary and hosts API - Added health check types and API functions to frontend - Added health check UI section to HostDetailPage - Added health check status indicators to HostsPage and PatchDeploymentPage - Added serde default for health_check_poll_interval_secs - Fixed missing AgentClient import in health_check_poller.rs - Fixed missing ws_relay import in main.rs - Fixed missing closing paren in retry_pending_jobs SQL - Added ReadWritePaths for /etc/patch-manager/keys in systemd services
This commit is contained in:
@ -30,7 +30,7 @@ use crate::{
|
||||
error::AgentClientError,
|
||||
types::{
|
||||
AgentEnvelope, AgentJobStatus, ApplyPatchesRequest, ApplyPatchesResponse, HealthData,
|
||||
PackagesData, PatchesData, RollbackResponse, SystemInfoData,
|
||||
PackagesData, PatchesData, RollbackResponse, ServiceStatusData, SystemInfoData,
|
||||
},
|
||||
};
|
||||
|
||||
@ -221,10 +221,17 @@ impl AgentClient {
|
||||
.await
|
||||
}
|
||||
|
||||
/// `GET /api/v1/system/services/{name}` — check status of a specific service on the agent.
|
||||
#[instrument(skip(self), fields(base_url = %self.base_url, service_name = %service_name))]
|
||||
pub async fn service_status(&self, service_name: &str) -> Result<ServiceStatusData, AgentClientError> {
|
||||
self.get(&format!("system/services/{}", service_name), &[]).await
|
||||
}
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Private POST helper
|
||||
// --------------------------------------------------------
|
||||
|
||||
|
||||
/// Execute a POST request against `{base_url}/{path}`, serialize `body` as
|
||||
/// JSON, deserialize the [`AgentEnvelope`], and extract the `data` field —
|
||||
/// or propagate an [`AgentClientError::ApiError`].
|
||||
|
||||
@ -39,5 +39,5 @@ pub use error::AgentClientError;
|
||||
/// Response envelope and all data types.
|
||||
pub use types::{
|
||||
AgentEnvelope, AgentErrorBody, HealthData, Package, PackagesData, Patch, PatchesData,
|
||||
SystemInfoData,
|
||||
RollbackResponse, ServiceStatusData, SystemInfoData,
|
||||
};
|
||||
|
||||
@ -193,6 +193,23 @@ pub struct AgentJobStatus {
|
||||
pub completed_at: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// GET /api/v1/system/services/{name}
|
||||
// ============================================================
|
||||
|
||||
/// Payload returned by `GET /api/v1/system/services/{name}`.
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct ServiceStatusData {
|
||||
/// Service name.
|
||||
pub name: String,
|
||||
/// Service status string (e.g. `"running"`, `"stopped"`, `"failed"`).
|
||||
pub status: String,
|
||||
/// Whether the service is considered healthy.
|
||||
pub healthy: bool,
|
||||
/// Seconds elapsed since the service started (`null` if not running).
|
||||
pub uptime_secs: Option<u64>,
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// POST /api/v1/jobs/{id}/rollback
|
||||
// ============================================================
|
||||
|
||||
@ -22,3 +22,5 @@ config = { workspace = true }
|
||||
axum = { workspace = true }
|
||||
sha2 = { workspace = true }
|
||||
hex = { workspace = true }
|
||||
aes-gcm = { workspace = true }
|
||||
rand = { workspace = true }
|
||||
|
||||
@ -47,6 +47,9 @@ pub enum AuditAction {
|
||||
PatchJobCompleted,
|
||||
PatchJobFailed,
|
||||
MaintenanceWindowReminder,
|
||||
HealthCheckCreated,
|
||||
HealthCheckUpdated,
|
||||
HealthCheckDeleted,
|
||||
}
|
||||
|
||||
impl AuditAction {
|
||||
@ -80,6 +83,9 @@ impl AuditAction {
|
||||
Self::PatchJobCompleted => "patch_job_completed",
|
||||
Self::PatchJobFailed => "patch_job_failed",
|
||||
Self::MaintenanceWindowReminder => "maintenance_window_reminder",
|
||||
Self::HealthCheckCreated => "health_check_created",
|
||||
Self::HealthCheckUpdated => "health_check_updated",
|
||||
Self::HealthCheckDeleted => "health_check_deleted",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -39,6 +39,9 @@ pub struct WorkerConfig {
|
||||
pub health_poll_interval_secs: u64,
|
||||
/// Patch data poll interval in seconds (default: 1800 = 30 min)
|
||||
pub patch_poll_interval_secs: u64,
|
||||
/// Health check poll interval in seconds (default: 300 = 5 min)
|
||||
#[serde(default = "default_health_check_poll_interval")]
|
||||
pub health_check_poll_interval_secs: u64,
|
||||
/// Maximum concurrent agent calls
|
||||
pub max_concurrent_agent_calls: usize,
|
||||
/// Worker heartbeat interval in seconds
|
||||
@ -98,6 +101,8 @@ impl AppConfig {
|
||||
}
|
||||
}
|
||||
|
||||
fn default_health_check_poll_interval() -> u64 { 300 }
|
||||
|
||||
impl Default for AppConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
@ -115,6 +120,7 @@ impl Default for AppConfig {
|
||||
worker: WorkerConfig {
|
||||
health_poll_interval_secs: 300,
|
||||
patch_poll_interval_secs: 1800,
|
||||
health_check_poll_interval_secs: 300,
|
||||
max_concurrent_agent_calls: 64,
|
||||
heartbeat_interval_secs: 30,
|
||||
ws_relay_poll_interval_secs: 10,
|
||||
|
||||
80
crates/pm-core/src/crypto.rs
Normal file
80
crates/pm-core/src/crypto.rs
Normal file
@ -0,0 +1,80 @@
|
||||
//! AES-256-GCM encryption for sensitive health check credentials.
|
||||
//!
|
||||
//! Uses a per-install key stored at `/etc/patch-manager/keys/health-check.key`.
|
||||
|
||||
use aes_gcm::{
|
||||
aead::{Aead, KeyInit, OsRng},
|
||||
Aes256Gcm, Nonce,
|
||||
};
|
||||
use rand::RngCore;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
pub const KEY_PATH: &str = "/etc/patch-manager/keys/health-check.key";
|
||||
|
||||
/// Load or create the per-install encryption key.
|
||||
/// If the key file doesn't exist, generates a new 256-bit key and saves it.
|
||||
pub fn load_or_create_key(path: &Path) -> Result<[u8; 32], CryptoError> {
|
||||
if path.exists() {
|
||||
let key_bytes = fs::read(path).map_err(CryptoError::Io)?;
|
||||
if key_bytes.len() != 32 {
|
||||
return Err(CryptoError::InvalidKeyLength(key_bytes.len()));
|
||||
}
|
||||
let mut key = [0u8; 32];
|
||||
key.copy_from_slice(&key_bytes);
|
||||
Ok(key)
|
||||
} else {
|
||||
let mut key = [0u8; 32];
|
||||
OsRng.fill_bytes(&mut key);
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent).map_err(CryptoError::Io)?;
|
||||
}
|
||||
fs::write(path, &key).map_err(CryptoError::Io)?;
|
||||
// Set permissions to 0600 (owner read/write only)
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
fs::set_permissions(path, fs::Permissions::from_mode(0o600))
|
||||
.map_err(CryptoError::Io)?;
|
||||
}
|
||||
Ok(key)
|
||||
}
|
||||
}
|
||||
|
||||
/// Encrypt plaintext with AES-256-GCM. Returns (ciphertext, nonce).
|
||||
pub fn encrypt(plaintext: &str, key: &[u8; 32]) -> Result<(Vec<u8>, Vec<u8>), CryptoError> {
|
||||
let cipher = Aes256Gcm::new_from_slice(key).map_err(|e| CryptoError::KeyInit(e.to_string()))?;
|
||||
let mut nonce_bytes = [0u8; 12];
|
||||
OsRng.fill_bytes(&mut nonce_bytes);
|
||||
let nonce = Nonce::from_slice(&nonce_bytes);
|
||||
let ciphertext = cipher
|
||||
.encrypt(nonce, plaintext.as_bytes())
|
||||
.map_err(|_| CryptoError::EncryptionFailed)?;
|
||||
Ok((ciphertext, nonce_bytes.to_vec()))
|
||||
}
|
||||
|
||||
/// Decrypt AES-256-GCM ciphertext with the given nonce.
|
||||
pub fn decrypt(ciphertext: &[u8], nonce: &[u8], key: &[u8; 32]) -> Result<String, CryptoError> {
|
||||
let cipher = Aes256Gcm::new_from_slice(key).map_err(|e| CryptoError::KeyInit(e.to_string()))?;
|
||||
let nonce = Nonce::from_slice(nonce);
|
||||
let plaintext = cipher
|
||||
.decrypt(nonce, ciphertext)
|
||||
.map_err(|_| CryptoError::DecryptionFailed)?;
|
||||
String::from_utf8(plaintext).map_err(CryptoError::Utf8)
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CryptoError {
|
||||
#[error("IO error: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
#[error("Invalid key length: expected 32 bytes, got {0}")]
|
||||
InvalidKeyLength(usize),
|
||||
#[error("Key init error: {0}")]
|
||||
KeyInit(String),
|
||||
#[error("Encryption failed")]
|
||||
EncryptionFailed,
|
||||
#[error("Decryption failed")]
|
||||
DecryptionFailed,
|
||||
#[error("UTF-8 error: {0}")]
|
||||
Utf8(#[from] std::string::FromUtf8Error),
|
||||
}
|
||||
@ -1,5 +1,6 @@
|
||||
pub mod audit;
|
||||
pub mod config;
|
||||
pub mod crypto;
|
||||
pub mod db;
|
||||
pub mod error;
|
||||
pub mod logging;
|
||||
@ -8,11 +9,14 @@ pub mod request_id;
|
||||
|
||||
// Re-export commonly used types
|
||||
pub use config::AppConfig;
|
||||
pub use crypto::{CryptoError, KEY_PATH, decrypt, encrypt, load_or_create_key};
|
||||
pub use error::{AppError, ErrorResponse};
|
||||
pub use models::{
|
||||
AuthProvider, CreateGroupRequest, CreateHostRequest, CreateUserRequest, DiscoveryCidrRequest,
|
||||
DiscoveryResult, Group, Host, HostHealthStatus, HostSummary, RegisterDiscoveredRequest,
|
||||
UpdateGroupRequest, UpdateUserRequest, User, UserRole as DbUserRole,
|
||||
AuthProvider, CreateGroupRequest, CreateHealthCheckRequest, CreateHostRequest,
|
||||
CreateUserRequest, DiscoveryCidrRequest, DiscoveryResult, Group, HealthCheck,
|
||||
HealthCheckResult, HealthCheckWithResult, Host, HostHealthStatus, HostSummary,
|
||||
RegisterDiscoveredRequest, UpdateGroupRequest, UpdateHealthCheckRequest, UpdateUserRequest,
|
||||
User, UserRole as DbUserRole,
|
||||
};
|
||||
|
||||
// Re-export audit integrity types
|
||||
|
||||
@ -113,9 +113,79 @@ pub struct HostSummary {
|
||||
pub health_status: HostHealthStatus,
|
||||
pub agent_version: Option<String>,
|
||||
pub patches_missing: i32,
|
||||
pub health_check_status: Option<String>,
|
||||
pub registered_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Health Checks
|
||||
// ============================================================
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||
pub struct HealthCheck {
|
||||
pub id: Uuid,
|
||||
pub host_id: Uuid,
|
||||
pub name: String,
|
||||
pub check_type: String, // "service" or "http"
|
||||
pub enabled: bool,
|
||||
// Service check fields
|
||||
pub service_name: Option<String>,
|
||||
// HTTP check fields
|
||||
pub url: Option<String>,
|
||||
pub expected_body: Option<String>,
|
||||
pub ignore_cert_errors: bool,
|
||||
pub basic_auth_user: Option<String>,
|
||||
// basic_auth_pass_encrypted and nonce NOT exposed in API responses
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct HealthCheckWithResult {
|
||||
#[serde(flatten)]
|
||||
pub check: HealthCheck,
|
||||
pub last_result: Option<HealthCheckResult>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, FromRow)]
|
||||
pub struct HealthCheckResult {
|
||||
pub id: Uuid,
|
||||
pub check_id: Uuid,
|
||||
pub healthy: bool,
|
||||
pub detail: Option<String>,
|
||||
pub latency_ms: Option<i32>,
|
||||
pub checked_at: DateTime<Utc>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct CreateHealthCheckRequest {
|
||||
pub name: String,
|
||||
pub check_type: String, // "service" or "http"
|
||||
pub service_name: Option<String>,
|
||||
pub url: Option<String>,
|
||||
pub expected_body: Option<String>,
|
||||
#[serde(default = "default_true")]
|
||||
pub ignore_cert_errors: bool,
|
||||
pub basic_auth_user: Option<String>,
|
||||
pub basic_auth_pass: Option<String>, // plaintext in request, encrypted before storage
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct UpdateHealthCheckRequest {
|
||||
pub name: Option<String>,
|
||||
pub enabled: Option<bool>,
|
||||
pub service_name: Option<String>,
|
||||
pub url: Option<String>,
|
||||
pub expected_body: Option<String>,
|
||||
pub ignore_cert_errors: Option<bool>,
|
||||
pub basic_auth_user: Option<String>,
|
||||
pub basic_auth_pass: Option<String>, // if provided, re-encrypt
|
||||
}
|
||||
|
||||
fn default_true() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Group
|
||||
// ============================================================
|
||||
|
||||
@ -189,6 +189,7 @@ pub fn build_router(state: AppState) -> Router {
|
||||
.merge(routes::ws::ticket_router())
|
||||
// Reports
|
||||
.nest("/reports", routes::reports::router())
|
||||
.nest("/hosts/{host_id}/health-checks", routes::health_checks::router())
|
||||
// Settings (admin-only)
|
||||
.nest("/settings", routes::settings::router())
|
||||
// Apply auth middleware to all the above
|
||||
|
||||
1042
crates/pm-web/src/routes/health_checks.rs
Normal file
1042
crates/pm-web/src/routes/health_checks.rs
Normal file
File diff suppressed because it is too large
Load Diff
@ -112,6 +112,7 @@ async fn list_hosts(
|
||||
SELECT h.id, h.fqdn, host(h.ip_address)::text AS ip_address, h.display_name,
|
||||
h.os_family, h.os_name, h.health_status, h.agent_version,
|
||||
COALESCE(hpd.patch_count, 0) AS patches_missing,
|
||||
" + hc_subquery + ",
|
||||
h.registered_at
|
||||
FROM hosts h
|
||||
LEFT JOIN host_patch_data hpd ON hpd.host_id = h.id
|
||||
@ -130,6 +131,7 @@ async fn list_hosts(
|
||||
h.display_name, h.os_family, h.os_name,
|
||||
h.health_status, h.agent_version,
|
||||
COALESCE(hpd.patch_count, 0) AS patches_missing,
|
||||
" + hc_subquery + ",
|
||||
h.registered_at
|
||||
FROM hosts h
|
||||
LEFT JOIN host_patch_data hpd ON hpd.host_id = h.id
|
||||
|
||||
@ -11,5 +11,6 @@ pub mod settings;
|
||||
pub mod status;
|
||||
pub mod users;
|
||||
pub mod ws;
|
||||
pub mod health_checks;
|
||||
|
||||
pub mod reports;
|
||||
|
||||
@ -28,3 +28,4 @@ tokio-rustls = { version = "0.26" }
|
||||
rustls-pemfile = { version = "2" }
|
||||
tokio-tungstenite = { version = "0.26", features = ["rustls-tls-webpki-roots"] }
|
||||
lettre = { version = "0.11", default-features = false, features = ["tokio1-rustls-tls", "smtp-transport", "builder"] }
|
||||
reqwest = { workspace = true }
|
||||
|
||||
471
crates/pm-worker/src/health_check_poller.rs
Normal file
471
crates/pm-worker/src/health_check_poller.rs
Normal file
@ -0,0 +1,471 @@
|
||||
//! Periodic health check poller for configured service and HTTP checks.
|
||||
//!
|
||||
//! Polls every `health_check_poll_interval_secs`, querying each enabled health
|
||||
//! check definition and storing results in `host_health_check_results`.
|
||||
//! Results older than 4 days are pruned on each cycle.
|
||||
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use pm_core::{config::AppConfig, crypto};
|
||||
use sqlx::{FromRow, PgPool};
|
||||
use tokio::{sync::Semaphore, time};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::agent_loader::load_agent_certs;
|
||||
use pm_agent_client::{AgentClient, AgentClientError};
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// DB row types
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Row fetched for each enabled health check, joined with host connection info.
|
||||
#[derive(Debug, FromRow)]
|
||||
struct HealthCheckRow {
|
||||
id: Uuid,
|
||||
host_id: Uuid,
|
||||
name: String,
|
||||
check_type: String,
|
||||
service_name: Option<String>,
|
||||
url: Option<String>,
|
||||
expected_body: Option<String>,
|
||||
ignore_cert_errors: Option<bool>,
|
||||
basic_auth_user: Option<String>,
|
||||
basic_auth_pass_encrypted: Option<Vec<u8>>,
|
||||
basic_auth_pass_nonce: Option<Vec<u8>>,
|
||||
ip_address: String,
|
||||
agent_port: i32,
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Public entry point
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Run the health check poller loop indefinitely.
|
||||
///
|
||||
/// On each tick all enabled health checks are queried concurrently (up to
|
||||
/// `max_concurrent_agent_calls` in-flight at once). Results are persisted
|
||||
/// to `host_health_check_results` and stale rows are pruned.
|
||||
pub async fn run_health_check_poller(pool: PgPool, config: Arc<AppConfig>) {
|
||||
let interval_secs = config.worker.health_check_poll_interval_secs;
|
||||
let mut ticker = time::interval(std::time::Duration::from_secs(interval_secs));
|
||||
|
||||
tracing::info!(interval_secs, "Health check poller started");
|
||||
|
||||
loop {
|
||||
ticker.tick().await;
|
||||
|
||||
// Load certs on each cycle so cert rotation is picked up automatically.
|
||||
let certs = match load_agent_certs(&config.security) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
tracing::error!(
|
||||
error = %e,
|
||||
"Health check poller: failed to load agent certs — skipping cycle"
|
||||
);
|
||||
continue;
|
||||
},
|
||||
};
|
||||
|
||||
let client_cert = Arc::new(certs.client_cert);
|
||||
let client_key = Arc::new(certs.client_key);
|
||||
let ca_cert = Arc::new(certs.ca_cert);
|
||||
|
||||
// Load the crypto key for decrypting HTTP check passwords.
|
||||
let crypto_key = match crypto::load_or_create_key(Path::new(crypto::KEY_PATH)) {
|
||||
Ok(k) => Arc::new(k),
|
||||
Err(e) => {
|
||||
tracing::error!(
|
||||
error = %e,
|
||||
"Health check poller: failed to load crypto key — skipping cycle"
|
||||
);
|
||||
continue;
|
||||
},
|
||||
};
|
||||
|
||||
// Fetch all enabled health checks with host connection info.
|
||||
let checks: Vec<HealthCheckRow> = match sqlx::query_as(
|
||||
r#"
|
||||
SELECT
|
||||
hc.id,
|
||||
hc.host_id,
|
||||
hc.name,
|
||||
hc.check_type,
|
||||
hc.service_name,
|
||||
hc.url,
|
||||
hc.expected_body,
|
||||
hc.ignore_cert_errors,
|
||||
hc.basic_auth_user,
|
||||
hc.basic_auth_pass_encrypted,
|
||||
hc.basic_auth_pass_nonce,
|
||||
host(h.ip_address)::text AS ip_address,
|
||||
h.agent_port
|
||||
FROM host_health_checks hc
|
||||
JOIN hosts h ON h.id = hc.host_id
|
||||
WHERE hc.enabled = TRUE
|
||||
ORDER BY hc.id
|
||||
"#,
|
||||
)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
{
|
||||
Ok(rows) => rows,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "Health check poller: failed to fetch health checks");
|
||||
continue;
|
||||
},
|
||||
};
|
||||
|
||||
if checks.is_empty() {
|
||||
tracing::debug!("Health check poller: no enabled health checks, skipping cycle");
|
||||
prune_old_results(&pool).await;
|
||||
continue;
|
||||
}
|
||||
|
||||
let total = checks.len();
|
||||
let semaphore = Arc::new(Semaphore::new(config.worker.max_concurrent_agent_calls));
|
||||
|
||||
let mut handles = Vec::with_capacity(total);
|
||||
|
||||
for check in checks {
|
||||
let pool = pool.clone();
|
||||
let sem = semaphore.clone();
|
||||
let cert = client_cert.clone();
|
||||
let key = client_key.clone();
|
||||
let ca = ca_cert.clone();
|
||||
let ckey = crypto_key.clone();
|
||||
|
||||
let handle = tokio::spawn(async move {
|
||||
let _permit = sem.acquire().await.expect("semaphore closed");
|
||||
run_check(pool, check, &cert, &key, &ca, &ckey).await
|
||||
});
|
||||
|
||||
handles.push(handle);
|
||||
}
|
||||
|
||||
// Collect results and tally counts.
|
||||
let mut healthy_count = 0usize;
|
||||
let mut unhealthy_count = 0usize;
|
||||
let mut error_count = 0usize;
|
||||
|
||||
for handle in handles {
|
||||
match handle.await {
|
||||
Ok(true) => healthy_count += 1,
|
||||
Ok(false) => unhealthy_count += 1,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "Health check poller task panicked");
|
||||
error_count += 1;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
total,
|
||||
healthy_count,
|
||||
unhealthy_count,
|
||||
error_count,
|
||||
"Health check poll cycle complete"
|
||||
);
|
||||
|
||||
// Prune results older than 4 days.
|
||||
prune_old_results(&pool).await;
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Check dispatch
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Run a single health check and persist the result. Returns `true` if healthy.
|
||||
async fn run_check(
|
||||
pool: PgPool,
|
||||
check: HealthCheckRow,
|
||||
client_cert: &[u8],
|
||||
client_key: &[u8],
|
||||
ca_cert: &[u8],
|
||||
crypto_key: &[u8; 32],
|
||||
) -> bool {
|
||||
let start = Instant::now();
|
||||
|
||||
let (healthy, detail) = match check.check_type.as_str() {
|
||||
"service" => run_service_check(&check, client_cert, client_key, ca_cert).await,
|
||||
"http" => run_http_check(&check, crypto_key).await,
|
||||
other => {
|
||||
tracing::warn!(
|
||||
check_id = %check.id,
|
||||
check_type = other,
|
||||
"Unknown health check type — treating as unhealthy"
|
||||
);
|
||||
(false, format!("Unknown check type: {other}"))
|
||||
},
|
||||
};
|
||||
|
||||
let latency_ms = start.elapsed().as_millis() as i32;
|
||||
|
||||
// Persist the result.
|
||||
if let Err(e) = sqlx::query(
|
||||
r#"
|
||||
INSERT INTO host_health_check_results (check_id, healthy, detail, latency_ms)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
"#,
|
||||
)
|
||||
.bind(check.id)
|
||||
.bind(healthy)
|
||||
.bind(&detail)
|
||||
.bind(latency_ms)
|
||||
.execute(&pool)
|
||||
.await
|
||||
{
|
||||
tracing::error!(
|
||||
check_id = %check.id,
|
||||
error = %e,
|
||||
"Health check poller: failed to insert result"
|
||||
);
|
||||
}
|
||||
|
||||
healthy
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Service check (via mTLS AgentClient)
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Execute a service check by calling the agent's `/api/v1/system/services/{name}` endpoint.
|
||||
async fn run_service_check(
|
||||
check: &HealthCheckRow,
|
||||
client_cert: &[u8],
|
||||
client_key: &[u8],
|
||||
ca_cert: &[u8],
|
||||
) -> (bool, String) {
|
||||
let service_name = match &check.service_name {
|
||||
Some(name) => name.clone(),
|
||||
None => {
|
||||
return (false, "Service check missing service_name".to_string());
|
||||
},
|
||||
};
|
||||
|
||||
let client = match AgentClient::new(
|
||||
&check.ip_address,
|
||||
check.agent_port as u16,
|
||||
client_cert,
|
||||
client_key,
|
||||
ca_cert,
|
||||
) {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
return (false, format!("Failed to build AgentClient: {e}"));
|
||||
},
|
||||
};
|
||||
|
||||
match client.service_status(&service_name).await {
|
||||
Ok(data) => {
|
||||
let detail = if data.healthy {
|
||||
format!(
|
||||
"Service '{}' is {} (uptime: {}s)",
|
||||
data.name,
|
||||
data.status,
|
||||
data.uptime_secs.map_or("N/A".to_string(), |s| s.to_string())
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"Service '{}' status: {} (unhealthy)",
|
||||
data.name, data.status
|
||||
)
|
||||
};
|
||||
(data.healthy, detail)
|
||||
},
|
||||
Err(AgentClientError::Timeout) => {
|
||||
(false, format!("Agent timed out querying service '{service_name}'"))
|
||||
},
|
||||
Err(AgentClientError::Connect(_)) => {
|
||||
(false, format!("Agent connection refused for service '{service_name}'"))
|
||||
},
|
||||
Err(AgentClientError::ApiError { code, message }) => {
|
||||
// 404, 400, 500 etc. from the agent means the service is unhealthy.
|
||||
(false, format!("Agent error [{code}]: {message}"))
|
||||
},
|
||||
Err(e) => {
|
||||
(false, format!("Agent error querying service '{service_name}': {e}"))
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// HTTP check (via reqwest, no mTLS)
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Execute an HTTP check by making a GET request to the configured URL.
|
||||
/// Supports optional basic auth (decrypted from DB) and substring body matching.
|
||||
async fn run_http_check(
|
||||
check: &HealthCheckRow,
|
||||
crypto_key: &[u8; 32],
|
||||
) -> (bool, String) {
|
||||
let url = match &check.url {
|
||||
Some(u) => u.clone(),
|
||||
None => {
|
||||
return (false, "HTTP check missing URL".to_string());
|
||||
},
|
||||
};
|
||||
|
||||
// Build a reqwest client for this check.
|
||||
// Use danger_accept_invalid_certs if ignore_cert_errors is set (default true).
|
||||
let ignore_cert_errors = check.ignore_cert_errors.unwrap_or(true);
|
||||
|
||||
let client_builder = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(10))
|
||||
.redirect(reqwest::redirect::Policy::limited(5));
|
||||
|
||||
let client = if ignore_cert_errors {
|
||||
client_builder
|
||||
.danger_accept_invalid_certs(true)
|
||||
.build()
|
||||
.unwrap_or_else(|_| reqwest::Client::new())
|
||||
} else {
|
||||
client_builder.build().unwrap_or_else(|_| reqwest::Client::new())
|
||||
};
|
||||
|
||||
// Build the request.
|
||||
let mut request = client.get(&url);
|
||||
|
||||
// Add basic auth if configured.
|
||||
if let Some(user) = &check.basic_auth_user {
|
||||
// Decrypt the password if present.
|
||||
let password = match (&check.basic_auth_pass_encrypted, &check.basic_auth_pass_nonce) {
|
||||
(Some(enc), Some(nonce)) => {
|
||||
match crypto::decrypt(enc, nonce, crypto_key) {
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
return (
|
||||
false,
|
||||
format!("Failed to decrypt basic auth password: {e}"),
|
||||
);
|
||||
},
|
||||
}
|
||||
},
|
||||
_ => {
|
||||
// No encrypted password stored — treat as missing credentials.
|
||||
return (false, "HTTP check has basic_auth_user but no encrypted password".to_string());
|
||||
},
|
||||
};
|
||||
request = request.basic_auth(user.as_str(), Some(password.as_str()));
|
||||
}
|
||||
|
||||
// Execute the request.
|
||||
let response = match request.send().await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
if e.is_timeout() {
|
||||
return (false, format!("HTTP check timed out: {url}"));
|
||||
} else if e.is_connect() {
|
||||
return (false, format!("HTTP check connection failed: {url}"));
|
||||
} else {
|
||||
return (false, format!("HTTP check request error: {e}"));
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
let status = response.status();
|
||||
|
||||
// Check HTTP status code.
|
||||
if !status.is_success() {
|
||||
return (
|
||||
false,
|
||||
format!("HTTP check returned status {} for {url}", status.as_u16()),
|
||||
);
|
||||
}
|
||||
|
||||
// Read the response body for substring matching.
|
||||
let body = match response.text().await {
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
return (false, format!("HTTP check failed to read response body: {e}"));
|
||||
},
|
||||
};
|
||||
|
||||
// Check expected_body substring match.
|
||||
if let Some(expected) = &check.expected_body {
|
||||
if !body.contains(expected) {
|
||||
return (
|
||||
false,
|
||||
format!(
|
||||
"HTTP check body mismatch for {url}: expected substring not found"
|
||||
),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
(true, format!("HTTP check OK for {url} (status {})", status.as_u16()))
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Prune old results
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Delete health check results older than 4 days.
|
||||
async fn prune_old_results(pool: &PgPool) {
|
||||
match sqlx::query(
|
||||
"DELETE FROM host_health_check_results WHERE checked_at < NOW() - INTERVAL '4 days'",
|
||||
)
|
||||
.execute(pool)
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
if result.rows_affected() > 0 {
|
||||
tracing::info!(
|
||||
rows_deleted = result.rows_affected(),
|
||||
"Health check poller: pruned old results"
|
||||
);
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "Health check poller: failed to prune old results");
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Health check gate for job executor
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Check whether all enabled health checks for a host are healthy.
|
||||
///
|
||||
/// Returns `Ok(true)` if all checks pass (or no checks are configured),
|
||||
/// `Ok(false)` if any check is unhealthy or has no result yet.
|
||||
pub async fn check_host_health_checks(pool: &PgPool, host_id: Uuid) -> anyhow::Result<bool> {
|
||||
// Check if there are any enabled health checks for this host.
|
||||
let check_count: (i64,) = sqlx::query_as(
|
||||
"SELECT COUNT(*) FROM host_health_checks WHERE host_id = $1 AND enabled = TRUE",
|
||||
)
|
||||
.bind(host_id)
|
||||
.fetch_one(pool)
|
||||
.await?;
|
||||
|
||||
if check_count.0 == 0 {
|
||||
// No health checks configured for this host — treat as healthy.
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
// Find any enabled check that has no healthy result or an unhealthy latest result.
|
||||
let unhealthy_count: (i64,) = sqlx::query_as(
|
||||
r#"
|
||||
SELECT COUNT(*)
|
||||
FROM host_health_checks hc
|
||||
LEFT JOIN LATERAL (
|
||||
SELECT healthy
|
||||
FROM host_health_check_results r
|
||||
WHERE r.check_id = hc.id
|
||||
ORDER BY r.checked_at DESC
|
||||
LIMIT 1
|
||||
) latest ON true
|
||||
WHERE hc.host_id = $1
|
||||
AND hc.enabled = TRUE
|
||||
AND (latest.healthy IS NULL OR latest.healthy = FALSE)
|
||||
"#,
|
||||
)
|
||||
.bind(host_id)
|
||||
.fetch_one(pool)
|
||||
.await?;
|
||||
|
||||
Ok(unhealthy_count.0 == 0)
|
||||
}
|
||||
@ -24,6 +24,7 @@ use uuid::Uuid;
|
||||
|
||||
use crate::agent_loader::load_agent_certs;
|
||||
use crate::email;
|
||||
use crate::health_check_poller::check_host_health_checks;
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Internal DB row types
|
||||
@ -78,6 +79,7 @@ struct StatusCounts {
|
||||
succeeded_count: i64,
|
||||
failed_count: i64,
|
||||
cancelled_count: i64,
|
||||
waiting_health_check_count: i64,
|
||||
total_count: i64,
|
||||
}
|
||||
|
||||
@ -369,6 +371,89 @@ async fn execute_host_job(
|
||||
},
|
||||
};
|
||||
|
||||
// ── 1b. Health check gate ──────────────────────────────────────────────
|
||||
// All enabled health checks for this host must be healthy before we proceed.
|
||||
match check_host_health_checks(&pool, host_id).await {
|
||||
Ok(true) => {
|
||||
tracing::debug!(%host_id, "execute_host_job: health checks passed");
|
||||
},
|
||||
Ok(false) => {
|
||||
tracing::info!(%host_id, %pjh_id, "execute_host_job: health checks not passed, setting waiting_health_check");
|
||||
// Check if the maintenance window is still open for this host.
|
||||
let window_open: bool = sqlx::query_scalar(
|
||||
r#"
|
||||
SELECT EXISTS(
|
||||
SELECT 1 FROM maintenance_windows mw
|
||||
WHERE mw.host_id = $1
|
||||
AND mw.enabled = TRUE
|
||||
AND (
|
||||
(mw.recurrence = 'once'
|
||||
AND mw.start_at <= NOW()
|
||||
AND NOW() < mw.start_at + (mw.duration_minutes * INTERVAL '1 minute'))
|
||||
OR
|
||||
(mw.recurrence = 'daily'
|
||||
AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
|
||||
AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
|
||||
+ (mw.duration_minutes * INTERVAL '1 minute')))
|
||||
OR
|
||||
(mw.recurrence = 'weekly'
|
||||
AND EXTRACT(DOW FROM NOW() AT TIME ZONE 'UTC') = mw.recurrence_day
|
||||
AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
|
||||
AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
|
||||
+ (mw.duration_minutes * INTERVAL '1 minute')))
|
||||
OR
|
||||
(mw.recurrence = 'monthly'
|
||||
AND EXTRACT(DAY FROM NOW() AT TIME ZONE 'UTC') = mw.recurrence_day
|
||||
AND (NOW() AT TIME ZONE 'UTC')::time >= (mw.start_at AT TIME ZONE 'UTC')::time
|
||||
AND (NOW() AT TIME ZONE 'UTC')::time < ((mw.start_at AT TIME ZONE 'UTC')::time
|
||||
+ (mw.duration_minutes * INTERVAL '1 minute')))
|
||||
)
|
||||
)
|
||||
"#,
|
||||
)
|
||||
.bind(host_id)
|
||||
.fetch_optional(&pool)
|
||||
.await
|
||||
.unwrap_or(Some(true))
|
||||
.unwrap_or(true); // Default to true if no window configured
|
||||
|
||||
if !window_open {
|
||||
tracing::warn!(%host_id, %pjh_id, "execute_host_job: health checks not passed and maintenance window closed");
|
||||
handle_host_failure(
|
||||
pool,
|
||||
pjh_id,
|
||||
"Health checks did not pass before maintenance window closed".to_string(),
|
||||
)
|
||||
.await;
|
||||
return;
|
||||
}
|
||||
|
||||
// Set status to waiting_health_check and retry in 5 minutes.
|
||||
let retry_at = Utc::now() + ChronoDuration::minutes(5);
|
||||
if let Err(e) = sqlx::query(
|
||||
r#"
|
||||
UPDATE patch_job_hosts
|
||||
SET status = 'waiting_health_check',
|
||||
retry_next_at = $2,
|
||||
last_error = 'Waiting for health checks to pass'
|
||||
WHERE id = $1
|
||||
"#,
|
||||
)
|
||||
.bind(pjh_id)
|
||||
.bind(retry_at)
|
||||
.execute(&pool)
|
||||
.await
|
||||
{
|
||||
tracing::error!(%pjh_id, error = %e, "execute_host_job: failed to set waiting_health_check status");
|
||||
}
|
||||
return;
|
||||
},
|
||||
Err(e) => {
|
||||
tracing::warn!(%host_id, error = %e, "execute_host_job: health check query failed, proceeding anyway");
|
||||
// If we can't query health checks, proceed with the job rather than blocking.
|
||||
},
|
||||
}
|
||||
|
||||
// ── 2. Fetch the job's patch_selection ──────────────────────────────────
|
||||
let patch_sel: JobPatchSelection =
|
||||
match sqlx::query_as("SELECT patch_selection FROM patch_jobs WHERE id = $1")
|
||||
@ -764,6 +849,7 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
|
||||
COUNT(*) FILTER (WHERE status = 'succeeded') AS succeeded_count,
|
||||
COUNT(*) FILTER (WHERE status = 'failed') AS failed_count,
|
||||
COUNT(*) FILTER (WHERE status = 'cancelled') AS cancelled_count,
|
||||
COUNT(*) FILTER (WHERE status = 'waiting_health_check') AS waiting_health_check_count,
|
||||
COUNT(*) AS total_count
|
||||
FROM patch_job_hosts
|
||||
WHERE job_id = $1
|
||||
@ -784,7 +870,7 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
|
||||
let new_status: &str;
|
||||
let set_completed: bool;
|
||||
|
||||
if counts.running_count > 0 || counts.pending_count > 0 || counts.queued_count > 0 {
|
||||
if counts.running_count > 0 || counts.pending_count > 0 || counts.queued_count > 0 || counts.waiting_health_check_count > 0 {
|
||||
// Still work in flight — keep parent running.
|
||||
new_status = "running";
|
||||
set_completed = false;
|
||||
@ -912,17 +998,18 @@ async fn sync_job_status(pool: &PgPool, job_id: Uuid) {
|
||||
|
||||
/// Find pending host entries whose back-off window has elapsed, reset them to
|
||||
/// `queued`, and dispatch them immediately.
|
||||
///
|
||||
/// Also retries `waiting_health_check` entries whose retry window has elapsed.
|
||||
pub async fn retry_pending_jobs(pool: PgPool, config: Arc<AppConfig>) {
|
||||
let rows: Vec<PatchJobHostPending> = match sqlx::query_as(
|
||||
r#"
|
||||
SELECT pjh.id, pjh.host_id, pjh.job_id
|
||||
FROM patch_job_hosts pjh
|
||||
JOIN patch_jobs j ON j.id = pjh.job_id
|
||||
WHERE pjh.status = 'pending'
|
||||
WHERE pjh.status IN ('pending', 'waiting_health_check')
|
||||
AND pjh.retry_next_at <= NOW()
|
||||
AND j.status != 'cancelled'
|
||||
"#,
|
||||
)
|
||||
AND j.status != 'cancelled'
|
||||
"#,)
|
||||
.fetch_all(&pool)
|
||||
.await
|
||||
{
|
||||
|
||||
@ -6,6 +6,7 @@
|
||||
mod agent_loader;
|
||||
mod audit_verifier;
|
||||
mod email;
|
||||
mod health_check_poller;
|
||||
mod health_poller;
|
||||
mod job_executor;
|
||||
mod maintenance_scheduler;
|
||||
@ -19,6 +20,7 @@ use std::{sync::Arc, time::Duration};
|
||||
use tokio::time;
|
||||
|
||||
use audit_verifier::run_audit_verifier;
|
||||
use health_check_poller::run_health_check_poller;
|
||||
use health_poller::run_health_poller;
|
||||
use job_executor::run_job_executor;
|
||||
use maintenance_scheduler::run_maintenance_scheduler;
|
||||
@ -29,7 +31,7 @@ use ws_relay::run_ws_relay;
|
||||
/// Minimum number of applied migrations the worker requires before
|
||||
/// accepting work. Prevents the worker from running against a schema
|
||||
/// that hasn't been migrated yet.
|
||||
const REQUIRED_MIGRATION_COUNT: i64 = 5;
|
||||
const REQUIRED_MIGRATION_COUNT: i64 = 8;
|
||||
|
||||
/// How long to wait between schema-version checks before giving up.
|
||||
const SCHEMA_CHECK_TIMEOUT: Duration = Duration::from_secs(120);
|
||||
@ -89,6 +91,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
// M11: audit integrity verification (runs every 24 hours)
|
||||
let audit_verifier_handle = tokio::spawn(run_audit_verifier(pool.clone(), config.clone()));
|
||||
|
||||
// Health check poller — runs configured service/HTTP health checks
|
||||
let health_check_handle = tokio::spawn(run_health_check_poller(pool.clone(), config.clone()));
|
||||
|
||||
tracing::info!("Worker tasks started");
|
||||
|
||||
// Wait for all tasks (they run indefinitely)
|
||||
|
||||
Reference in New Issue
Block a user