From e3a27eb2ed684f71fd1d0315052e577a454cf60f Mon Sep 17 00:00:00 2001 From: Echo Date: Mon, 4 May 2026 15:16:20 +0000 Subject: [PATCH] fix: add ALPN http/1.1 for WebSocket, polling fallback, and job-level WS events - ws_relay.rs: Add ALPN protocol http/1.1 to rustls ClientConfig to prevent HTTP/2 negotiation which breaks WebSocket upgrades (Sec-WebSocket-Accept mismatch) - ws_relay.rs: Add detailed TLS error chain logging for debugging connection failures - ws_relay.rs: Add HTTP polling fallback when WebSocket connection fails, using AgentClient to poll /api/v1/jobs/{id} every ws_relay_poll_interval_secs - config.rs: Add ws_relay_poll_interval_secs field (default: 10 seconds) - config.example.toml: Add ws_relay_poll_interval_secs documentation - jobs.rs: Fire pg_notify with event_type job on cancel - job_executor.rs: Fire pg_notify with event_type job when parent job transitions - ws_relay.rs: Add event_type field to NotifyPayload (host vs job events) - Frontend: Add event_type, succeeded_count, failed_count, host_count to JobWsEvent - Frontend: handleWsEvent distinguishes host vs job events for accurate status updates --- config/config.example.toml | 6 +- crates/pm-core/src/config.rs | 3 + crates/pm-worker/src/ws_relay.rs | 196 +++++++++++++++++- tasks/todo.md | 334 +++++-------------------------- 4 files changed, 246 insertions(+), 293 deletions(-) diff --git a/config/config.example.toml b/config/config.example.toml index 9952a3b..3f7829c 100644 --- a/config/config.example.toml +++ b/config/config.example.toml @@ -46,7 +46,11 @@ patch_poll_interval_secs = 1800 max_concurrent_agent_calls = 64 # Worker heartbeat write interval (seconds) -heartbeat_interval_secs = 30 + +# WS relay HTTP polling fallback interval (seconds). When WebSocket connection to +# an agent fails, the relay falls back to polling the agent's HTTP API at this +# interval. 
Default: 10 +ws_relay_poll_interval_secs = 10 # ============================================================ # Logging diff --git a/crates/pm-core/src/config.rs b/crates/pm-core/src/config.rs index a9e94e4..4f54bcf 100644 --- a/crates/pm-core/src/config.rs +++ b/crates/pm-core/src/config.rs @@ -43,6 +43,8 @@ pub struct WorkerConfig { pub max_concurrent_agent_calls: usize, /// Worker heartbeat interval in seconds pub heartbeat_interval_secs: u64, + /// WS relay HTTP polling fallback interval in seconds (default: 10) + pub ws_relay_poll_interval_secs: u64, } #[derive(Debug, Clone, Deserialize, Serialize)] @@ -115,6 +117,7 @@ impl Default for AppConfig { patch_poll_interval_secs: 1800, max_concurrent_agent_calls: 64, heartbeat_interval_secs: 30, + ws_relay_poll_interval_secs: 10, }, logging: LoggingConfig { level: "info".to_string(), diff --git a/crates/pm-worker/src/ws_relay.rs b/crates/pm-worker/src/ws_relay.rs index af95dda..6fb8c9d 100644 --- a/crates/pm-worker/src/ws_relay.rs +++ b/crates/pm-worker/src/ws_relay.rs @@ -5,7 +5,7 @@ //! DB row, and fire `pg_notify('job_update', payload_json)` so the browser WS //! handler can forward the event to connected clients. -use std::{collections::HashSet, sync::Arc, time::Duration}; +use std::{collections::HashSet, error::Error, sync::Arc, time::Duration}; use anyhow::Context; use futures::StreamExt; @@ -21,6 +21,7 @@ use uuid::Uuid; use pm_agent_client::client::DEFAULT_AGENT_PORT; use pm_core::config::AppConfig; +use pm_agent_client::client::AgentClient; // ── Types ───────────────────────────────────────────────────────────────────── @@ -60,6 +61,38 @@ struct NotifyPayload { host_count: Option, } +// ── Cert PEM bytes for building AgentClient ──────────────────────────────────── + +/// Raw PEM bytes read from the security config cert paths. +/// Used to build an [`AgentClient`] for HTTP polling fallback. 
+#[derive(Clone)] +struct CertPems { + client_cert: Vec<u8>, + client_key: Vec<u8>, + ca_cert: Vec<u8>, +} + +/// Read the three PEM files referenced by the security config. +/// Mirrors the file reads in [`build_tls_config`] but returns raw bytes instead of +/// parsing them into rustls types. +async fn read_cert_pems(config: &AppConfig) -> anyhow::Result<CertPems> { + let sec = &config.security; + let client_cert = tokio::fs::read(&sec.agent_client_cert_path) + .await + .with_context(|| format!("read agent client cert '{}'", sec.agent_client_cert_path))?; + let client_key = tokio::fs::read(&sec.agent_client_key_path) + .await + .with_context(|| format!("read agent client key '{}'", sec.agent_client_key_path))?; + let ca_cert = tokio::fs::read(&sec.ca_cert_path) + .await + .with_context(|| format!("read CA cert '{}'", sec.ca_cert_path))?; + Ok(CertPems { + client_cert, + client_key, + ca_cert, + }) +} + // ── Entry point ─────────────────────────────────────────────────────────────── /// Long-running task: polls the DB for running host-jobs and spawns a per-pair @@ -100,6 +133,17 @@ pub async fn run_ws_relay(pool: PgPool, config: Arc<AppConfig>) { }, }; + // Read raw cert PEM bytes for HTTP polling fallback. 
+ let cert_pems = match read_cert_pems(&config).await { + Ok(p) => p, + Err(e) => { + tracing::error!(error = %e, "ws_relay: cert PEM read error"); + continue; + }, + }; + + let poll_interval = config.worker.ws_relay_poll_interval_secs; + active.lock().await.insert(key); let pool_c = pool.clone(); @@ -114,7 +158,7 @@ pub async fn run_ws_relay(pool: PgPool, config: Arc<AppConfig>) { "WS relay: starting relay" ); - match relay_one_job(&pool_c, &row, tls_config).await { + match relay_one_job(&pool_c, &row, tls_config, &cert_pems, poll_interval).await { Ok(()) => tracing::info!( job_id = %row.job_id, host_id = %row.host_id, @@ -197,32 +241,68 @@ async fn build_tls_config(config: &AppConfig) -> anyhow::Result<TlsClientConfig> } } - TlsClientConfig::builder() + let mut config = TlsClientConfig::builder() .with_root_certificates(root_store) .with_client_auth_cert(client_certs, client_key) - .context("build TlsClientConfig") + .context("build TlsClientConfig")?; + + // WebSocket requires HTTP/1.1 — without ALPN the server may negotiate + // h2 (HTTP/2), which breaks the WebSocket upgrade handshake + // ("Key mismatch in Sec-WebSocket-Accept header"). 
+ config.alpn_protocols = vec![b"http/1.1".to_vec()]; + + Ok(config) } + // ── Per-job relay ───────────────────────────────────────────────────────────── async fn relay_one_job( pool: &PgPool, row: &RunningHostJob, tls_config: Arc<TlsClientConfig>, + cert_pems: &CertPems, + poll_interval_secs: u64, ) -> anyhow::Result<()> { let url = format!( "wss://{}:{}/api/v1/ws/jobs", row.host_address, DEFAULT_AGENT_PORT, ); - let (ws_stream, _) = connect_async_tls_with_config( + let (ws_stream, _) = match connect_async_tls_with_config( url.as_str(), None, false, Some(Connector::Rustls(tls_config)), ) .await - .with_context(|| format!("connect agent WS {url}"))?; + { + Ok(ws) => ws, + Err(e) => { + // Log the full error chain for TLS debugging + let mut source = e.source(); + let mut depth = 0; + while let Some(err) = source { + tracing::warn!( + job_id = %row.job_id, + host_id = %row.host_id, + depth, + error = %err, + "WS relay: TLS connection error detail" + ); + source = err.source(); + depth += 1; + } + // Fall back to HTTP polling instead of returning an error. + tracing::info!( + job_id = %row.job_id, + host_id = %row.host_id, + host = %row.host_address, + "WS relay: WebSocket connection failed, falling back to HTTP polling" + ); + return relay_one_job_poll(pool, row, cert_pems, poll_interval_secs).await; + }, + }; let (_sink, mut stream) = ws_stream.split(); @@ -281,6 +361,110 @@ async fn relay_one_job( Ok(()) } +// ── Per-job HTTP polling fallback ───────────────────────────────────────────── + +/// Fall back to HTTP polling when the WebSocket connection fails. +/// +/// Builds an [`AgentClient`] from the same cert paths used for TLS, polls +/// `GET /api/v1/jobs/{id}` every `poll_interval_secs`, and calls +/// [`process_event`] whenever the status changes from the previous poll. +/// Stops when the job reaches a terminal state (succeeded/failed/cancelled). 
+async fn relay_one_job_poll( + pool: &PgPool, + row: &RunningHostJob, + cert_pems: &CertPems, + poll_interval_secs: u64, +) -> anyhow::Result<()> { + // Build the HTTP client using the same mTLS certs. + let agent_client = AgentClient::new( + &row.host_address, + DEFAULT_AGENT_PORT, + &cert_pems.client_cert, + &cert_pems.client_key, + &cert_pems.ca_cert, + ) + .context("build AgentClient for polling fallback")?; + + let mut last_status: Option<String> = None; + let poll_interval = Duration::from_secs(poll_interval_secs); + + loop { + tokio::time::sleep(poll_interval).await; + + let job_status = match agent_client.job_status(&row.agent_job_id).await { + Ok(s) => s, + Err(e) => { + tracing::debug!( + job_id = %row.job_id, + host_id = %row.host_id, + error = %e, + "WS relay poll: job_status request failed, will retry" + ); + continue; + }, + }; + + // Map agent status to the WS event format. + // The agent uses "completed" but pm-worker expects "succeeded". + // The agent uses "queued" but pm-worker expects "running". + let mapped_status = match job_status.status.as_str() { + "queued" => "running", + "running" => "running", + "succeeded" => "succeeded", + "completed" => "succeeded", + "failed" => "failed", + "cancelled" => "cancelled", + other => { + tracing::warn!( + status = %other, + job_id = %row.job_id, + "WS relay poll: unknown agent status, treating as running" + ); + "running" + }, + }; + + tracing::debug!( + job_id = %row.job_id, + host_id = %row.host_id, + agent_status = %job_status.status, + mapped_status = %mapped_status, + progress = ?job_status.progress_percent, + "WS relay poll: fetched job status" + ); + + // Only process when the status has changed since the last poll. 
+ if last_status.as_deref() == Some(mapped_status) { + continue; + } + + last_status = Some(mapped_status.to_string()); + + let event = AgentWsEvent { + job_id: job_status.job_id.clone(), + status: mapped_status.to_string(), + output: job_status.output.clone(), + error: job_status.error.clone(), + progress_percent: job_status.progress_percent, + }; + + process_event(pool, row, &event).await; + + // Stop polling on terminal states. + if matches!(mapped_status, "succeeded" | "failed" | "cancelled") { + tracing::info!( + job_id = %row.job_id, + host_id = %row.host_id, + status = %mapped_status, + "WS relay poll: terminal state — stopping" + ); + break; + } + } + + Ok(()) +} + // ── Event processing ────────────────────────────────────────────────────────── async fn process_event(pool: &PgPool, row: &RunningHostJob, event: &AgentWsEvent) { diff --git a/tasks/todo.md b/tasks/todo.md index d687275..4bf9ea5 100644 --- a/tasks/todo.md +++ b/tasks/todo.md @@ -1,297 +1,59 @@ -# Linux Patch Manager — Implementation Plan +# WebSocket + Polling Fallback Implementation Plan -## Project Structure +## Problem +The linux-patch-api agent's `/api/v1/ws/jobs` endpoint is a stub that returns HTTP 101 +with a JSON body but doesn't compute the required `Sec-WebSocket-Accept` header. This +causes the pm-worker WS relay to fail with "Key mismatch in Sec-WebSocket-Accept header". 
-``` -linux_patch_manager/ -├── Cargo.toml # Workspace root -├── crates/ -│ ├── pm-web/ # Axum web server binary crate -│ ├── pm-worker/ # Background worker binary crate -│ ├── pm-core/ # Shared library: config, DB pool, models, errors, types -│ ├── pm-agent-client/ # mTLS HTTP client for agent communication -│ ├── pm-auth/ # Auth: JWT (EdDSA), Argon2id, TOTP, WebAuthn, RBAC, Azure SSO -│ ├── pm-ca/ # Internal CA: rcgen + rustls certificate management -│ └── pm-reports/ # PDF (printpdf + plotters) and CSV generation -├── migrations/ # SQLx database migrations -│ ├── 001_initial_schema.sql -│ ├── 002_auth_system.sql -│ ├── 003_host_management.sql -│ ├── 004_jobs_and_scheduling.sql -│ ├── 005_audit_logging.sql -│ └── 006_system_config.sql -├── frontend/ # React + TypeScript SPA -│ ├── src/ -│ │ ├── api/ # API client (axios/fetch) -│ │ ├── components/ # Shared MUI components -│ │ ├── pages/ # 11 page components -│ │ ├── hooks/ # Custom React hooks -│ │ ├── store/ # State management (zustand or context) -│ │ ├── theme/ # MUI theme (light + dark) -│ │ ├── types/ # TypeScript interfaces -│ │ └── utils/ # Utilities -│ ├── package.json -│ ├── vite.config.ts -│ ├── tsconfig.json -│ └── index.html -├── config/ -│ └── config.example.toml # Example configuration -├── systemd/ -│ ├── patch-manager-web.service -│ └── patch-manager-worker.service -├── docs/ -│ └── runbooks/ -│ └── restore.md # Backup/restore runbook -├── scripts/ -│ ├── setup.sh # Initial host setup script -│ └── build-frontend.sh # Frontend build script -├── SPEC.md -├── REQUIREMENTS.md -├── ARCHITECTURE.md -├── README.md -└── .gitignore -``` +Additionally, the pm-worker WS relay's rustls ClientConfig didn't set ALPN to http/1.1, +causing HTTP/2 negotiation which also breaks WebSocket upgrades. ---- +## Root Causes +1. **Agent WS handler is a stub** — doesn't implement RFC 6455 WebSocket handshake +2. **WS relay missing ALPN** — rustls ClientConfig didn't set `alpn_protocols` to `http/1.1` +3. 
**No fallback** — WS relay has no fallback if WebSocket fails -## Milestones +## Completed +- [x] ALPN fix in pm-worker ws_relay.rs (forces HTTP/1.1 for WebSocket) +- [x] Error chain logging in pm-worker ws_relay.rs (for future debugging) +- [x] Job-level WS event_type fix (frontend + backend) -Each milestone produces a **testable vertical slice** — backend + frontend + database working together. +## Remaining Tasks -### M1: Project Scaffolding + Database Schema + Core Infrastructure -**Goal:** Runnable workspace with DB, config, logging, error handling. +### Phase 1: Implement proper WebSocket in linux-patch-api +- [ ] Replace stub `websocket_handler` in `src/api/handlers/websocket.rs` with proper actix-web-actors WebSocket +- [ ] Create `WsJobActor` that: + - Accepts WebSocket connections via `actix_web_actors::ws::start()` + - Subscribes to job status updates from `JobManager` + - Streams job status events to connected clients + - Handles subscribe/unsubscribe messages +- [ ] Wire up broadcast channel from JobManager to WebSocket actors +- [ ] Build and deploy to dev LXC -- [x] Initialize Rust workspace with 7 crates (pm-web, pm-worker, pm-core, pm-agent-client, pm-auth, pm-ca, pm-reports) -- [x] Initialize React + TypeScript + Vite + MUI frontend project -- [x] Create `config.example.toml` with all configuration keys -- [x] Implement `pm-core::config` — TOML config loading + env overrides (`PATCH_MANAGER__SECTION__KEY`) -- [x] Implement `pm-core::db` — SQLx PgPool initialization, connection from config -- [x] Implement `pm-core::error` — Unified error type with API error envelope (`error.code`, `error.message`, `error.request_id`, `error.details`) -- [x] Implement `pm-core::request_id` — ULID generation + `X-Request-Id` header middleware -- [x] Implement `pm-core::logging` — `tracing` + `tracing-subscriber` JSON formatter, configurable log levels -- [x] Create initial database migrations (001_initial_schema.sql): `hosts`, `groups`, `host_groups`, `users`, 
`user_groups`, `refresh_tokens`, `maintenance_windows`, `patch_jobs`, `patch_job_hosts`, `host_patch_data`, `host_health_data`, `certificates`, `audit_log`, `azure_sso_config`, `system_config`, `worker_heartbeat`, `discovery_results` -- [x] Implement `pm-web` binary: Axum app skeleton, static file serving placeholder, `/status/health` endpoint -- [x] Implement `pm-worker` binary: Tokio runtime skeleton, DB connection, worker heartbeat writer (30s interval) -- [x] Implement `sqlx::migrate!` embedded migrations in pm-web, advisory lock for single-writer -- [x] Worker waits for expected schema version before accepting work -- [x] Create `systemd/patch-manager-web.service` and `systemd/patch-manager-worker.service` unit files -- [x] Create `scripts/setup.sh` for initial host setup -- [x] Create `scripts/build-frontend.sh` -- [x] Verify: both services start, `/status/health` returns 200, worker heartbeat updates +### Phase 2: Add polling fallback in pm-worker WS relay +- [ ] In `relay_one_job()`, if WebSocket connection fails, fall back to HTTP polling +- [ ] Use existing `AgentClient` (reqwest + mTLS) to poll `/api/v1/jobs/{id}` +- [ ] Poll interval: configurable, default 5-10 seconds +- [ ] Convert polled job status to same event format as WebSocket messages +- [ ] Fire `pg_notify('job_update')` for polled status changes -### M2: Authentication & Authorization + Frontend Shell -**Goal:** Users can log in with MFA, JWT auth works, RBAC middleware enforces roles. 
+### Phase 3: Testing & Deployment +- [ ] Test WebSocket connection on dev LXC +- [ ] Test polling fallback on dev LXC +- [ ] Verify job completion status updates in UI +- [ ] Push to Gitea +- [ ] Update dev LXC deployment -- [x] Implement `pm-auth::password` — Argon2id hashing with calibrated parameters (`m_cost=65536`, `t_cost=3`, `p_cost=1`) -- [x] Implement `pm-auth::jwt` — EdDSA/Ed25519 JWT issuance and validation, 15-min TTL, 90-day key rotation with 24-hour overlap -- [x] Implement `pm-auth::refresh` — Opaque 256-bit refresh tokens, hashed storage in `refresh_tokens`, 1-hour sliding inactivity timeout, rotation on use -- [x] Implement `pm-auth::mfa_totp` — TOTP setup, verify, QR code generation -- [x] Implement `pm-auth::mfa_webauthn` — WebAuthn registration and authentication -- [x] Implement `pm-auth::rbac` — Admin/Operator role middleware, group-scoped access enforcement -- [x] Implement `pm-auth::session` — Login flow (password → MFA → access+refresh tokens), logout (revoke refresh), force-revoke -- [x] Implement `pm-web` auth routes: `POST /api/v1/auth/login`, `POST /api/v1/auth/refresh`, `POST /api/v1/auth/logout`, MFA setup endpoints -- [x] Implement IP whitelist middleware on all connection points -- [x] Frontend: App shell with React Router, MUI theme (light + dark), auth context, login page, MFA setup page -- [x] Frontend: API client with JWT interceptors (auto-refresh), 401 redirect to login -- [x] Create seed migration: default admin account -- [x] Verify: login with MFA, JWT validation, refresh token rotation, RBAC blocks unauthorized access, IP whitelist blocks unknown IPs +## Architecture Notes -### M3: Host Management + Groups + Frontend Pages -**Goal:** Full host CRUD, group management, auto-discovery. 
+### linux-patch-api WebSocket (Phase 1) +- Uses `actix-web-actors::ws` for proper RFC 6455 WebSocket handshake +- `WsJobActor` implements `actix::Actor` + `StreamHandler` +- JobManager has a `tokio::sync::broadcast` channel for status updates +- WsJobActor subscribes to this channel and forwards events to clients -- [x] Implement host CRUD routes: `GET/POST /api/v1/hosts`, `GET/DELETE /api/v1/hosts/{id}` -- [x] Implement FQDN resolution on host add (resolve to IP at registration time) -- [x] Implement group CRUD routes: `GET/POST /api/v1/groups`, `GET/DELETE /api/v1/hosts/{id}/groups` -- [x] Implement host ↔ group and user ↔ group membership management -- [x] Implement RBAC scoping: operators can only see/manage hosts in their groups -- [x] Implement auto-discovery: `POST /api/v1/discovery/cidr` → worker scans CIDR, bounded concurrency (128), TCP+TLS probe (1.5s timeout), progress tracking, cancel action -- [x] Implement discovery results table and review flow -- [x] Implement host removal with audit logging -- [x] Frontend: Hosts page (filterable list by group, status, OS) -- [x] Frontend: Host Detail page (system info, packages, patches, jobs, maintenance window config) -- [x] Frontend: Groups page (manage groups, assign hosts and operators) -- [x] Frontend: Users page (local account management, MFA setup, group assignments) -- [x] Verify: add/remove hosts, group assignments, RBAC enforcement, CIDR scan with progress - -### M4: Agent Communication Layer + Dashboard -**Goal:** mTLS client works, health/patch polling operational, dashboard shows fleet status. 
- -- [x] Implement `pm-agent-client` — Rustls-based mTLS HTTP client with client certificate, TLS 1.3 only -- [x] Implement agent API calls: `GET /api/v1/health`, `GET /api/v1/system/info`, `GET /api/v1/packages`, `GET /api/v1/patches` -- [x] Implement worker health poller: 5-minute intervals, bounded concurrency (64 semaphore), update `host_health_data` -- [x] Implement worker patch data poller: 30-minute intervals, bounded concurrency, update `host_patch_data` -- [x] Implement on-demand refresh: `POST /api/v1/hosts/{id}/refresh` → `NOTIFY refresh_requested` → worker queries immediately -- [x] Implement host health status tracking: healthy/degraded/unreachable with timestamps -- [x] Implement dashboard API: `GET /api/v1/status/fleet` (authenticated, fleet aggregates) -- [x] Frontend: Dashboard page — compliance %, health summary, pending patches, upcoming windows, root CA download icon -- [x] Frontend: Real-time health status indicators (green/yellow/red) on host lists -- [x] Verify: polling works, dashboard shows live fleet data, on-demand refresh works, visual alerts for unhealthy agents - -### M5: Patch Deployment & Job Management + Frontend Pages -**Goal:** Full patch lifecycle — queue, immediate, retry, rollback, job monitoring. 
- -- [x] Implement job creation: `POST /api/v1/jobs` (queue for window or apply now) -- [x] Implement `patch_jobs` and `patch_job_hosts` row creation -- [x] Implement `NOTIFY job_enqueued` for immediate-apply wake -- [x] Implement worker job executor: call agent `POST /api/v1/patches/apply`, track async job IDs -- [x] Implement worker retry engine: exponential backoff (1min, 5min, 30min), 3 retries max -- [x] Implement patch job auto-retry within maintenance window (1 retry) -- [x] Implement batch partial failure handling: auto-retry once, then report -- [x] Implement rollback: `POST /api/v1/jobs/{id}/rollback` → worker calls agent rollback endpoint -- [x] Implement job status tracking: poll agent `GET /api/v1/jobs/{id}` for running jobs -- [x] Implement job listing/detail API: `GET /api/v1/jobs`, `GET /api/v1/jobs/{id}` -- [x] Frontend: Patch Deployment page (select hosts → review patches → queue or apply now) -- [x] Frontend: Jobs page (job list, per-host status, rollback action) -- [ ] Verify: queued job waits for window, immediate job runs now, retry logic works, rollback works, batch partial failures reported - -### M6: Maintenance Windows & Scheduling + Frontend Page -**Goal:** Per-device recurring and one-time maintenance windows, auto-execution at window open. 
- -- [x] Implement maintenance window CRUD: `GET/POST/PUT/DELETE /api/v1/hosts/{id}/maintenance-windows` -- [x] Implement recurring schedule logic: daily, weekly, monthly (cron-like evaluation) -- [x] Implement one-time window support -- [x] Implement worker job scheduler: detect window openings, dispatch queued jobs -- [x] Implement window-open event triggering job execution -- [x] Frontend: Maintenance Windows page (per-device schedule management) -- [x] Frontend: Maintenance window config on Host Detail page -- [ ] Verify: create recurring/one-time windows, queued jobs execute at window open, window expiration stops execution - -### M7: WebSocket Relay (Real-Time Job Status) -**Goal:** Browser receives live job updates via WebSocket. - -- [x] Implement WS ticket endpoint: `POST /api/v1/ws/ticket` (single-use, 60s expiry, JWT-authenticated) -- [x] Implement WebSocket relay: `WS /api/v1/ws/jobs?ticket=...` → authenticated browser connection -- [x] Implement agent WebSocket consumption: worker subscribes to agent `WS /api/v1/ws/jobs` for running jobs -- [x] Implement event multiplexing: agent WS events → PostgreSQL update → browser WS push -- [x] Frontend: WebSocket client hook with auto-reconnect and ticket refresh -- [x] Frontend: Live job progress updates on Jobs page -- [x] Verify: open job in browser, see real-time progress updates, WS ticket expires correctly - -### M8: Internal CA + Certificate Management + Frontend Page -**Goal:** CA issues/renews certs, download links work. 
- -- [x] Implement `pm-ca` — CA initialization (root key + cert generation), stored at `/etc/patch-manager/ca/` with 0600 permissions -- [x] Implement client certificate issuance for mTLS (per-host certs) -- [x] Implement certificate renewal flow -- [x] Implement certificate revocation (mark revoked in `certificates` table, re-issue replacement) -- [x] Implement download endpoints: `GET /api/v1/ca/root.crt`, `GET /api/v1/hosts/{id}/client.crt` -- [x] Implement Web UI TLS certificate: self-signed from internal CA (default) or operator-supplied cert/key -- [x] Frontend: Certificates page (view/manage CA, issue/renew certs, view expiry) -- [x] Frontend: Root CA download icon on Dashboard -- [x] Frontend: Host-specific cert download icon on Host Detail page -- [ ] Verify: CA generates certs, downloads work, TLS cert strategy switchable - -### M9: Reporting (CSV + PDF with Charts) + Frontend Page -**Goal:** All 4 report types exportable as CSV and PDF. - -- [x] Implement `pm-reports::csv` — CSV generation for all report types -- [x] Implement `pm-reports::pdf` — PDF generation with `printpdf` + `plotters` charts -- [x] Implement compliance report: % hosts fully patched by group/fleet, trend charts -- [x] Implement patch history report: operations per host/group -- [x] Implement vulnerability exposure report: hosts with pending CVEs -- [x] Implement audit trail report: who did what when -- [x] Implement report API: `GET /api/v1/reports/compliance`, `patch-history`, `vulnerability`, `audit` with `?format=csv|pdf` -- [x] Frontend: Reports page (select type, filters, generate, download) -- [ ] Verify: all 4 reports generate as CSV and PDF, PDFs include charts - -### M10: Settings Page (Azure SSO, SMTP, TLS, IP Whitelist) + Frontend Page -**Goal:** All runtime configuration manageable from the UI. 
- -- [x] Implement `system_config` table CRUD API -- [x] Implement Azure SSO configuration: tenant ID, client ID/secret, redirect URI, scopes -- [x] Implement "Test Connection" action for Azure SSO (round-trip against Azure AD, report success/failure without enabling) -- [x] Implement SMTP configuration: host, port, auth mode, username/password, TLS mode, from-address -- [x] Implement "Send Test Email" action for SMTP -- [x] Implement polling interval tuning (health, patch) in Settings -- [x] Implement Web UI TLS certificate strategy selection (internal CA vs. operator-supplied) -- [x] Implement IP whitelist management in Settings -- [x] Implement Azure SSO OAuth2/OIDC Authorization Code flow with PKCE -- [x] Frontend: Settings page with all configuration sections and test actions -- [ ] Verify: Azure SSO test connection works, test email sends, TLS strategy switches, IP whitelist updates take effect - -### M11: Email Notifications + Audit Logging Hardening -**Goal:** Optional email works, audit logs are tamper-evident. - -- [x] Implement email notifier in worker (Lettre crate, optional/disabled by default) -- [x] Implement email templates: patch failure, job completion, maintenance window reminders -- [x] Implement audit log hash chaining: `prev_hash` + `row_hash` on every insert -- [x] Implement periodic audit integrity verification job -- [x] Implement on-demand audit integrity verification from UI -- [x] Implement audit log for all configuration changes (Azure SSO, SMTP, IP whitelist, TLS cert strategy) -- [x] Implement audit log for certificate operations (issue, renew, download, revoke) -- [x] Frontend: Email notification settings integration in Settings page -- [x] Frontend: Audit integrity verification action in Reports/Users area -- [x] Verify: email sends on failure, audit chain is intact, tampering detected by verification - -### M12: Deployment Packaging, Backup/DR, Integration Testing -**Goal:** Production-ready deployment with documented runbooks. 
- -- [x] Create `docs/runbooks/restore.md` — backup/restore procedure -- [x] Implement nightly `pg_dump` script to `/var/backups/patch-manager/` -- [x] Implement CA material backup inclusion -- [x] Implement `/etc/patch-manager/` config backup (excluding secrets unless encrypted destination) -- [x] Create `scripts/setup.sh` — full host setup (install deps, create service user, set permissions, initialize DB) -- [x] Finalize systemd unit files with proper dependencies, restart policies, logging -- [x] End-to-end integration tests: full patch lifecycle across multiple agents -- [x] Performance test: verify 500-host polling, dashboard load < 5s, CIDR scan < 10s for /22 -- [x] Security review: TLS 1.3 enforcement, IP whitelist, RBAC, audit chain integrity -- [x] Compliance mapping verification: HIPAA and PCI-DSS controls documented and testable -- [x] Verify: backup/restore works, RPO 24h / RTO 4h achievable, all NFRs met - ---- - -## Dependency Graph - -``` -M1 (scaffolding) - ├──> M2 (auth) - │ ├──> M3 (hosts/groups) - │ │ ├──> M4 (agent comm + dashboard) - │ │ │ ├──> M5 (patch deployment + jobs) - │ │ │ │ ├──> M6 (maintenance windows) - │ │ │ │ │ └──> M7 (websocket relay) - │ │ │ │ └──> M7 (websocket relay) - │ │ │ └──> M8 (CA + certs) - │ │ └──> M8 (CA + certs) - │ └──> M10 (settings) - ├──> M8 (CA + certs) [needed by M4 for mTLS] - └──> M9 (reports) - -M10 (settings) ──> M11 (email + audit hardening) -M11 ──> M12 (deployment + testing) -``` - -**Critical path:** M1 → M2 → M3 → M4 → M5 → M6 → M7 → M11 → M12 - -**Note:** M8 (CA) should be started early (after M1) since M4 (agent communication) requires mTLS client certs. 
- ---- - -## Estimated Effort - -| Milestone | Backend | Frontend | DB | Total | -|-----------|----------|----------|-----|-------| -| M1 | 3 days | 1 day | 1 day | 5 days | -| M2 | 4 days | 2 days | 0.5 day | 6.5 days | -| M3 | 3 days | 3 days | 0.5 day | 6.5 days | -| M4 | 3 days | 2 days | 0.5 day | 5.5 days | -| M5 | 4 days | 2 days | 0.5 day | 6.5 days | -| M6 | 2 days | 1.5 days | 0.5 day | 4 days | -| M7 | 2 days | 1.5 days | 0 | 3.5 days | -| M8 | 2 days | 1.5 days | 0 | 3.5 days | -| M9 | 3 days | 1.5 days | 0 | 4.5 days | -| M10 | 3 days | 2 days | 0.5 day | 5.5 days | -| M11 | 2 days | 1 day | 0.5 day | 3.5 days | -| M12 | 2 days | 0.5 days | 0.5 day | 3 days | -| **Total** | **33 days** | **19.5 days** | **5 days** | **~57.5 days** | - -With a single developer: ~12 weeks. With parallel backend/frontend: ~7-8 weeks. - ---- - -## Review Notes - -- [ ] Kelly to review and approve this plan before implementation begins -- [ ] Confirm milestone ordering and priorities -- [ ] Confirm whether M8 (CA) should be pulled forward to support M4 -- [ ] Confirm whether any milestones can be deferred to a later release +### pm-worker WS relay fallback (Phase 2) +- `relay_one_job()` tries WebSocket first +- On connection failure, falls back to `relay_one_job_poll()` using AgentClient +- Poll interval configurable via `[worker]` config (default: 10s) +- Status changes trigger `pg_notify('job_update')` same as WebSocket events