diff --git a/Cargo.lock b/Cargo.lock index 34d51e1..8499da0 100755 --- a/Cargo.lock +++ b/Cargo.lock @@ -323,6 +323,21 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "1.3.2" @@ -2585,6 +2600,7 @@ dependencies = [ "hex", "pem", "pm-core", + "proptest", "rand 0.8.6", "rcgen", "rustls", @@ -2817,6 +2833,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.11.1", + "num-traits", + "rand 0.9.4", + "rand_chacha 0.9.0", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + [[package]] name = "pxfm" version = "0.1.29" @@ -2838,6 +2873,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quinn" version = "0.11.9" @@ -2979,6 +3020,15 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core 0.9.5", +] + [[package]] name = 
"raw-cpuid" version = "11.6.0" @@ -3240,6 +3290,18 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "ryu" version = "1.0.23" @@ -4385,6 +4447,12 @@ dependencies = [ "web-time", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicase" version = "2.9.0" @@ -4506,6 +4574,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.5.0" diff --git a/Cargo.toml b/Cargo.toml index 6277804..dbad154 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,9 @@ base64 = { version = "0.22" } hex = { version = "0.4" } sha2 = { version = "0.10" } aes-gcm = { version = "0.10" } + +# Testing +proptest = { version = "1" } ipnet = { version = "2" } url = { version = "2" } diff --git a/crates/pm-ca/Cargo.toml b/crates/pm-ca/Cargo.toml index ae401ca..29a0cf1 100644 --- a/crates/pm-ca/Cargo.toml +++ b/crates/pm-ca/Cargo.toml @@ -23,3 +23,6 @@ rustls = { workspace = true } rcgen = { workspace = true } pem = { workspace = true } time = { workspace = true } + +[dev-dependencies] +proptest = { workspace = true } diff --git 
a/crates/pm-ca/src/ca.rs b/crates/pm-ca/src/ca.rs old mode 100755 new mode 100644 index 403e55a..297a6c2 --- a/crates/pm-ca/src/ca.rs +++ b/crates/pm-ca/src/ca.rs @@ -13,8 +13,9 @@ use anyhow::{Context, Result}; use chrono::{DateTime, Duration as ChronoDuration, Utc}; use rand::RngCore; use rcgen::{ - BasicConstraints, Certificate, CertificateParams, DistinguishedName, DnType, - ExtendedKeyUsagePurpose, Ia5String, IsCa, KeyPair, KeyUsagePurpose, SanType, SerialNumber, + BasicConstraints, Certificate, CertificateParams, CertificateRevocationListParams, + DistinguishedName, DnType, ExtendedKeyUsagePurpose, Ia5String, IsCa, KeyIdMethod, KeyPair, + KeyUsagePurpose, RevocationReason, RevokedCertParams, SanType, SerialNumber, PKCS_ECDSA_P256_SHA256, }; use sqlx::{PgPool, Row}; @@ -524,4 +525,205 @@ impl CertAuthority { .context("reconstruct CA certificate for signing")?; Ok((key, cert)) } + + // ----------------------------------------------------------------------- + // CRL generation + // ----------------------------------------------------------------------- + + /// Generate a Certificate Revocation List (CRL) signed by this CA. + /// + /// Queries the `certificates` table for certs with `status = 'revoked'` + /// and `not_after > NOW()` (i.e., not yet naturally expired) and bundles + /// their serials into an X.509 v2 CRL. + /// + /// Returns the CRL as a PEM-encoded string. + /// + /// # Performance + /// + /// O(n) where n is the number of revoked-but-not-expired certs. For our + /// target scale (max ~2500 clients per manager, low single-digit % annual + /// revocation rate), this is KB-range and sub-millisecond to generate. + pub async fn generate_crl(&self, db: &PgPool) -> Result { + tracing::debug!("Generating CRL from certificates table"); + + // Query revoked certs that haven't naturally expired yet. + // Expired certs are pruned from the CRL to keep it small. 
+ let rows = sqlx::query( + "SELECT serial_number, revoked_at \ + FROM certificates \ + WHERE status = 'revoked'::cert_status \ + AND revoked_at IS NOT NULL \ + AND not_after > NOW() \ + ORDER BY revoked_at ASC", + ) + .fetch_all(db) + .await + .context("query revoked certificates for CRL")?; + + let mut revoked_certs = Vec::with_capacity(rows.len()); + for row in &rows { + let serial_hex: String = row.try_get("serial_number").context("serial_number")?; + let revoked_at: DateTime = row.try_get("revoked_at").context("revoked_at")?; + + // Convert hex serial back to bytes for rcgen. + let serial_bytes = + hex::decode(&serial_hex).context("serial_number is not valid hex")?; + let serial_number = SerialNumber::from_slice(&serial_bytes); + + // chrono DateTime -> time::OffsetDateTime for rcgen; out-of-range values become now. + let revocation_time = OffsetDateTime::from_unix_timestamp(revoked_at.timestamp()) + .unwrap_or_else(|_| OffsetDateTime::now_utc()); + + revoked_certs.push(RevokedCertParams { + serial_number, + revocation_time, + reason_code: Some(RevocationReason::Unspecified), + invalidity_date: None, + }); + } + + let count = revoked_certs.len(); + tracing::debug!(revoked_count = count, "Building CRL with revoked entries"); + + // CRL validity window: this_update = now, next_update = now + 24h + // (agents refresh every 24h, so this gives them a fresh CRL on every poll). + let now = OffsetDateTime::now_utc(); + let next_update = now + TimeDuration::hours(24); + + // CRL number: the current Unix timestamp in whole seconds, encoded as + // 8 big-endian bytes. RFC 5280 section 5.2.3 requires CRL numbers to + // increase monotonically, which holds here only while the wall clock + // never steps backwards and CRLs are generated at most once per second.
+ let crl_number = SerialNumber::from_slice(&Utc::now().timestamp().to_be_bytes()); + + let crl_params = CertificateRevocationListParams { + this_update: now, + next_update, + crl_number, + issuing_distribution_point: None, + revoked_certs, + key_identifier_method: KeyIdMethod::Sha256, + }; + + let (ca_key, ca_cert) = self.ca_objects()?; + let crl = crl_params + .signed_by(&ca_cert, &ca_key) + .context("sign CRL with CA key")?; + let crl_pem = crl.pem().context("encode CRL as PEM")?; + + tracing::info!( + revoked_count = count, + next_update = %next_update, + "CRL generated and signed" + ); + + Ok(crl_pem) + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper: build a `CertAuthority` for testing without going through disk init. + /// Generates a fresh ECDSA P-256 CA in memory. + async fn test_ca() -> CertAuthority { + let key = KeyPair::generate_for(&PKCS_ECDSA_P256_SHA256).unwrap(); + + let mut params = CertificateParams::default(); + params.not_before = OffsetDateTime::now_utc(); + params.not_after = OffsetDateTime::now_utc() + TimeDuration::days(365 * 10); + params.is_ca = IsCa::Ca(BasicConstraints::Unconstrained); + params.key_usages = vec![KeyUsagePurpose::KeyCertSign, KeyUsagePurpose::CrlSign]; + + let mut dn = DistinguishedName::new(); + dn.push(DnType::CommonName, "Test Root CA"); + dn.push(DnType::OrganizationName, "Patch Manager Test"); + params.distinguished_name = dn; + + let ca_cert = params.self_signed(&key).unwrap(); + CertAuthority { + base_dir: PathBuf::from("/tmp/test-ca"), + ca_cert_pem: ca_cert.pem(), + ca_key_pem: key.serialize_pem(), + } + } + + #[test] + fn make_serial_produces_unique_16_byte_serials() { + let (s1, h1) = make_serial(); + let (s2, h2) = make_serial(); + assert_ne!(h1, h2, "serial hex strings should differ"); + assert_eq!( + h1.len(), + 32, +
"serial should be 16 bytes hex-encoded (32 chars)" + ); + assert_ne!(s1, s2, "rcgen SerialNumber values should differ"); + } + + #[test] + fn ca_objects_round_trip() { + // Build a CA, then reconstruct via ca_objects() and verify the cert+key parse. + let rt = tokio::runtime::Runtime::new().unwrap(); + let ca = rt.block_on(test_ca()); + let (key, cert) = ca.ca_objects().expect("ca_objects should succeed"); + assert!(!key.serialize_pem().is_empty()); + assert!(!cert.pem().is_empty()); + } + + /// Smoke test: `generate_crl` against the database named by + /// `TEST_DATABASE_URL` returns PEM with the X.509 CRL header and footer. + /// + /// Only the PEM markers are asserted; the CRL is not parsed back and its + /// entries are not inspected. Full integration testing with a real + /// database is in `tests/crl_integration.rs`. + #[tokio::test] + async fn generate_crl_empty_db_produces_valid_pem() { + // Needs a real Postgres test database (expected to be empty). If + // TEST_DATABASE_URL is not set, return early; the test then counts as passed. + let Ok(db_url) = std::env::var("TEST_DATABASE_URL") else { + eprintln!("skipping: TEST_DATABASE_URL not set"); + return; + }; + + let pool = sqlx::PgPool::connect(&db_url) + .await + .expect("connect to test db"); + let ca = test_ca().await; + + let crl_pem = ca.generate_crl(&pool).await.expect("generate_crl"); + assert!( + crl_pem.contains("-----BEGIN X509 CRL-----"), + "PEM header missing" + ); + assert!( + crl_pem.contains("-----END X509 CRL-----"), + "PEM footer missing" + ); + } +} + +// --------------------------------------------------------------------------- +// Property-based tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod proptests { + use super::*; + + /// Two back-to-back `make_serial` calls must return distinct 16-byte serials. + /// (Plain example-based test; this module defines no `proptest!` strategies yet.)
+ #[test] + fn make_serial_produces_unique_values() { + let (s1, h1) = make_serial(); + let (s2, h2) = make_serial(); + assert_ne!(h1, h2, "serial hex strings should differ"); + assert_eq!(h1.len(), 32, "serial should be 16 bytes hex-encoded"); + assert_ne!(s1, s2, "rcgen SerialNumber values should differ"); + } } diff --git a/crates/pm-core/src/models.rs b/crates/pm-core/src/models.rs index 81a2bfd..0d5698f 100644 --- a/crates/pm-core/src/models.rs +++ b/crates/pm-core/src/models.rs @@ -175,9 +175,33 @@ pub enum EnrollmentStatusResponse { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PkiBundle { + /// PEM-encoded CA certificate (leaf-most cert in the chain). + /// For root mode, this is the self-signed root CA. + /// For sub-CA mode, this is the intermediate CA cert. pub ca_crt: String, + /// PEM-encoded full CA certificate chain (concatenated intermediates + root). + /// For root mode, this contains just the root CA cert (same as ca_crt). + /// For sub-CA mode, this contains the intermediate cert followed by the + /// external root cert, enabling the agent to verify the full chain up to + /// the trust anchor. + /// + /// This field was added for CRL support (issue #7): the agent needs the + /// full chain to verify CRL signatures that chain up to the root CA. + #[serde(default)] + pub ca_chain: String, + /// PEM-encoded agent server certificate. pub server_crt: String, + /// PEM-encoded agent server private key (PKCS#8). pub server_key: String, + /// PEM-encoded Certificate Revocation List (CRL) signed by the CA. + /// The agent uses this to reject revoked client certificates during mTLS + /// handshakes. If CRL generation fails during enrollment, this field will + /// be an empty string and the agent should fall back to WebPKI-only + /// verification (degraded mode). + /// + /// Added for CRL support (issue #7). + #[serde(default)] + pub crl_pem: String, } /// Time-to-live for approved enrollment PKI bundles (10 minutes). 
diff --git a/crates/pm-web/src/main.rs b/crates/pm-web/src/main.rs index 97213ab..52916d3 100644 --- a/crates/pm-web/src/main.rs +++ b/crates/pm-web/src/main.rs @@ -453,6 +453,8 @@ pub fn build_router(state: AppState) -> Router { .nest("/api/v1/auth", auth_public_router) // Public enrollment endpoints (rate-limited, no JWT) .nest("/api/v1", enrollment_router) + // Public PKI endpoints (CRL distribution, no JWT — CRLs are self-authenticating) + .nest("/api/v1", routes::pki::router()) // Public SSO routes (rate-limited, no JWT) .nest("/api/v1/auth/sso", sso_public_router) // Public Azure SSO routes (rate-limited, no JWT) diff --git a/crates/pm-web/src/routes/enrollment.rs b/crates/pm-web/src/routes/enrollment.rs index eef0a5b..c9b69df 100644 --- a/crates/pm-web/src/routes/enrollment.rs +++ b/crates/pm-web/src/routes/enrollment.rs @@ -303,10 +303,17 @@ async fn approve_enrollment( // server memory beyond the signing operation. // // See: https://github.com/Draco-Lunaris/Linux-Patch-Manager/issues/9 + // + // Include the full CA chain (for root mode, same as ca_crt; for sub-CA, + // includes intermediate + root) and the current CRL. 
+ let ca_chain = issued.ca_root_pem.clone(); // Root mode: chain is just the root cert + let crl_pem = state.ca.generate_crl(&state.db).await.unwrap_or_default(); // Empty string on failure: agent falls back to WebPKI-only let pki = PkiBundle { ca_crt: issued.ca_root_pem, + ca_chain, server_crt: issued.server_cert_pem, server_key: issued.server_key_pem, + crl_pem, }; state.approved_enrollments.insert( enrollment_request.polling_token.clone(), diff --git a/crates/pm-web/src/routes/mod.rs b/crates/pm-web/src/routes/mod.rs old mode 100755 new mode 100644 index e9a1c82..2ecb673 --- a/crates/pm-web/src/routes/mod.rs +++ b/crates/pm-web/src/routes/mod.rs @@ -8,10 +8,10 @@ pub mod health_checks; pub mod hosts; pub mod jobs; pub mod maintenance_windows; +pub mod pki; +pub mod reports; pub mod settings; pub mod sso; pub mod status; pub mod users; pub mod ws; - -pub mod reports; diff --git a/crates/pm-web/src/routes/pki.rs b/crates/pm-web/src/routes/pki.rs new file mode 100644 index 0000000..f94bd0c --- /dev/null +++ b/crates/pm-web/src/routes/pki.rs @@ -0,0 +1,61 @@ +//! PKI endpoints for certificate revocation list (CRL) distribution. +//! +//! This module exposes the CRL endpoint that agents poll every 24 hours to +//! check for revoked certificates. The CRL is signed by the internal CA and +//! is publicly accessible (CRLs are self-authenticating — they carry the CA +//! signature and do not require client authentication). + +use crate::AppState; +use axum::{ + extract::State, + http::{header, StatusCode}, + response::IntoResponse, + routing::get, + Router, +}; + +/// Define public PKI routes. +/// +/// These endpoints are **unauthenticated** because CRLs are self-authenticating: +/// the agent verifies the CRL signature against its pinned CA certificate. +/// No client certificate or API key is required. 
+pub fn router() -> Router { + Router::new().route("/pki/crl.pem", get(get_crl)) +} + +/// `GET /api/v1/pki/crl.pem` +/// +/// Returns the current Certificate Revocation List (CRL) as a PEM-encoded +/// X.509 CRL. The CRL is signed by the internal CA and contains the serial +/// numbers of all revoked certificates that have not yet expired. +/// +/// # Cache headers +/// +/// The response includes `Cache-Control: max-age=3600` (1 hour) to allow +/// intermediate caches to serve the CRL. Agents refresh every 24 hours, +/// so a 1-hour cache is a reasonable balance between freshness and load. +/// +/// # CRL generation +/// +/// The CRL is generated on demand from the `certificates` table. For our +/// target scale (max ~2500 clients), this is a fast query and the resulting +/// CRL is KB-range. If performance becomes a concern, the CRL can be cached +/// in memory and regenerated on a schedule (see background task in main.rs). +async fn get_crl(State(state): State) -> impl IntoResponse { + match state.ca.generate_crl(&state.db).await { + Ok(crl_pem) => ( + StatusCode::OK, + [( + header::CONTENT_TYPE, + "application/x-pem-file; charset=utf-8", + )], + [(header::CACHE_CONTROL, "max-age=3600")], + crl_pem, + ) + .into_response(), + Err(e) => { + tracing::error!(error = %e, "Failed to generate CRL"); + (StatusCode::INTERNAL_SERVER_ERROR, "Failed to generate CRL").into_response() + }, + } +} diff --git a/tasks/issue-7-crl-design.md b/tasks/issue-7-crl-design.md new file mode 100644 index 0000000..f59f520 --- /dev/null +++ b/tasks/issue-7-crl-design.md @@ -0,0 +1,404 @@ +# Issue #7: Certificate Revocation Enforcement — Full CRL Design + +**GitHub Issue:** https://github.com/Draco-Lunaris/Linux-Patch-Manager/issues/7 +**Companion issue (agent repo):** https://github.com/Draco-Lunaris/Linux-Patch-Api/issues/20 +**Status:** Design finalized — implementation pending +**Repos affected:** linux-patch-manager (this), linux-patch-api (agent) +**Last updated:** 2026-06-05 + +--- + 
+## 1. Goal + +Enforce certificate revocation at the mTLS handshake by having the manager (CA operator) publish a Certificate Revocation List (CRL) and the agent (linux-patch-api) consult it during TLS client certificate validation. + +**Connection direction:** The manager (this repo) is the mTLS client. The agent (linux-patch-api) is the mTLS server. The manager connects TO the agent and presents a client cert. The agent validates it. Agent-to-manager connections occur only for enrollment. + +--- + +## 2. Architecture + +### 2.1 Components + +``` +┌──────────────────┐ ┌──────────────────────┐ +│ pm-web │ │ linux-patch-api │ +│ (manager) │ GET /pki/crl.pem │ (agent) │ +│ │ ◄──────────────────────│ │ +│ ┌────────────┐ │ on enrollment + │ ┌────────────────┐ │ +│ │ pm-ca │ │ every 24h │ │ mTLS server │ │ +│ │ (signs │ │ │ │ (validates │ │ +│ │ certs + │ │ Bundle: CA chain + │ │ client certs │ │ +│ │ CRLs) │ │ client cert + │ │ + CRL check) │ │ +│ └────────────┘ │ client key + │ └────────────────┘ │ +│ │ CRL │ │ +└──────────────────┘ └──────────────────────┘ + │ │ + │ Health check (existing infra) │ + │ + CRL age on agent side │ + └───────────────────────────────────────────┘ +``` + +### 2.2 Mermaid flow diagram + +```mermaid +sequenceDiagram + participant Mgr as Manager (pm-web) + participant Agent as Agent (linux-patch-api) + participant CA as pm-ca + participant DB as certificates table + + Note over Mgr,CA: Initial Enrollment + Agent->>Mgr: POST /api/v1/enroll (with CSR) + Mgr->>CA: issue cert (sign with CA key) + CA->>DB: INSERT certificate (status=active) + CA-->>Mgr: leaf cert + Mgr->>CA: generate_crl() + CA->>DB: SELECT serials WHERE status=revoked + CA-->>Mgr: signed CRL + Mgr-->>Agent: PKI bundle (CA chain + cert + key + CRL) + Agent->>Agent: persist all 4 to /etc/linux-patch-api/certs/ + Agent->>Agent: verify CRL signature against pinned CA + + Note over Agent: Background refresh (every 24h) + Agent->>Mgr: GET /api/v1/pki/crl.pem + Mgr->>CA: generate_crl() 
(cached or regenerate) + CA-->>Mgr: CRL + Mgr-->>Agent: CRL + Agent->>Agent: verify signature, persist, swap in-memory map + + Note over Mgr,Agent: Normal operation (mTLS) + Mgr->>Agent: mTLS handshake (presents client cert) + Agent->>Agent: webpki verifies chain + Agent->>Agent: extract serial, check CRL + alt serial in CRL + Agent-->>Mgr: handshake rejected + else serial not in CRL + Agent-->>Mgr: handshake accepted + end + + Note over Mgr,CA: Operator revokes a cert + Mgr->>CA: revoke_cert(serial) + CA->>DB: UPDATE status=revoked + CA->>CA: generate_crl() (regenerate) + Note over Agent: Next 24h refresh picks up the revocation +``` + +### 2.3 Sub-CA handling + +**Both root and sub-CA modes are supported.** + +- **Root mode:** Manager is a self-signed CA. The CA cert in the bundle is also the trust anchor. CRL signature chains directly to it. +- **Sub-CA mode:** Manager is a sub-CA under an external root. The enrollment bundle includes the full chain: external root + manager's intermediate cert. The agent pins both. CRL signature chains up to the external root. + +**Required code change for sub-CA support:** Extend `PkiBundle` to include the full chain (new `ca_chain` field containing intermediate + root as a single PEM bundle). The existing single `ca_crt` field is preserved for backward compat (it becomes the leaf-most cert in the chain). + +The external root's own CRL is **out of scope** for this design. Documented assumption: the external root is long-lived and trusted by the agent's system trust store, or the operator accepts the risk of a long-lived external root. + +### 2.4 Cert lifetime + +**No change to current 1-year lifetime.** Revocation lag of up to 24h is acceptable given the 1-year cert validity. Shortening to 90 days was considered and deferred (Phase 1+ works correctly with either lifetime). + +--- + +## 3. 
Final Decisions (12 concerns walked through with Kelly) + +| # | Concern | Decision | +|---|---------|----------| +| 1 | Sub-CA enrollment bundle chain | Extend `PkiBundle` to include full chain (intermediate + root) as new `ca_chain` field. Single `ca_crt` field preserved for backward compat. | +| 2 | CRL generation library | rcgen 0.13 on manager (sign). x509-parser on agent (parse). webpki for chain validation in custom verifier. No new system deps. | +| 3 | Custom ClientCertVerifier | Use rustls `server::danger::ClientCertVerifier` trait. Wrapper struct delegates chain validation to `WebPkiClientVerifier`, adds serial lookup against parsed CRL. Only ~80 lines of custom code. | +| 4 | Stale-CRL failure mode | (c) Degraded. Continue serving with stale CRL, log warning, health check reports degraded. Missing CRL = degraded. Invalid signature = refuse to start (fail-closed). | +| 5 | CRL size at scale | Not a concern. Max 2500 clients/manager. CRLs KB-range. No index on (status, not_after) needed. | +| 6 | Health check backward compat | Missing `crl_status` field from older agent = degraded (not unhealthy). New agent with CRL still missing > 24h after enrollment = unhealthy. UI: host details page + list icon + dashboard widget. | +| 7 | Test coverage | Layers 1-3 (unit + property + integration) required for ship. Layer 4 (E2E docker-compose) incremental. Layer 5 (fuzz) added now. Property-based tests with `proptest` added now. | +| 8 | Deployment order | 6 PRs sequential. No feature flag (disk state is the implicit flag). All-at-once rollout for PR 2 (agent). | +| 9 | Documentation | Full scope. New `docs/security/revocation.md` as top-level doc. Mermaid diagrams in markdown (GitHub renders natively). | +| 10 | Phasing risk | Low. Pre-production stage, no live users to disrupt. Bounded window between PR 1 and PR 2. | +| 11 | mTLS direction | Confirmed. Manager = client, agent = server. Agent-to-manager only for enrollment and the 24h CRL fetch (`GET /api/v1/pki/crl.pem`).
| +| 12 | New host enrollment during CRL outage | Enrollment succeeds without CRL. Health check reports missing. Agent fetches CRL on next refresh cycle. | + +--- + +## 4. Phased Implementation (6 PRs) + +### PR 1 — Manager: CRL generation + endpoint + enrollment bundle + +**Repo:** linux-patch-manager (this) +**Scope:** +- Extend `PkiBundle` to include full chain (new `ca_chain` field) +- Add `generate_crl()` to `pm-ca/src/ca.rs` using rcgen 0.13 +- Add `GET /api/v1/pki/crl.pem` route in new `crates/pm-web/src/routes/pki.rs` +- Include CRL PEM in enrollment response +- Background task: regenerate CRL every 12h and on every `revoke_cert` call +- No DB schema changes + +**Testing:** +- Unit tests for `generate_crl()` (revoked serials present, non-revoked absent, expired excluded) +- Property tests (proptest) for CRL generation roundtrip +- Fuzz harness for CRL generation +- Integration test: `GET /pki/crl.pem` returns 200 + valid PEM + correct `Cache-Control` +- Integration test: enrollment bundle includes CRL + +**Backward compat:** Endpoint is dark until an agent is updated to consume it. Older agents ignore it. Zero impact on existing flows. 
+ +--- + +### PR 2 — Agent: CRL consumption + custom verifier + +**Repo:** linux-patch-api +**Scope:** +- New `src/auth/crl.rs` module: CRL load, signature verification, in-memory serial map (ArcSwap) +- New `src/auth/crl_refresh.rs`: background task fetching CRL every 24h from `GET {manager_url}/api/v1/pki/crl.pem` +- Extend `src/auth/mtls.rs`: replace direct `WebPkiClientVerifier` usage with `CrlClientCertVerifier` wrapper +- Persist CRL to `/etc/linux-patch-api/certs/crl.pem` +- Config additions: `crl_path`, `crl_refresh_interval`, `manager_url` + +**Custom verifier (sketch):** + +```rust +pub struct CrlClientCertVerifier { + inner: Arc<dyn ClientCertVerifier>, + crl: arc_swap::ArcSwap<...>, +} + +impl rustls::server::danger::ClientCertVerifier for CrlClientCertVerifier { + fn verify_client_cert( + &self, + end_entity: &CertificateDer<'_>, + intermediates: &[CertificateDer<'_>], + now: UnixTime, + ) -> Result<ClientCertVerified, rustls::Error> { + // Delegate chain validation to WebPKI (battle-tested) + self.inner.verify_client_cert(end_entity, intermediates, now)?; + + // Extract serial from the leaf cert + let serial = extract_serial(end_entity) + .map_err(|e| rustls::Error::General(format!("serial extract: {}", e)))?; + + // Check CRL (O(1) hash lookup) + let crl = self.crl.load(); + if crl.is_revoked(serial) { + return Err(rustls::Error::General(format!( + "cert serial {} is revoked", serial + ))); + } + + Ok(ClientCertVerified::assertion()) + } + + // Delegate remaining trait methods to self.inner + fn supported_verify_schemes(&self) -> Vec<SignatureScheme> { + self.inner.supported_verify_schemes() + } + + fn verify_tls12_signature(&self, ...) -> Result<...> { + self.inner.verify_tls12_signature(...) + } + + fn verify_tls13_signature(&self, ...) -> Result<...> { + self.inner.verify_tls13_signature(...) + } +} +``` + +**Backward compat:** If the CRL file is missing, fall back to `WebPkiClientVerifier` directly (current behavior), log a warning, and let the manager's health check report degraded. A CRL that fails signature verification is not a fallback case: the agent refuses to start (fail-closed, see section 5.3).
+ +**Testing:** +- Unit tests: CRL load (valid, malformed, missing, tampered, expired) +- Unit tests: custom verifier (valid cert accepted, revoked cert rejected, no false positives) +- Property tests (proptest): random certs + random CRLs, no false negs/pos +- Fuzz harness for CRL load and verifier +- Integration test: end-to-end mTLS (valid cert connects, revoked cert rejected) +- Integration test: stale CRL fallback to WebPKI (no connection rejection) + +--- + +### PR 3 — Manager: Health check schema + UI + +**Repo:** linux-patch-manager (this) +**Scope:** +- Extend health check response schema to include `crl_status` and `crl_age_seconds` fields (optional, backward compat) +- Add UI: CRL section in host details page +- Add hosts list icon (green/yellow/red) for CRL status +- Add dashboard widget: "hosts with degraded CRL: N" + +**Backward compat:** Older agents don't report these fields → UI shows "CRL not configured". No regression. + +--- + +### PR 4 — Agent: Health response includes CRL status + +**Repo:** linux-patch-api +**Scope:** +- Add `crl_status` and `crl_age_seconds` to the agent's health response payload +- Logic: `valid` if CRL loaded + signature good + not expired, `expired` if `nextUpdate` passed, `missing` if no CRL on disk, `invalid` if signature fails + +**Backward compat:** Field is additive. Manager treats missing field as "unknown" / "missing". + +--- + +### PR 5 — Manager: Health aggregation logic + +**Repo:** linux-patch-manager (this) +**Scope:** +- Aggregate per-host CRL health into the host's overall health +- Implement severity rules: invalid signature → unhealthy; missing > 24h on new agent → unhealthy; missing on old agent → degraded; > 25h old → degraded; otherwise healthy +- Add audit events: `CrlStaleDetected`, `CrlMissing`, `CrlInvalid` + +**Backward compat:** Logic only fires when PR 3 + PR 4 are deployed. Safe to merge ahead of those. 
+ +--- + +### PR 6 — E2E integration test harness + +**Repos:** both (new `tests/e2e/` directory in this repo, mirroring setup in agent repo) +**Scope:** +- docker-compose harness running both pm-web and linux-patch-api +- Test scenarios: + - Issue → enroll → connect (fresh agent connects successfully) + - Issue → enroll → revoke → refresh → connect (rejected) + - Issue → enroll → revoke → no refresh → connect (succeeds with stale CRL + warning) + - Manager down → connect (succeeds with stale CRL + degraded health) +- Independent CI for each repo; full E2E runs on main branch merges + +**Backward compat:** Test-only, no production impact. + +--- + +## 5. Failure Modes and Operational Behavior + +### 5.1 Stale CRL on agent + +**Scenario:** Agent's CRL has `nextUpdate` passed. Background refresh fails (manager unreachable). + +**Behavior:** +- Agent continues serving mTLS connections using the stale CRL +- Logs warning every refresh attempt +- Reports `crl_status=expired` and `crl_age_seconds` in health response +- Manager's health aggregation marks host as `degraded` +- Worst case: ~24h of accepting a cert that was revoked after the agent's CRL was generated +- The cert's `not_after` is still the hard backstop (1 year from issuance) + +### 5.2 Missing CRL on agent + +**Scenario:** New agent enrolls, but CRL generation fails on the manager. Or older agent predates CRL feature. + +**Behavior:** +- Agent starts with no CRL on disk +- Falls back to `WebPkiClientVerifier` (chain validation only, no CRL check) +- Logs warning, reports `crl_status=missing` +- Manager's health aggregation marks host as `degraded` +- If host is a newer agent: 24h after enrollment without CRL → escalates to `unhealthy` +- If host is an older agent: stays `degraded` indefinitely (feature gap, not a failure) + +### 5.3 Invalid CRL signature on agent + +**Scenario:** CRL file is corrupted, or the manager's CA key was compromised. 
+ +**Behavior:** +- Agent refuses to load the CRL +- **Refuses to start the mTLS server** (fail-closed here, because invalid signature is a security event) +- Logs critical error +- Reports `crl_status=invalid` in health response +- Operator must investigate: check manager's CA, re-fetch CRL manually, or restore from backup + +### 5.4 Manager unreachable during enrollment + +**Scenario:** New agent tries to enroll. Manager is down. + +**Behavior:** +- Enrollment fails (manager is required for cert issuance) +- Agent retries on its configured enrollment schedule +- Once manager is back, enrollment succeeds, agent receives cert + CA + CRL (if available) + +### 5.5 New host enrollment during CRL outage + +**Scenario:** Manager is up, cert issuance works, but CRL generation fails (e.g., DB issue during `generate_crl`). + +**Behavior:** +- Enrollment succeeds +- Agent receives cert + CA chain, but **no CRL** in the bundle +- Agent starts with no CRL, falls back to WebPKI +- Reports `crl_status=missing` +- Next 24h refresh attempts to fetch CRL from `/pki/crl.pem` +- If CRL generation is fixed by then, agent picks it up on next refresh +- If still failing, agent continues in degraded mode + +--- + +## 6. 
Acceptance Criteria + +### Phase 1 (Manager-side MVP) + +- [ ] `generate_crl()` produces a valid X.509 CRL signed by the same CA key that signs leaf certs +- [ ] CRL includes only certs where `status='revoked' AND not_after > NOW()` +- [ ] `GET /api/v1/pki/crl.pem` returns 200 + valid PEM + `Cache-Control: max-age=3600` +- [ ] Enrollment PKI bundle includes the CRL +- [ ] Enrollment bundle includes the full CA chain (new `ca_chain` field) +- [ ] Background task regenerates CRL every 12h +- [ ] `revoke_cert` triggers immediate CRL regeneration +- [ ] Unit tests, property tests, fuzz harness, integration tests all pass + +### Phase 2 (Agent-side consumption) + +- [ ] Agent fetches CRL on enrollment from enrollment bundle +- [ ] Agent persists CRL to `/etc/linux-patch-api/certs/crl.pem` +- [ ] Agent verifies CRL signature against pinned CA on load +- [ ] Agent uses `CrlClientCertVerifier` wrapper that delegates to WebPKI + adds CRL check +- [ ] Revoked cert is rejected at mTLS handshake with clear error +- [ ] Valid (non-revoked) cert is accepted +- [ ] Background task refreshes CRL every 24h (configurable) +- [ ] Missing CRL falls back to WebPKI (degraded mode, not fail-closed) +- [ ] Invalid CRL signature causes agent to refuse to start +- [ ] Unit tests, property tests, fuzz harness, integration tests all pass + +### Phase 3 (Health monitoring + UI) + +- [ ] Health response includes `crl_status` and `crl_age_seconds` +- [ ] Host details page shows CRL section (status, age, next update, last refresh) +- [ ] Hosts list shows CRL status icon (green/yellow/red) +- [ ] Dashboard widget shows count of hosts with degraded CRL +- [ ] Health aggregation: invalid CRL signature → unhealthy +- [ ] Health aggregation: new agent with CRL missing > 24h → unhealthy +- [ ] Health aggregation: old agent with CRL missing → degraded +- [ ] Health aggregation: CRL > 25h old → degraded +- [ ] Audit events: `CertRevoked`, `CrlGenerated`, `CrlFetched`, `CrlStaleDetected`, `CrlMissing`, `CrlInvalid` + +### Phase 4 
(E2E tests) + +- [ ] docker-compose harness runs both pm-web and linux-patch-api +- [ ] E2E test: issue → enroll → connect (succeeds) +- [ ] E2E test: issue → enroll → revoke → refresh → connect (rejected) +- [ ] E2E test: issue → enroll → revoke → no refresh → connect (succeeds with stale CRL) +- [ ] E2E test: manager down → connect (succeeds with stale CRL, degraded health) + +### Documentation + +- [ ] `docs/security/revocation.md` (NEW) — revocation policy and operational behavior +- [ ] `docs/architecture/pki.md` updated with CRL section + sub-CA section +- [ ] `docs/architecture/health-monitoring.md` updated with CRL health states +- [ ] `docs/architecture/agent-cert-flow.md` (NEW) — end-to-end flow with mermaid diagram +- [ ] `docs/api/REST_API.md` (or equivalent) updated with new endpoint +- [ ] `docs/operations/upgrade-guide.md` updated with rollout notes +- [ ] `docs/operations/crl-troubleshooting.md` (NEW) — common issues and diagnostics +- [ ] Inline code docs on all new public functions/structs +- [ ] `CHANGELOG.md` entry for the release that lands Phase 1 +- [ ] `linux-patch-api/config.example.toml` updated with new CRL config keys +- [ ] `linux-patch-manager/config.example.toml` updated with new CRL config keys + +--- + +## 7. Sign-off + +**All 12 concerns resolved.** Design is finalized. Implementation can begin. + +**Next action:** Start PR 1 (Manager: CRL generation + endpoint + enrollment bundle). + +The companion issue on linux-patch-api (#20) is filed and tracks the agent-side changes for PR 2 and PR 4. + +**Documented assumptions (must be confirmed before production deployment):** +1. The external root in sub-CA mode is long-lived and trusted. Its own CRL is not consulted. +2. 1-year cert lifetime is acceptable; revocation lag of up to 24h is the operational upper bound. +3. Operators accept that during a CRL refresh failure, revoked certs may be accepted for up to 24h (the cert's `not_after` is the hard backstop). +4. 
Max ~2500 clients per manager. If this changes, revisit CRL size and consider OCSP. \ No newline at end of file diff --git a/tasks/issue-7-pr1-todo.md b/tasks/issue-7-pr1-todo.md new file mode 100644 index 0000000..72d1d19 --- /dev/null +++ b/tasks/issue-7-pr1-todo.md @@ -0,0 +1,84 @@ +# PR 1: Manager-side CRL generation + endpoint + enrollment bundle + +**Branch:** `feat/7-crl-manager-side` +**Target issue:** https://github.com/Draco-Lunaris/Linux-Patch-Manager/issues/7 + +## Pre-implementation + +- [x] Read existing `pm-ca/src/ca.rs` to understand CA structure +- [x] Confirm rcgen 0.13 is the chosen library +- [x] Confirm sub-CA handling: extend `PkiBundle` with `ca_chain` field +- [x] Read design doc decisions table (concerns 1-12) + +## Code changes + +- [ ] **pm-ca/src/ca.rs**: Add `generate_crl(db: &PgPool) -> Result<String>` function + - Query `certificates` for `status='revoked' AND not_after > NOW()` + - Build CRL using rcgen 0.13 `CertificateRevocationList` + - Sign with CA private key + - Return PEM-encoded CRL + +- [ ] **pm-core/src/models.rs**: Extend `PkiBundle` with `ca_chain: String` field + - Concatenated PEM bundle of full chain (intermediate + root) for sub-CA mode + - For root mode, contains just the root cert (same as ca_crt) + +- [ ] **pm-web/src/routes/pki.rs** (NEW): `GET /api/v1/pki/crl.pem` route + - Public endpoint (no auth, CRLs are self-authenticating) + - `Cache-Control: max-age=3600` + - Returns latest cached CRL (regenerated on schedule or on revoke) + +- [ ] **pm-web/src/routes/enrollment.rs**: Include CRL in enrollment response + - Fetch current CRL via `generate_crl()` + - Add `crl_pem: String` to response + +- [ ] **pm-web/src/main.rs**: Wire up background CRL regeneration task + - Regenerate every 12 hours + - Hook into `revoke_cert` to trigger immediate regeneration + - Store latest CRL in shared state (ArcSwap or similar) + +- [ ] **crates/pm-web/src/state.rs** (or similar): Shared state for cached CRL + +## Tests + +- [ ] **Unit 
tests** in `pm-ca/src/ca.rs`: + - `generate_crl` produces valid X.509 CRL signed by test CA + - Revoked serials appear in CRL + - Non-revoked serials do not appear + - Expired certs (not_after < now) are excluded + - Empty table produces CRL with zero revoked entries + +- [ ] **Property tests** (proptest): + - Random revoked cert data: CRL is always parseable, signature always verifies + - Single-byte mutations to CRL fail signature verification + +- [ ] **Fuzz harness** (cargo-fuzz): + - Target: `pm_ca::ca::generate_crl` + - Target: `pm_ca::ca::parse_crl` (if we add parsing) + +- [ ] **Integration tests** in `pm-web/tests/`: + - `GET /api/v1/pki/crl.pem` returns 200 + valid PEM + correct Cache-Control + - Enrollment bundle includes CRL + - Enrollment bundle includes `ca_chain` field + +## Documentation + +- [ ] **docs/security/revocation.md** (NEW): Revocation policy and operational behavior +- [ ] **docs/api/REST_API.md** (or equivalent): Document `GET /api/v1/pki/crl.pem` +- [ ] **Inline doc comments** on new public functions/structs +- [ ] **CHANGELOG.md** entry for the release + +## Pre-PR checklist + +- [ ] `cargo build` clean +- [ ] `cargo test` all pass +- [ ] `cargo clippy --all-targets --all-features -- -D warnings` clean +- [ ] `cargo fmt --check` clean +- [ ] CI on GitHub passes + +## Out of scope for PR 1 (deferred to later PRs) + +- Agent-side consumption (PR 2, in linux-patch-api repo) +- Health check schema additions (PR 3) +- Agent health response field (PR 4) +- Health aggregation logic (PR 5) +- E2E test harness (PR 6)