Private
Public Access
1
0

feat: add auto-enrollment, cert validation, and crash loop fixes

- Auto-enrollment on startup when certs are missing/invalid and enrollment.manager_url is configured
- Certificate validation (existence, parse, expiry, key match, CA trust)
- --enroll exits after completion (no port conflict with systemd service)
- --renew-certs flag for manual cert renewal
- SO_REUSEADDR on TcpListener::bind (prevents Address already in use)
- Polling token persistence for enrollment resume after restart
- Exit code strategy (0=clean, 1=error, 2=enrollment in progress)
- HTTP 409 (host already exists) handling during enrollment
- Move 'Listening on' log after actual bind
- Increase RestartSec to 10s and add StartLimitBurst=5
- Postinst checks for certs and enrollment URL, prints guidance
- EnrollmentConfig.manager_url changed to Option<String>
- cert_renewal_threshold_days and polling_token config fields
- Updated SPEC.md and DEPLOYMENT_GUIDE.md with new workflow
- RCA (root cause analysis) document for the crash loop
- Version bumped to 1.2.0
This commit is contained in:
2026-05-29 10:44:42 -05:00
parent 48ec57581e
commit 1322598581
43 changed files with 1364 additions and 974 deletions

View File

@ -272,6 +272,14 @@ impl EnrollmentClient {
Ok(enrollment_response)
}
409 => {
// Host already exists - log a warning and return an error tagged `ENROLLMENT_CONFLICT`.
// The caller detects the conflict by matching that marker in the error message.
tracing::warn!(
"Host already registered with manager (HTTP 409) — will attempt to resume polling"
);
Err(anyhow!("ENROLLMENT_CONFLICT: Host already exists"))
}
429 => {
Err(anyhow!(
"Rate limited (HTTP 429) — enrollment requests limited to 1/minute per IP. Retry after 60 seconds."

View File

@ -3,6 +3,12 @@
//! Handles secure registration with the patch manager, including
//! identity extraction (machine-id, FQDN, IPs, OS details) and
//! mTLS enrollment via the manager API.
//!
//! Supports:
//! - Auto-enrollment on startup when certs are missing/invalid
//! - Manual enrollment via `--enroll <url>` CLI flag
//! - Resume polling from persisted token after restart
//! - HTTP 409 (host already exists) handling
pub mod client;
pub mod identity;
@ -20,17 +26,42 @@ pub use identity::{
get_primary_ip, get_route_source_ip, is_container_bridge, is_link_local,
};
/// Marker error for an enrollment conflict (HTTP 409).
///
/// Represents the situation where the manager reports that this host is
/// already registered. It carries no data; its `Display` text is a fixed,
/// human-readable message. Intended as a typed signal for callers that want
/// to skip registration and go straight to the polling phase.
#[derive(Debug)]
pub struct EnrollmentConflictError;

impl std::fmt::Display for EnrollmentConflictError {
    /// Writes the fixed conflict message to the formatter.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("Host already registered with manager")
    }
}

// No underlying cause to expose, so the default `source()` (None) is kept.
impl std::error::Error for EnrollmentConflictError {}
/// Run the full enrollment flow against the manager at the given URL.
///
/// # Phases
/// 1. **Registration** - POST machine identity to manager, receive polling token
/// - If HTTP 409 (host already exists) is returned, fail with an error: no polling token is saved at that point, so polling cannot be resumed
/// 2. **Polling** - Poll manager for approval with configurable interval/max attempts
/// - If `polling_token` is already in config, skip Phase 1 and resume polling
/// 3. **Provisioning** - Write PKI bundle to disk (certs/keys) and append manager IP to whitelist
///
/// # Arguments
/// * `manager_url` - The manager API base URL
/// * `config` - Mutable reference to AppConfig for polling token persistence
/// * `config_path` - Path to config file for persisting polling token
///
/// # Errors
/// Returns Err on registration failure, polling timeout, denial, user interruption,
/// PKI provisioning failure, or whitelist update failure.
pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Result<()> {
pub async fn run_enrollment(
manager_url: &str,
config: &mut super::AppConfig,
config_path: &str,
) -> Result<()> {
// Extract IP reporting overrides from enrollment config
let (report_interface, report_ip) = config
.enrollment
@ -40,13 +71,66 @@ pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Res
let client = EnrollmentClient::with_ip_overrides(manager_url, report_interface, report_ip);
// Phase 1: Registration
tracing::info!(
manager_url = manager_url,
"Starting enrollment - registration phase"
);
let response = client.register().await?;
tracing::info!("Registration successful - received polling token");
// Check for existing polling token to resume
let polling_token = if let Some(ref enrollment) = config.enrollment {
if !enrollment.polling_token.is_empty() {
tracing::info!(
"Resuming enrollment polling from saved token (host already registered)"
);
enrollment.polling_token.clone()
} else {
// No saved token — need to register first
String::new()
}
} else {
String::new()
};
// Phase 1: Registration (skip if we have a saved polling token)
let polling_token = if polling_token.is_empty() {
tracing::info!(
manager_url = manager_url,
"Starting enrollment - registration phase"
);
match client.register().await {
Ok(response) => {
tracing::info!("Registration successful - received polling token");
response.polling_token
}
Err(e) => {
let err_str = e.to_string();
if err_str.contains("ENROLLMENT_CONFLICT") {
// HTTP 409 - host already exists
// We don't have a polling token, so we can't resume polling
// Log a warning and return an error — the user needs to
// re-enroll or the manager needs to provide a new token
tracing::warn!(
"Host already registered but no polling token saved. \
Cannot resume polling. Re-run enrollment or check manager status."
);
return Err(anyhow::anyhow!(
"Host already registered with manager but no polling token available for resume. \
Please check the manager for your host status or re-enroll."
));
}
// For other errors, propagate directly
return Err(e);
}
}
} else {
tracing::info!("Using saved polling token to resume enrollment");
polling_token
};
// Persist polling token for resume after restart
if let Err(e) = config.save_polling_token(&polling_token, config_path) {
tracing::warn!(
error = %e,
"Failed to persist polling token — enrollment will not resume after restart"
);
} else {
tracing::debug!("Polling token persisted to config");
}
// Get polling config (use defaults if not set)
let interval = config
@ -67,7 +151,7 @@ pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Res
"Starting enrollment - polling phase"
);
let pki_bundle = client
.poll_for_approval(&response.polling_token, interval, max_attempts)
.poll_for_approval(&polling_token, interval, max_attempts)
.await?;
// Phase 3: PKI provisioning & whitelist update
@ -91,6 +175,16 @@ pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Res
provision::append_manager_to_whitelist(&manager_ip, config.whitelist_path()).await?;
tracing::info!(manager_ip = %manager_ip, "Manager IP appended to whitelist");
// Clear polling token after successful provisioning
if let Err(e) = config.clear_polling_token(config_path) {
tracing::warn!(
error = %e,
"Failed to clear polling token from config — will attempt re-registration on next start"
);
} else {
tracing::debug!("Polling token cleared from config");
}
tracing::info!("Enrollment complete - PKI and whitelist configured");
Ok(())
}