feat: add auto-enrollment, cert validation, and crash loop fixes
- Auto-enrollment on startup when certs are missing/invalid and enrollment.manager_url is configured
- Certificate validation (existence, parse, expiry, key match, CA trust)
- --enroll exits after completion (no port conflict with systemd service)
- --renew-certs flag for manual cert renewal
- SO_REUSEADDR on TcpListener::bind (prevents Address already in use)
- Polling token persistence for enrollment resume after restart
- Exit code strategy (0=clean, 1=error, 2=enrollment in progress)
- HTTP 409 (host already exists) handling during enrollment
- Move 'Listening on' log after actual bind
- Increase RestartSec to 10s and add StartLimitBurst=5
- Postinst checks for certs and enrollment URL, prints guidance
- EnrollmentConfig.manager_url changed to Option<String>
- cert_renewal_threshold_days and polling_token config fields
- Updated SPEC.md and DEPLOYMENT_GUIDE.md with new workflow
- RCA document for crash loop root cause analysis
- Version bumped to 1.2.0
This commit is contained in:
@ -272,6 +272,14 @@ impl EnrollmentClient {
|
||||
|
||||
Ok(enrollment_response)
|
||||
}
|
||||
409 => {
|
||||
// Host already exists - log a warning and return an error tagged `ENROLLMENT_CONFLICT`
|
||||
// The caller detects the `ENROLLMENT_CONFLICT` tag by matching on the error message text
|
||||
tracing::warn!(
|
||||
"Host already registered with manager (HTTP 409) — will attempt to resume polling"
|
||||
);
|
||||
Err(anyhow!("ENROLLMENT_CONFLICT: Host already exists"))
|
||||
}
|
||||
429 => {
|
||||
Err(anyhow!(
|
||||
"Rate limited (HTTP 429) — enrollment requests limited to 1/minute per IP. Retry after 60 seconds."
|
||||
|
||||
@ -3,6 +3,12 @@
|
||||
//! Handles secure registration with the patch manager, including
|
||||
//! identity extraction (machine-id, FQDN, IPs, OS details) and
|
||||
//! mTLS enrollment via the manager API.
|
||||
//!
|
||||
//! Supports:
|
||||
//! - Auto-enrollment on startup when certs are missing/invalid
|
||||
//! - Manual enrollment via `--enroll <url>` CLI flag
|
||||
//! - Resume polling from persisted token after restart
|
||||
//! - HTTP 409 (host already exists) handling
|
||||
|
||||
pub mod client;
|
||||
pub mod identity;
|
||||
@ -20,17 +26,42 @@ pub use identity::{
|
||||
get_primary_ip, get_route_source_ip, is_container_bridge, is_link_local,
|
||||
};
|
||||
|
||||
/// Error type for enrollment conflict (HTTP 409).
|
||||
/// Used to signal that the host is already registered and we should
|
||||
/// skip to the polling phase.
///
/// This is a unit struct: it carries no data beyond its type identity.
///
/// NOTE(review): in the hunks visible here the HTTP 409 path is reported via an
/// `ENROLLMENT_CONFLICT` message string rather than by constructing this type —
/// confirm whether this type is actually produced or matched anywhere.
|
||||
#[derive(Debug)]
|
||||
pub struct EnrollmentConflictError;
|
||||
|
||||
// User-facing rendering: always the same fixed message, since the type has no fields.
impl std::fmt::Display for EnrollmentConflictError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "Host already registered with manager")
|
||||
}
|
||||
}
|
||||
|
||||
// Empty impl: relies entirely on the default methods of `std::error::Error`,
// with `Debug` and `Display` provided above.
impl std::error::Error for EnrollmentConflictError {}
|
||||
|
||||
/// Run the full enrollment flow against the manager at the given URL.
|
||||
///
|
||||
/// # Phases
|
||||
/// 1. **Registration** - POST machine identity to manager, receive polling token
|
||||
/// - If HTTP 409 (host already exists) is returned and no polling token is saved, return an error (polling cannot be resumed)
|
||||
/// 2. **Polling** - Poll manager for approval with configurable interval/max attempts
|
||||
/// - If `polling_token` is already in config, skip Phase 1 and resume polling
|
||||
/// 3. **Provisioning** - Write PKI bundle to disk (certs/keys) and append manager IP to whitelist
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `manager_url` - The manager API base URL
|
||||
/// * `config` - Mutable reference to AppConfig for polling token persistence
|
||||
/// * `config_path` - Path to config file for persisting polling token
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns Err on registration failure, polling timeout, denial, user interruption,
|
||||
/// PKI provisioning failure, or whitelist update failure.
|
||||
pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Result<()> {
|
||||
pub async fn run_enrollment(
|
||||
manager_url: &str,
|
||||
config: &mut super::AppConfig,
|
||||
config_path: &str,
|
||||
) -> Result<()> {
|
||||
// Extract IP reporting overrides from enrollment config
|
||||
let (report_interface, report_ip) = config
|
||||
.enrollment
|
||||
@ -40,13 +71,66 @@ pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Res
|
||||
|
||||
let client = EnrollmentClient::with_ip_overrides(manager_url, report_interface, report_ip);
|
||||
|
||||
// Phase 1: Registration
|
||||
tracing::info!(
|
||||
manager_url = manager_url,
|
||||
"Starting enrollment - registration phase"
|
||||
);
|
||||
let response = client.register().await?;
|
||||
tracing::info!("Registration successful - received polling token");
|
||||
// Check for existing polling token to resume
|
||||
let polling_token = if let Some(ref enrollment) = config.enrollment {
|
||||
if !enrollment.polling_token.is_empty() {
|
||||
tracing::info!(
|
||||
"Resuming enrollment polling from saved token (host already registered)"
|
||||
);
|
||||
enrollment.polling_token.clone()
|
||||
} else {
|
||||
// No saved token — need to register first
|
||||
String::new()
|
||||
}
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
// Phase 1: Registration (skip if we have a saved polling token)
|
||||
let polling_token = if polling_token.is_empty() {
|
||||
tracing::info!(
|
||||
manager_url = manager_url,
|
||||
"Starting enrollment - registration phase"
|
||||
);
|
||||
match client.register().await {
|
||||
Ok(response) => {
|
||||
tracing::info!("Registration successful - received polling token");
|
||||
response.polling_token
|
||||
}
|
||||
Err(e) => {
|
||||
let err_str = e.to_string();
|
||||
if err_str.contains("ENROLLMENT_CONFLICT") {
|
||||
// HTTP 409 - host already exists
|
||||
// We don't have a polling token, so we can't resume polling
|
||||
// Log a warning and return an error — the user needs to
|
||||
// re-enroll or the manager needs to provide a new token
|
||||
tracing::warn!(
|
||||
"Host already registered but no polling token saved. \
|
||||
Cannot resume polling. Re-run enrollment or check manager status."
|
||||
);
|
||||
return Err(anyhow::anyhow!(
|
||||
"Host already registered with manager but no polling token available for resume. \
|
||||
Please check the manager for your host status or re-enroll."
|
||||
));
|
||||
}
|
||||
// For other errors, propagate directly
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
tracing::info!("Using saved polling token to resume enrollment");
|
||||
polling_token
|
||||
};
|
||||
|
||||
// Persist polling token for resume after restart
|
||||
if let Err(e) = config.save_polling_token(&polling_token, config_path) {
|
||||
tracing::warn!(
|
||||
error = %e,
|
||||
"Failed to persist polling token — enrollment will not resume after restart"
|
||||
);
|
||||
} else {
|
||||
tracing::debug!("Polling token persisted to config");
|
||||
}
|
||||
|
||||
// Get polling config (use defaults if not set)
|
||||
let interval = config
|
||||
@ -67,7 +151,7 @@ pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Res
|
||||
"Starting enrollment - polling phase"
|
||||
);
|
||||
let pki_bundle = client
|
||||
.poll_for_approval(&response.polling_token, interval, max_attempts)
|
||||
.poll_for_approval(&polling_token, interval, max_attempts)
|
||||
.await?;
|
||||
|
||||
// Phase 3: PKI provisioning & whitelist update
|
||||
@ -91,6 +175,16 @@ pub async fn run_enrollment(manager_url: &str, config: &super::AppConfig) -> Res
|
||||
provision::append_manager_to_whitelist(&manager_ip, config.whitelist_path()).await?;
|
||||
tracing::info!(manager_ip = %manager_ip, "Manager IP appended to whitelist");
|
||||
|
||||
// Clear polling token after successful provisioning
|
||||
if let Err(e) = config.clear_polling_token(config_path) {
|
||||
tracing::warn!(
|
||||
error = %e,
|
||||
"Failed to clear polling token from config — will attempt re-registration on next start"
|
||||
);
|
||||
} else {
|
||||
tracing::debug!("Polling token cleared from config");
|
||||
}
|
||||
|
||||
tracing::info!("Enrollment complete - PKI and whitelist configured");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user