Skip to content

Commit 9e714ab

Browse files
committed
fix: add wall-clock certificate expiry check to webhook TLS rotation
The rotation interval uses tokio's monotonic clock, but certificate validity uses wall-clock time. When these diverge (hibernation, VM migration, cgroup freezing), the certificate can expire before rotation. Add a periodic wall-clock check (every 5 minutes) that compares SystemTime::now() against the certificate's not_after field and triggers early rotation if the cert is within 4 hours of expiry. Fixes: #1174
1 parent 7486017 commit 9e714ab

2 files changed

Lines changed: 65 additions & 20 deletions

File tree

crates/stackable-webhook/src/tls/cert_resolver.rs

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::sync::Arc;
1+
use std::{sync::Arc, time::SystemTime};
22

33
use arc_swap::ArcSwap;
44
use snafu::{OptionExt, ResultExt, Snafu};
@@ -57,6 +57,9 @@ pub struct CertificateResolver {
5757
/// Using a [`ArcSwap`] (over e.g. [`tokio::sync::RwLock`]), so that we can easily
5858
/// (and performant) bridge between async write and sync read.
5959
current_certified_key: ArcSwap<CertifiedKey>,
60+
/// The wall-clock expiry time (`not_after`) of the current certificate.
61+
/// Used to detect clock drift between monotonic and wall-clock time.
62+
current_not_after: ArcSwap<SystemTime>,
6063
subject_alterative_dns_names: Arc<Vec<String>>,
6164

6265
certificate_tx: mpsc::Sender<Certificate>,
@@ -68,7 +71,7 @@ impl CertificateResolver {
6871
certificate_tx: mpsc::Sender<Certificate>,
6972
) -> Result<Self> {
7073
let subject_alterative_dns_names = Arc::new(subject_alterative_dns_names);
71-
let certified_key = Self::generate_new_certificate_inner(
74+
let (certified_key, not_after) = Self::generate_new_certificate_inner(
7275
subject_alterative_dns_names.clone(),
7376
&certificate_tx,
7477
)
@@ -77,20 +80,37 @@ impl CertificateResolver {
7780
Ok(Self {
7881
subject_alterative_dns_names,
7982
current_certified_key: ArcSwap::new(certified_key),
83+
current_not_after: ArcSwap::new(Arc::new(not_after)),
8084
certificate_tx,
8185
})
8286
}
8387

8488
pub async fn rotate_certificate(&self) -> Result<()> {
85-
let certified_key = self.generate_new_certificate().await?;
89+
let (certified_key, not_after) = self.generate_new_certificate().await?;
8690

8791
// TODO: Sign the new cert somehow with the old cert. See https://github.com/stackabletech/decisions/issues/56
8892
self.current_certified_key.store(certified_key);
93+
self.current_not_after.store(Arc::new(not_after));
8994

9095
Ok(())
9196
}
9297

93-
async fn generate_new_certificate(&self) -> Result<Arc<CertifiedKey>> {
98+
/// Returns `true` if the current certificate is expired or will expire
99+
/// within the given `buffer` duration according to wall-clock time.
100+
///
101+
/// This catches cases where the monotonic timer (used by `tokio::time`)
102+
/// has drifted from wall-clock time, e.g. due to system hibernation.
103+
pub fn needs_rotation(&self, buffer: std::time::Duration) -> bool {
104+
let not_after = **self.current_not_after.load();
105+
// If subtraction underflows (buffer > time since epoch), fall back to
106+
// UNIX_EPOCH so that the comparison always triggers rotation.
107+
let deadline = not_after
108+
.checked_sub(buffer)
109+
.unwrap_or(SystemTime::UNIX_EPOCH);
110+
SystemTime::now() >= deadline
111+
}
112+
113+
async fn generate_new_certificate(&self) -> Result<(Arc<CertifiedKey>, SystemTime)> {
94114
let subject_alterative_dns_names = self.subject_alterative_dns_names.clone();
95115
Self::generate_new_certificate_inner(subject_alterative_dns_names, &self.certificate_tx)
96116
.await
@@ -106,7 +126,7 @@ impl CertificateResolver {
106126
async fn generate_new_certificate_inner(
107127
subject_alterative_dns_names: Arc<Vec<String>>,
108128
certificate_tx: &mpsc::Sender<Certificate>,
109-
) -> Result<Arc<CertifiedKey>> {
129+
) -> Result<(Arc<CertifiedKey>, SystemTime)> {
110130
// The certificate generations can take a while, so we use `spawn_blocking`
111131
let (cert, certified_key) = tokio::task::spawn_blocking(move || {
112132
let tls_provider =
@@ -144,12 +164,14 @@ impl CertificateResolver {
144164
.await
145165
.context(TokioSpawnBlockingSnafu)??;
146166

167+
let not_after = cert.tbs_certificate.validity.not_after.to_system_time();
168+
147169
certificate_tx
148170
.send(cert)
149171
.await
150172
.map_err(|_err| CertificateResolverError::SendCertificateToChannel)?;
151173

152-
Ok(certified_key)
174+
Ok((certified_key, not_after))
153175
}
154176
}
155177

crates/stackable-webhook/src/tls/mod.rs

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,25 @@ use crate::{
3939
mod cert_resolver;
4040

4141
pub const WEBHOOK_CA_LIFETIME: Duration = Duration::from_hours_unchecked(24);
42-
pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = Duration::from_hours_unchecked(24);
43-
pub const WEBHOOK_CERTIFICATE_ROTATION_INTERVAL: Duration = Duration::from_hours_unchecked(20);
42+
43+
/// The wall-clock lifetime of generated webhook certificates. If this is ever
44+
/// reduced, ensure it stays well above [`CERTIFICATE_ROTATION_CHECK_INTERVAL`]
45+
/// (currently 5 minutes), otherwise the certificate could expire between checks.
46+
const WEBHOOK_CERTIFICATE_LIFETIME_HOURS: u64 = 24;
47+
pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration =
48+
Duration::from_hours_unchecked(WEBHOOK_CERTIFICATE_LIFETIME_HOURS);
49+
50+
/// How often to check whether the certificate needs rotation. This is
51+
/// intentionally independent of the certificate lifetime — it controls how
52+
/// quickly we detect wall-clock drift (from hibernation, VM migration, etc.),
53+
/// not how long the certificate lives.
54+
const CERTIFICATE_ROTATION_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5);
55+
56+
/// Rotate the certificate when less than 1/6 of its lifetime remains
57+
/// (4 hours for the current 24h lifetime). Derived from
58+
/// [`WEBHOOK_CERTIFICATE_LIFETIME`] so it scales if the lifetime changes.
59+
const CERTIFICATE_EXPIRY_BUFFER: Duration =
60+
Duration::from_minutes_unchecked(WEBHOOK_CERTIFICATE_LIFETIME_HOURS * 60 / 6);
4461

4562
pub type Result<T, E = TlsServerError> = std::result::Result<T, E>;
4663

@@ -153,8 +170,12 @@ impl TlsServer {
153170
router,
154171
} = self;
155172

156-
let start = tokio::time::Instant::now() + *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL;
157-
let mut interval = tokio::time::interval_at(start, *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL);
173+
// Periodically check whether the certificate needs rotation based on
174+
// wall-clock time. This avoids the monotonic vs wall-clock drift problem
175+
// that can occur during hibernation, VM migration, or cgroup freezing.
176+
let check_start = tokio::time::Instant::now() + *CERTIFICATE_ROTATION_CHECK_INTERVAL;
177+
let mut rotation_check =
178+
tokio::time::interval_at(check_start, *CERTIFICATE_ROTATION_CHECK_INTERVAL);
158179

159180
let tls_acceptor = TlsAcceptor::from(Arc::new(config));
160181
let tcp_listener = TcpListener::bind(socket_addr)
@@ -183,11 +204,10 @@ impl TlsServer {
183204
loop {
184205
let tls_acceptor = tls_acceptor.clone();
185206

186-
// Wait for either a new TCP connection or the certificate rotation interval tick
187207
tokio::select! {
188208
// We opt for a biased execution of arms to make sure we always check if a
189-
// shutdown signal was received or the certificate needs rotation based on the
190-
// interval. This ensures, we always use a valid certificate for the TLS connection.
209+
// shutdown signal was received or the certificate needs rotation before
210+
// accepting new connections.
191211
biased;
192212

193213
// Once a shutdown signal is received (this future becomes `Poll::Ready`), break out
@@ -198,13 +218,16 @@ impl TlsServer {
198218
break;
199219
}
200220

201-
// This is cancellation-safe. If this branch is cancelled, the tick is NOT consumed.
202-
// As such, we will not miss rotating the certificate.
203-
_ = interval.tick() => {
204-
cert_resolver
205-
.rotate_certificate()
206-
.await
207-
.context(RotateCertificateSnafu)?
221+
// Check wall-clock time to decide if the certificate needs rotation.
222+
// This is cancellation-safe: if cancelled, the tick is NOT consumed.
223+
_ = rotation_check.tick() => {
224+
if cert_resolver.needs_rotation(*CERTIFICATE_EXPIRY_BUFFER) {
225+
tracing::info!("certificate approaching expiry, rotating");
226+
cert_resolver
227+
.rotate_certificate()
228+
.await
229+
.context(RotateCertificateSnafu)?;
230+
}
208231
}
209232

210233
// This is cancellation-safe. If cancelled, no new connections are accepted.

0 commit comments

Comments
 (0)