Skip to content

Commit 168c58d

Browse files
committed
fix: add wall-clock certificate expiry check to webhook TLS rotation
The rotation interval uses tokio's monotonic clock, but certificate validity uses wall-clock time. When these diverge (hibernation, VM migration, cgroup freezing), the certificate can expire before rotation. Add a periodic wall-clock check (every 5 minutes) that compares SystemTime::now() against the certificate's not_after field and triggers early rotation if the cert is within 4 hours of expiry. Fixes: #1174
1 parent 7486017 commit 168c58d

2 files changed

Lines changed: 66 additions & 6 deletions

File tree

crates/stackable-webhook/src/tls/cert_resolver.rs

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
use std::sync::Arc;
1+
use std::{
2+
sync::Arc,
3+
time::SystemTime,
4+
};
25

36
use arc_swap::ArcSwap;
47
use snafu::{OptionExt, ResultExt, Snafu};
@@ -57,6 +60,9 @@ pub struct CertificateResolver {
5760
/// Using a [`ArcSwap`] (over e.g. [`tokio::sync::RwLock`]), so that we can easily
5861
/// (and performant) bridge between async write and sync read.
5962
current_certified_key: ArcSwap<CertifiedKey>,
63+
/// The wall-clock expiry time (`not_after`) of the current certificate.
64+
/// Used to detect clock drift between monotonic and wall-clock time.
65+
current_not_after: ArcSwap<SystemTime>,
6066
subject_alterative_dns_names: Arc<Vec<String>>,
6167

6268
certificate_tx: mpsc::Sender<Certificate>,
@@ -68,7 +74,7 @@ impl CertificateResolver {
6874
certificate_tx: mpsc::Sender<Certificate>,
6975
) -> Result<Self> {
7076
let subject_alterative_dns_names = Arc::new(subject_alterative_dns_names);
71-
let certified_key = Self::generate_new_certificate_inner(
77+
let (certified_key, not_after) = Self::generate_new_certificate_inner(
7278
subject_alterative_dns_names.clone(),
7379
&certificate_tx,
7480
)
@@ -77,20 +83,37 @@ impl CertificateResolver {
7783
Ok(Self {
7884
subject_alterative_dns_names,
7985
current_certified_key: ArcSwap::new(certified_key),
86+
current_not_after: ArcSwap::new(Arc::new(not_after)),
8087
certificate_tx,
8188
})
8289
}
8390

8491
pub async fn rotate_certificate(&self) -> Result<()> {
85-
let certified_key = self.generate_new_certificate().await?;
92+
let (certified_key, not_after) = self.generate_new_certificate().await?;
8693

8794
// TODO: Sign the new cert somehow with the old cert. See https://github.com/stackabletech/decisions/issues/56
8895
self.current_certified_key.store(certified_key);
96+
self.current_not_after.store(Arc::new(not_after));
8997

9098
Ok(())
9199
}
92100

93-
async fn generate_new_certificate(&self) -> Result<Arc<CertifiedKey>> {
101+
/// Returns `true` if the current certificate is expired or will expire
102+
/// within the given `buffer` duration according to wall-clock time.
103+
///
104+
/// This catches cases where the monotonic timer (used by `tokio::time`)
105+
/// has drifted from wall-clock time, e.g. due to system hibernation.
106+
pub fn needs_rotation(&self, buffer: std::time::Duration) -> bool {
107+
let not_after = **self.current_not_after.load();
108+
// If subtraction underflows (buffer > time since epoch), fall back to
109+
// UNIX_EPOCH so that the comparison always triggers rotation.
110+
let deadline = not_after
111+
.checked_sub(buffer)
112+
.unwrap_or(SystemTime::UNIX_EPOCH);
113+
SystemTime::now() >= deadline
114+
}
115+
116+
async fn generate_new_certificate(&self) -> Result<(Arc<CertifiedKey>, SystemTime)> {
94117
let subject_alterative_dns_names = self.subject_alterative_dns_names.clone();
95118
Self::generate_new_certificate_inner(subject_alterative_dns_names, &self.certificate_tx)
96119
.await
@@ -106,7 +129,7 @@ impl CertificateResolver {
106129
async fn generate_new_certificate_inner(
107130
subject_alterative_dns_names: Arc<Vec<String>>,
108131
certificate_tx: &mpsc::Sender<Certificate>,
109-
) -> Result<Arc<CertifiedKey>> {
132+
) -> Result<(Arc<CertifiedKey>, SystemTime)> {
110133
// The certificate generations can take a while, so we use `spawn_blocking`
111134
let (cert, certified_key) = tokio::task::spawn_blocking(move || {
112135
let tls_provider =
@@ -144,12 +167,18 @@ impl CertificateResolver {
144167
.await
145168
.context(TokioSpawnBlockingSnafu)??;
146169

170+
let not_after = cert
171+
.tbs_certificate
172+
.validity
173+
.not_after
174+
.to_system_time();
175+
147176
certificate_tx
148177
.send(cert)
149178
.await
150179
.map_err(|_err| CertificateResolverError::SendCertificateToChannel)?;
151180

152-
Ok(certified_key)
181+
Ok((certified_key, not_after))
153182
}
154183
}
155184

crates/stackable-webhook/src/tls/mod.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ pub const WEBHOOK_CA_LIFETIME: Duration = Duration::from_hours_unchecked(24);
4242
pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = Duration::from_hours_unchecked(24);
4343
pub const WEBHOOK_CERTIFICATE_ROTATION_INTERVAL: Duration = Duration::from_hours_unchecked(20);
4444

45+
/// How often to check wall-clock time for certificate expiry (5 minutes).
46+
/// This catches clock drift from system hibernation, VM migration, etc.
47+
const WALL_CLOCK_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5);
48+
49+
/// Rotate the certificate when it is within this buffer of expiry according
50+
/// to wall-clock time (4 hours before the 24h certificate expires).
51+
const WALL_CLOCK_EXPIRY_BUFFER: Duration = Duration::from_hours_unchecked(4);
52+
4553
pub type Result<T, E = TlsServerError> = std::result::Result<T, E>;
4654

4755
#[derive(Debug, Snafu)]
@@ -156,6 +164,14 @@ impl TlsServer {
156164
let start = tokio::time::Instant::now() + *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL;
157165
let mut interval = tokio::time::interval_at(start, *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL);
158166

167+
// A separate, shorter interval to check wall-clock time against the
168+
// certificate's not_after. This catches monotonic/wall-clock drift
169+
// caused by hibernation, VM migration, cgroup freezing, etc.
170+
let wall_clock_check_start =
171+
tokio::time::Instant::now() + *WALL_CLOCK_CHECK_INTERVAL;
172+
let mut wall_clock_check =
173+
tokio::time::interval_at(wall_clock_check_start, *WALL_CLOCK_CHECK_INTERVAL);
174+
159175
let tls_acceptor = TlsAcceptor::from(Arc::new(config));
160176
let tcp_listener = TcpListener::bind(socket_addr)
161177
.await
@@ -207,6 +223,21 @@ impl TlsServer {
207223
.context(RotateCertificateSnafu)?
208224
}
209225

226+
// Wall-clock check: detect if the certificate is near expiry
227+
// even though the monotonic rotation interval hasn't fired yet.
228+
// This handles clock drift from hibernation, VM migration, etc.
229+
_ = wall_clock_check.tick() => {
230+
if cert_resolver.needs_rotation(*WALL_CLOCK_EXPIRY_BUFFER) {
231+
tracing::info!("wall-clock check detected certificate near expiry, rotating early");
232+
cert_resolver
233+
.rotate_certificate()
234+
.await
235+
.context(RotateCertificateSnafu)?;
236+
// Reset the monotonic interval so we don't double-rotate
237+
interval.reset();
238+
}
239+
}
240+
210241
// This is cancellation-safe. If cancelled, no new connections are accepted.
211242
tcp_connection = tcp_listener.accept() => {
212243
let (tcp_stream, remote_addr) = match tcp_connection {

0 commit comments

Comments
 (0)