From b4cf1eed600796688b48fb0e48190e50ab3a3120 Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 12:57:09 +0200 Subject: [PATCH 01/11] feat(cli): add daemon-tls feature flag and dependencies Introduces a new optional `daemon-tls` Cargo feature on `rsigma-cli` that pulls in `rustls` (with the `aws-lc-rs` provider), `tokio-rustls`, `rustls-pemfile`, `rustls-pki-types`, `x509-parser`, and `hyper`/ `hyper-util` for the upcoming server-side TLS termination support for the daemon HTTP REST, OTLP/HTTP, and OTLP/gRPC endpoints. Dev-only `rcgen` is added so the planned `cli_daemon_tls.rs` integration test can mint a self-signed CA on the fly. Implements part of #128. --- Cargo.lock | 165 ++++++++++++++++++++++++++++++++++- crates/rsigma-cli/Cargo.toml | 26 ++++++ 2 files changed, 187 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e7ba387..0198efc6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -143,6 +143,45 @@ dependencies = [ "rustversion", ] +[[package]] +name = "asn1-rs" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f43a50ac4fdca5df8e885c21b835997f0a1cdee65494a6847694a98652d9d8" +dependencies = [ + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc", + "nom 7.1.3", + "num-traits", + "rusticata-macros", + "thiserror 2.0.18", + "time", +] + +[[package]] +name = "asn1-rs-derive" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3109e49b1e4909e9db6515a30c633684d68cdeaa252f215214cb4fa1a5bfee2c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "assert-json-diff" version = "2.0.2" @@ -272,6 +311,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0ec6fb3fe69024a75fa7e1bfb48aa6cf59706a101658ea01bfd33b2b248a038f" dependencies = [ "aws-lc-sys", + "untrusted 0.7.1", "zeroize", ] @@ -363,7 +403,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" dependencies = [ - "bit-vec", + "bit-vec 0.8.0", ] [[package]] @@ -372,6 +412,15 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bit-vec" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71798fca2c1fe1086445a7258a4bc81e6e49dcd24c8d0dd9a1e57395b603f51" +dependencies = [ + "serde", +] + [[package]] name = "bitflags" version = "1.3.2" @@ -1028,6 +1077,20 @@ dependencies = [ "zeroize", ] +[[package]] +name = "der-parser" +version = "10.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07da5016415d5a3c4dd39b11ed26f915f52fc4e0dc197d87908bc916e51bc1a6" +dependencies = [ + "asn1-rs", + "displaydoc", + "nom 7.1.3", + "num-bigint", + "num-traits", + "rusticata-macros", +] + [[package]] name = "deranged" version = "0.5.8" @@ -2798,6 +2861,15 @@ dependencies = [ "objc2-core-foundation", ] +[[package]] +name = "oid-registry" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12f40cff3dde1b6087cc5d5f5d4d65712f34016a03ed60e9c08dcc392736b5b7" +dependencies = [ + "asn1-rs", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -2937,6 +3009,16 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5a797f0e07bdf071d15742978fc3128ec6c22891c31a3a931513263904c982a" +[[package]] +name = "pem" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" +dependencies = [ + "base64", + "serde_core", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -3228,7 +3310,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" dependencies = [ "bit-set", - "bit-vec", + "bit-vec 0.8.0", "bitflags 2.11.1", "num-traits", "rand 0.9.4", @@ -3499,6 +3581,20 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "rcgen" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57f6d249aad744e274e682777a50283a225a32705394ee6d5fcc01efa25e4055" +dependencies = [ + "aws-lc-rs", + "pem", + "rustls-pki-types", + "time", + "x509-parser", + "yasna", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -3693,7 +3789,7 @@ dependencies = [ "cfg-if", "getrandom 0.2.17", "libc", - "untrusted", + "untrusted 0.9.0", "windows-sys 0.52.0", ] @@ -3742,6 +3838,8 @@ dependencies = [ "flate2", "futures", "humantime", + "hyper", + "hyper-util", "insta", "jaq-core", "jaq-json", @@ -3752,11 +3850,15 @@ dependencies = [ "predicates", "prometheus", "prost", + "rcgen", "rsigma-convert", "rsigma-eval", "rsigma-parser", "rsigma-runtime", "rusqlite", + "rustls", + "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_json_path", @@ -3765,13 +3867,16 @@ dependencies = [ "testcontainers-modules", "time", "tokio", + "tokio-rustls", "tokio-stream", "tonic", "tower-http", + "tower-service", "tracing", "tracing-subscriber", "ureq", "wiremock", + "x509-parser", "yaml_serde", "yamlpatch", "yamlpath", @@ -3935,6 +4040,15 @@ dependencies = [ "semver", ] +[[package]] +name = "rusticata-macros" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom 7.1.3", +] + [[package]] name = "rustix" version = 
"1.1.4" @@ -3976,6 +4090,15 @@ dependencies = [ "security-framework", ] +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "rustls-pki-types" version = "1.14.1" @@ -4022,7 +4145,7 @@ dependencies = [ "aws-lc-rs", "ring", "rustls-pki-types", - "untrusted", + "untrusted 0.9.0", ] [[package]] @@ -5225,6 +5348,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + [[package]] name = "untrusted" version = "0.9.0" @@ -5955,6 +6084,24 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "x509-parser" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d43b0f71ce057da06bc0851b23ee24f3f86190b07203dd8f567d0b706a185202" +dependencies = [ + "asn1-rs", + "aws-lc-rs", + "data-encoding", + "der-parser", + "lazy_static", + "nom 7.1.3", + "oid-registry", + "rusticata-macros", + "thiserror 2.0.18", + "time", +] + [[package]] name = "xattr" version = "1.6.1" @@ -6009,6 +6156,16 @@ dependencies = [ "tree-sitter-yaml", ] +[[package]] +name = "yasna" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5f6765e852b9b4dc8e2a76843e4d64d1cea8e79bcde0b6901aea8e7c7f08282" +dependencies = [ + "bit-vec 0.9.1", + "time", +] + [[package]] name = "yoke" version = "0.8.2" diff --git a/crates/rsigma-cli/Cargo.toml b/crates/rsigma-cli/Cargo.toml index 
896dd4af..a48b5635 100644 --- a/crates/rsigma-cli/Cargo.toml +++ b/crates/rsigma-cli/Cargo.toml @@ -14,6 +14,17 @@ default = ["daemon"] daemon = ["rsigma-runtime", "tokio", "axum", "async-trait", "prometheus", "notify", "rusqlite", "tower-http"] daemon-nats = ["daemon", "rsigma-runtime/nats", "async-nats", "tokio-stream", "time"] daemon-otlp = ["daemon", "rsigma-runtime/otlp", "prost", "tonic", "flate2", "tokio-stream"] +daemon-tls = [ + "daemon", + "dep:rustls", + "dep:tokio-rustls", + "dep:rustls-pemfile", + "dep:rustls-pki-types", + "dep:x509-parser", + "dep:hyper", + "dep:hyper-util", + "dep:tower-service", +] logfmt = ["rsigma-runtime/logfmt"] cef = ["rsigma-runtime/cef"] evtx = ["rsigma-runtime/evtx"] @@ -58,6 +69,16 @@ prost = { version = "0.14", optional = true } tonic = { version = "0.14", features = ["gzip"], optional = true } flate2 = { version = "1", optional = true } +# daemon-tls dependencies +rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs", "std", "tls12", "logging"], optional = true } +tokio-rustls = { version = "0.26", default-features = false, features = ["aws_lc_rs", "tls12", "logging"], optional = true } +rustls-pemfile = { version = "2", optional = true } +rustls-pki-types = { version = "1", optional = true } +x509-parser = { version = "0.18", optional = true } +hyper = { version = "1", features = ["server", "http1", "http2"], optional = true } +hyper-util = { version = "0.1", features = ["server-auto", "tokio", "service"], optional = true } +tower-service = { version = "0.3", optional = true } + [dev-dependencies] assert_cmd = "2.2.2" predicates = "3.1.4" @@ -75,3 +96,8 @@ opentelemetry-proto = { version = "0.32", default-features = false, features = [ prost = "0.14" wiremock = "0.6" flate2 = "1" +rcgen = { version = "0.14", default-features = false, features = ["aws_lc_rs", "pem"] } +rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs", "std", "tls12"] } +rustls-pemfile = "2" 
+rustls-pki-types = "1" +tokio-rustls = { version = "0.26", default-features = false, features = ["aws_lc_rs", "tls12"] } From 31b038833bc6746d746f59f64de46a4d2f234eaf Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 13:05:49 +0200 Subject: [PATCH 02/11] feat(daemon): terminate TLS on the API listener (HTTP, OTLP, gRPC) Adds server-side TLS termination on the daemon's `--api-addr` listener for the Axum HTTP REST API, the Prometheus `/metrics` endpoint, and both OTLP/HTTP and OTLP/gRPC transports. All three protocols already share one socket; this change wraps that socket in `tokio-rustls` (with the `aws-lc-rs` provider) and unifies serving through `axum::serve` plus `tonic::service::Routes::into_axum_router`, so the same router dispatches both `/v1/logs` over HTTP/1.1 + HTTP/2 and gRPC via ALPN. New flags on `engine daemon` (all gated behind the `daemon-tls` feature): - `--tls-cert` / `--tls-key` to terminate TLS in-process. - `--tls-client-ca` to enable mTLS verification of inbound clients. - `--tls-min-version` (`1.2` or `1.3`, default `1.3`). - `--tls-key-password` / `RSIGMA_TLS_KEY_PASSWORD`, reserved for a future release; currently rejected with a clear error pointing at `openssl rsa` to decrypt offline. - `--allow-plaintext` to keep the historical behavior on non-loopback addresses. Loopback (`127.0.0.0/8`, `::1`) always allows plaintext. Public binds without TLS or this flag now refuse to start. Hot-reload of the certificate and key is wired into the existing SIGHUP path via `Arc<ArcSwap<ServerConfig>>`, so cert rotation takes effect on the next handshake without dropping inflight connections. Observability gains two Prometheus metrics: - `rsigma_tls_certificate_expiry_seconds` (seconds until `not_after`) - `rsigma_tls_active_connections` The daemon logs a single WARN at startup and after every reload if the active cert expires within 30 days. Implements #128.
--- crates/rsigma-cli/src/commands/daemon.rs | 149 ++++++++ crates/rsigma-cli/src/daemon/metrics.rs | 32 ++ crates/rsigma-cli/src/daemon/mod.rs | 2 + crates/rsigma-cli/src/daemon/reload.rs | 30 +- crates/rsigma-cli/src/daemon/server.rs | 144 +++++-- crates/rsigma-cli/src/daemon/tls.rs | 454 +++++++++++++++++++++++ 6 files changed, 789 insertions(+), 22 deletions(-) create mode 100644 crates/rsigma-cli/src/daemon/tls.rs diff --git a/crates/rsigma-cli/src/commands/daemon.rs b/crates/rsigma-cli/src/commands/daemon.rs index 8d160fc3..f57816b7 100644 --- a/crates/rsigma-cli/src/commands/daemon.rs +++ b/crates/rsigma-cli/src/commands/daemon.rs @@ -272,6 +272,66 @@ pub(crate) struct DaemonArgs { /// error. #[arg(long = "source", value_name = "FILE_OR_DIR")] pub sources: Vec, + + // --------------------------------------------------------------- + // TLS (requires the `daemon-tls` build feature) + // --------------------------------------------------------------- + /// PEM-encoded TLS certificate (chain) for the API listener. + /// + /// When set together with `--tls-key`, the daemon terminates TLS + /// for the HTTP REST API, the Prometheus `/metrics` endpoint, and + /// (with `daemon-otlp`) both OTLP/HTTP and OTLP/gRPC on the same + /// `--api-addr`. The leaf certificate and any intermediates may be + /// concatenated in a single PEM file. + /// + /// Hot-reloaded on SIGHUP without dropping inflight connections. + #[cfg(feature = "daemon-tls")] + #[arg(long = "tls-cert", value_name = "PATH", requires = "tls_key")] + pub tls_cert: Option, + + /// PEM-encoded TLS private key for the API listener. + /// + /// PKCS#8, PKCS#1 (RSA), and SEC1 (EC) formats are accepted. + /// Encrypted keys are not supported yet; decrypt with + /// `openssl rsa -in key.pem -out key-decrypted.pem` first. + #[cfg(feature = "daemon-tls")] + #[arg(long = "tls-key", value_name = "PATH", requires = "tls_cert")] + pub tls_key: Option, + + /// Password for an encrypted `--tls-key`. 
Currently rejected at + /// startup; reserved for a future release to keep the flag stable. + #[cfg(feature = "daemon-tls")] + #[arg(long = "tls-key-password", env = "RSIGMA_TLS_KEY_PASSWORD")] + pub tls_key_password: Option, + + /// PEM bundle of trusted CA certificates used to verify inbound + /// client certificates (mutual TLS). + /// + /// When set, clients must present a certificate signed by one of + /// the listed CAs or the TLS handshake is rejected with + /// `bad certificate`. Useful for agent-to-daemon pinning. + #[cfg(feature = "daemon-tls")] + #[arg(long = "tls-client-ca", value_name = "PATH", requires = "tls_cert")] + pub tls_client_ca: Option, + + /// Minimum TLS protocol version accepted by the server. + /// + /// Default is `1.3`. Use `1.2` only for compatibility with legacy + /// agents that cannot negotiate TLS 1.3. + #[cfg(feature = "daemon-tls")] + #[arg(long = "tls-min-version", value_name = "VERSION", default_value = "1.3")] + pub tls_min_version: String, + + /// Allow the daemon to bind a non-loopback `--api-addr` without TLS. + /// + /// By default the daemon refuses to start on a public address + /// (`0.0.0.0`, `::`, or any non-loopback IP) unless either + /// `--tls-cert`/`--tls-key` is supplied or this flag is set. + /// Loopback (`127.0.0.0/8`, `::1`) is always allowed in plaintext + /// to keep local development friction-free. 
+ #[cfg(feature = "daemon-tls")] + #[arg(long = "allow-plaintext")] + pub allow_plaintext: bool, } /// Helper struct grouping NATS connection / auth flags so `cmd_daemon` does @@ -348,6 +408,18 @@ pub(crate) fn cmd_daemon(args: DaemonArgs) { cross_rule_ac, enrichers, sources: source_paths, + #[cfg(feature = "daemon-tls")] + tls_cert, + #[cfg(feature = "daemon-tls")] + tls_key, + #[cfg(feature = "daemon-tls")] + tls_key_password, + #[cfg(feature = "daemon-tls")] + tls_client_ca, + #[cfg(feature = "daemon-tls")] + tls_min_version, + #[cfg(feature = "daemon-tls")] + allow_plaintext, } = args; #[cfg(feature = "daemon-nats")] @@ -386,6 +458,16 @@ pub(crate) fn cmd_daemon(args: DaemonArgs) { daemon::server::StateRestoreMode::Auto }; + #[cfg(feature = "daemon-tls")] + let tls_args = TlsCliArgs { + cert: tls_cert, + key: tls_key, + key_password: tls_key_password, + client_ca: tls_client_ca, + min_version: tls_min_version, + allow_plaintext, + }; + run_daemon( rules_path, pipeline_paths, @@ -425,9 +507,22 @@ pub(crate) fn cmd_daemon(args: DaemonArgs) { cross_rule_ac, enrichers, source_paths, + #[cfg(feature = "daemon-tls")] + tls_args, ); } +/// Helper struct grouping TLS flags so `cmd_daemon` stays readable. 
+#[cfg(feature = "daemon-tls")] +pub(crate) struct TlsCliArgs { + pub cert: Option, + pub key: Option, + pub key_password: Option, + pub client_ca: Option, + pub min_version: String, + pub allow_plaintext: bool, +} + #[allow(clippy::too_many_arguments)] fn run_daemon( rules_path: PathBuf, @@ -464,6 +559,7 @@ fn run_daemon( #[cfg(feature = "daachorse-index")] cross_rule_ac: bool, enrichers_path: Option, source_paths: Vec, + #[cfg(feature = "daemon-tls")] tls_args: TlsCliArgs, ) { use rsigma_eval::resolve_builtin_pipeline; @@ -497,6 +593,9 @@ fn run_daemon( process::exit(exit_code::CONFIG_ERROR); }); + #[cfg(feature = "daemon-tls")] + let tls_state = build_tls_state(&tls_args, addr); + #[cfg(feature = "daemon-nats")] let nats_config = rsigma_runtime::NatsConnectConfig { credentials_file: nats_auth.nats_creds, @@ -586,6 +685,8 @@ fn run_daemon( cross_rule_ac, enrichers_path, source_registry, + #[cfg(feature = "daemon-tls")] + tls_state, }; let rt = tokio::runtime::Builder::new_multi_thread() @@ -634,6 +735,54 @@ pub(crate) fn parse_input_format(format_str: &str, syslog_tz: &str) -> rsigma_ru } } +/// Build the TLS state from CLI flags and enforce the +/// "no plaintext on non-loopback" policy. Returns `None` when TLS is not +/// requested. Exits with `CONFIG_ERROR` on validation failure so the +/// operator sees the problem before the daemon spins up. 
+#[cfg(feature = "daemon-tls")] +fn build_tls_state( + args: &TlsCliArgs, + addr: std::net::SocketAddr, +) -> Option<daemon::tls::TlsState> { + use daemon::tls::{TlsCliConfig, TlsMinVersion, TlsState, enforce_plaintext_policy}; + + match (args.cert.as_ref(), args.key.as_ref()) { + (Some(cert), Some(key)) => { + let min_version: TlsMinVersion = args.min_version.parse().unwrap_or_else(|e| { + eprintln!("{e}"); + process::exit(exit_code::CONFIG_ERROR); + }); + let cli_cfg = TlsCliConfig { + cert_path: cert.clone(), + key_path: key.clone(), + key_password: args.key_password.clone(), + client_ca_path: args.client_ca.clone(), + min_version, + }; + match TlsState::from_paths(cli_cfg) { + Ok(state) => Some(state), + Err(e) => { + eprintln!("Failed to initialize TLS: {e}"); + process::exit(exit_code::CONFIG_ERROR); + } + } + } + (None, None) => { + if let Err(msg) = enforce_plaintext_policy(addr, args.allow_plaintext) { + eprintln!("{msg}"); + process::exit(exit_code::CONFIG_ERROR); + } + None + } + _ => { + // clap's `requires` should make this unreachable, but guard + // anyway in case the validator is bypassed (e.g. tests). + eprintln!("--tls-cert and --tls-key must be supplied together"); + process::exit(exit_code::CONFIG_ERROR); + } + } +} + /// Parse a timezone offset string like "+05:00" or "-08:00" into seconds east of UTC.
fn parse_tz_offset(s: &str) -> i32 { let s = s.trim(); diff --git a/crates/rsigma-cli/src/daemon/metrics.rs b/crates/rsigma-cli/src/daemon/metrics.rs index 009d8b19..ae023be5 100644 --- a/crates/rsigma-cli/src/daemon/metrics.rs +++ b/crates/rsigma-cli/src/daemon/metrics.rs @@ -43,6 +43,10 @@ pub struct Metrics { pub otlp_log_records: IntCounter, #[cfg(feature = "daemon-otlp")] pub otlp_errors: IntCounterVec, + #[cfg(feature = "daemon-tls")] + pub tls_certificate_expiry_seconds: Gauge, + #[cfg(feature = "daemon-tls")] + pub tls_active_connections: std::sync::Arc<IntGauge>, } impl Metrics { @@ -363,6 +367,30 @@ impl Metrics { registry.register(Box::new(otlp_errors.clone())).unwrap(); } + #[cfg(feature = "daemon-tls")] + let tls_certificate_expiry_seconds = Gauge::with_opts(Opts::new( + "rsigma_tls_certificate_expiry_seconds", + "Seconds until the active TLS server certificate's not_after", + )) + .unwrap(); + #[cfg(feature = "daemon-tls")] + let tls_active_connections = std::sync::Arc::new( + IntGauge::with_opts(Opts::new( + "rsigma_tls_active_connections", + "Currently active TLS-terminated connections on the API listener", + )) + .unwrap(), + ); + #[cfg(feature = "daemon-tls")] + { + registry + .register(Box::new(tls_certificate_expiry_seconds.clone())) + .unwrap(); + registry + .register(Box::new(tls_active_connections.as_ref().clone())) + .unwrap(); + } + Metrics { registry, events_processed, @@ -401,6 +429,10 @@ impl Metrics { otlp_log_records, #[cfg(feature = "daemon-otlp")] otlp_errors, + #[cfg(feature = "daemon-tls")] + tls_certificate_expiry_seconds, + #[cfg(feature = "daemon-tls")] + tls_active_connections, } } diff --git a/crates/rsigma-cli/src/daemon/mod.rs b/crates/rsigma-cli/src/daemon/mod.rs index 98a5229d..7d608ad1 100644 --- a/crates/rsigma-cli/src/daemon/mod.rs +++ b/crates/rsigma-cli/src/daemon/mod.rs @@ -5,5 +5,7 @@ mod metrics; mod reload; pub(crate) mod server; mod store; +#[cfg(feature = "daemon-tls")] +pub(crate) mod tls; pub use server::run_daemon;
diff --git a/crates/rsigma-cli/src/daemon/reload.rs b/crates/rsigma-cli/src/daemon/reload.rs index 185e415b..a2d871b2 100644 --- a/crates/rsigma-cli/src/daemon/reload.rs +++ b/crates/rsigma-cli/src/daemon/reload.rs @@ -73,11 +73,17 @@ pub fn spawn_file_watcher( Some(watcher) } -/// Set up a SIGHUP handler that sends reload signals and source re-resolution triggers. +/// Set up a SIGHUP handler that sends reload signals and source re-resolution +/// triggers, and (when `daemon-tls` is built in) also re-reads the configured +/// TLS certificate and key from disk and atomically swaps the rustls +/// `ServerConfig` so new handshakes pick up the rotated material without +/// dropping inflight connections. #[cfg(unix)] pub async fn sighup_listener( reload_tx: mpsc::Sender<()>, sources_trigger_tx: Option>, + #[cfg(feature = "daemon-tls")] tls_state: Option, + #[cfg(feature = "daemon-tls")] tls_metrics: std::sync::Arc, ) { use tokio::signal::unix::{SignalKind, signal}; @@ -96,6 +102,26 @@ pub async fn sighup_listener( if let Some(tx) = &sources_trigger_tx { let _ = tx.try_send(rsigma_runtime::sources::refresh::RefreshTrigger::All); } + + #[cfg(feature = "daemon-tls")] + if let Some(ref state) = tls_state { + match state.reload() { + Ok(new_expiry) => { + super::server::update_tls_metrics(&tls_metrics, new_expiry); + super::server::warn_if_cert_expiring_soon(new_expiry); + tracing::info!( + not_after = new_expiry, + "TLS certificate hot-reloaded" + ); + } + Err(e) => { + tracing::error!( + error = %e, + "Failed to reload TLS certificate; keeping previous one active" + ); + } + } + } } } @@ -103,6 +129,8 @@ pub async fn sighup_listener( pub async fn sighup_listener( _reload_tx: mpsc::Sender<()>, _sources_trigger_tx: Option>, + #[cfg(feature = "daemon-tls")] _tls_state: Option, + #[cfg(feature = "daemon-tls")] _tls_metrics: std::sync::Arc, ) { std::future::pending::<()>().await; } diff --git a/crates/rsigma-cli/src/daemon/server.rs b/crates/rsigma-cli/src/daemon/server.rs 
index 0753c1bf..bc5bef03 100644 --- a/crates/rsigma-cli/src/daemon/server.rs +++ b/crates/rsigma-cli/src/daemon/server.rs @@ -116,6 +116,11 @@ pub struct DaemonConfig { /// pipeline-embedded `sources:` blocks. Collision-checked at /// construction time. pub source_registry: rsigma_runtime::sources::registry::DaemonSourceRegistry, + /// Optional server-side TLS state. `Some` when the operator passed + /// `--tls-cert`/`--tls-key`; the daemon then terminates TLS on the + /// API listener for HTTP REST, OTLP/HTTP, and OTLP/gRPC. + #[cfg(feature = "daemon-tls")] + pub tls_state: Option, } pub async fn run_daemon(config: DaemonConfig) { @@ -454,16 +459,53 @@ pub async fn run_daemon(config: DaemonConfig) { tonic::service::Routes::from(app).add_service(grpc_service) }; + // TLS state is consumed below to build either a plaintext or TLS-wrapped + // listener; pull it out of `config` here so the borrow ends before the + // serve task captures the rest of the config by move. + #[cfg(feature = "daemon-tls")] + let tls_state = config.tls_state.clone(); + #[cfg(feature = "daemon-tls")] + let tls_enabled = tls_state.is_some(); + #[cfg(not(feature = "daemon-tls"))] + let tls_enabled = false; + + #[cfg(feature = "daemon-tls")] + if let Some(ref state) = tls_state { + update_tls_metrics(&metrics, state.expiry_unix.load(Ordering::Relaxed)); + warn_if_cert_expiring_soon(state.expiry_unix.load(Ordering::Relaxed)); + } + #[cfg(feature = "daemon-otlp")] - tracing::info!(addr = %actual_addr, "API server listening (HTTP/2 + gRPC)"); + if tls_enabled { + tracing::info!(addr = %actual_addr, "API server listening (HTTPS, HTTP/2 + gRPC)"); + } else { + tracing::info!(addr = %actual_addr, "API server listening (HTTP/2 + gRPC)"); + } #[cfg(not(feature = "daemon-otlp"))] - tracing::info!(addr = %actual_addr, "API server listening"); + if tls_enabled { + tracing::info!(addr = %actual_addr, "API server listening (HTTPS)"); + } else { + tracing::info!(addr = %actual_addr, "API server listening"); + 
} - // Spawn SIGHUP listener (triggers both rule reload and source re-resolution) + // Spawn SIGHUP listener (triggers rule reload, source re-resolution, + // and TLS cert reload when daemon-tls is built in). let sighup_reload_tx = reload_tx.clone(); let sighup_sources_tx = sources_trigger_tx_val.clone(); + #[cfg(feature = "daemon-tls")] + let sighup_tls = tls_state.clone(); + #[cfg(feature = "daemon-tls")] + let sighup_tls_metrics = metrics.clone(); tokio::spawn(async move { - reload::sighup_listener(sighup_reload_tx, sighup_sources_tx).await; + reload::sighup_listener( + sighup_reload_tx, + sighup_sources_tx, + #[cfg(feature = "daemon-tls")] + sighup_tls, + #[cfg(feature = "daemon-tls")] + sighup_tls_metrics, + ) + .await; }); // Spawn reload handler — uses LogProcessor::reload_rules for atomic hot-reload @@ -943,28 +985,55 @@ pub async fn run_daemon(config: DaemonConfig) { let drain_duration = std::time::Duration::from_secs(config.drain_timeout); + // Build the unified axum router: with `daemon-otlp`, the OTLP/gRPC + // service is folded into the same axum::Router via Tonic's + // `Routes::into_axum_router`. axum::serve handles HTTP/1 and HTTP/2 + // (including h2c for plaintext gRPC) via hyper-util's auto::Builder. 
#[cfg(feature = "daemon-otlp")] + let unified_app: axum::Router = otlp_routes.into_axum_router(); + #[cfg(not(feature = "daemon-otlp"))] + let unified_app: axum::Router = app; + let mut serve_handle = { - let incoming = tokio_stream::wrappers::TcpListenerStream::new(listener); - tokio::spawn(async move { - if let Err(e) = tonic::transport::Server::builder() - .accept_http1(true) - .serve_with_incoming_shutdown(otlp_routes, incoming, shutdown_signal()) - .await - { - tracing::error!(error = %e, "server error"); + #[cfg(feature = "daemon-tls")] + { + if let Some(state) = tls_state { + let tls_listener = super::tls::RustlsListener::new( + listener, + state.config.clone(), + metrics.tls_active_connections.clone(), + ); + tokio::spawn(async move { + if let Err(e) = axum::serve(tls_listener, unified_app) + .with_graceful_shutdown(shutdown_signal()) + .await + { + tracing::error!(error = %e, "server error"); + } + }) + } else { + tokio::spawn(async move { + if let Err(e) = axum::serve(listener, unified_app) + .with_graceful_shutdown(shutdown_signal()) + .await + { + tracing::error!(error = %e, "server error"); + } + }) } - }) - }; - #[cfg(not(feature = "daemon-otlp"))] - let mut serve_handle = tokio::spawn(async move { - if let Err(e) = axum::serve(listener, app) - .with_graceful_shutdown(shutdown_signal()) - .await + } + #[cfg(not(feature = "daemon-tls"))] { - tracing::error!(error = %e, "server error"); + tokio::spawn(async move { + if let Err(e) = axum::serve(listener, unified_app) + .with_graceful_shutdown(shutdown_signal()) + .await + { + tracing::error!(error = %e, "server error"); + } + }) } - }); + }; let shutdown_triggered = tokio::select! { _ = &mut serve_handle => true, @@ -1203,6 +1272,39 @@ fn decide_state_restore( } } +/// Refresh the `rsigma_tls_certificate_expiry_seconds` gauge to the +/// number of seconds between now and `expiry_unix`. Called at startup +/// and after every SIGHUP-triggered cert reload. 
+#[cfg(feature = "daemon-tls")] +pub(crate) fn update_tls_metrics(metrics: &Metrics, expiry_unix: i64) { + let now = chrono::Utc::now().timestamp(); + let delta = (expiry_unix - now) as f64; + metrics.tls_certificate_expiry_seconds.set(delta); +} + +/// Emit a single WARN if the active certificate expires within 30 days. +/// Operators wire this into existing log-based alerting; the Prometheus +/// gauge handles the longer-horizon dashboards. +#[cfg(feature = "daemon-tls")] +pub(crate) fn warn_if_cert_expiring_soon(expiry_unix: i64) { + const WARN_WINDOW_SECS: i64 = 30 * 24 * 3600; + let now = chrono::Utc::now().timestamp(); + let remaining = expiry_unix - now; + if remaining < 0 { + tracing::warn!( + expiry_unix, + "TLS server certificate has already expired; clients will reject the handshake" + ); + } else if remaining < WARN_WINDOW_SECS { + let days = remaining / 86400; + tracing::warn!( + expiry_unix, + days_remaining = days, + "TLS server certificate expires in less than 30 days; rotate it soon" + ); + } +} + /// Build a `SourcePosition` from the high-water mark atomics. /// Returns `None` if no NATS messages have been acked yet (sequence == 0). fn source_position_from_atomics(seq: &AtomicU64, ts: &AtomicI64) -> Option { diff --git a/crates/rsigma-cli/src/daemon/tls.rs b/crates/rsigma-cli/src/daemon/tls.rs new file mode 100644 index 00000000..e980b50f --- /dev/null +++ b/crates/rsigma-cli/src/daemon/tls.rs @@ -0,0 +1,454 @@ +//! Server-side TLS termination for the daemon API listener. +//! +//! This module loads PEM-encoded certificates and keys from disk, builds a +//! `rustls::ServerConfig` with the `aws-lc-rs` provider (matching the rest +//! of the rsigma TLS surface), and exposes a small `TlsState` handle that +//! the daemon hot-reload path can swap in-place without dropping inflight +//! connections. +//! +//! Inspecting certificate expiry uses `x509-parser` so we can emit a single +//! 
WARN at startup when the leaf certificate expires within 30 days and +//! keep the `rsigma_tls_certificate_expiry_seconds` Prometheus gauge in +//! sync with the active certificate. +//! +//! Gated behind the `daemon-tls` Cargo feature. + +#![cfg(feature = "daemon-tls")] + +use std::fs; +use std::io::{self, BufReader}; +use std::net::SocketAddr; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use arc_swap::ArcSwap; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::server::WebPkiClientVerifier; +use rustls::{RootCertStore, ServerConfig}; +use tokio::net::TcpListener; +use tokio_rustls::TlsAcceptor; +use tokio_rustls::server::TlsStream; +use x509_parser::prelude::FromDer; + +/// Operator-supplied configuration assembled from CLI flags. +/// +/// `cert_path` / `key_path` are required; the rest are optional. +/// Validation (loopback bypass, `--allow-plaintext`, file existence) +/// happens at `TlsState::from_paths` and `enforce_plaintext_policy`. +#[derive(Debug, Clone)] +pub struct TlsCliConfig { + pub cert_path: PathBuf, + pub key_path: PathBuf, + pub key_password: Option<String>, + pub client_ca_path: Option<PathBuf>, + pub min_version: TlsMinVersion, +} + +/// Minimum TLS protocol version accepted by the server. +/// +/// Default is TLS 1.3. Operators can drop to TLS 1.2 for legacy agents +/// (Fluent Bit on old distros, ancient OpenSSL builds) by passing +/// `--tls-min-version 1.2`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum TlsMinVersion { + V1_2, + #[default] + V1_3, +} + +impl std::str::FromStr for TlsMinVersion { + type Err = String; + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "1.2" | "tls1.2" | "TLS1.2" => Ok(Self::V1_2), + "1.3" | "tls1.3" | "TLS1.3" => Ok(Self::V1_3), + other => Err(format!( + "invalid --tls-min-version '{other}', expected '1.2' or '1.3'" + )), + } + } +} + +/// Live TLS state shared between the accept loop and the SIGHUP reload path.
+/// +/// The `ArcSwap` holds the active `rustls::ServerConfig` so the reload path +/// can publish a new chain atomically without coordinating with in-flight +/// handshakes or connections. The CLI args are kept around so SIGHUP knows +/// where to read replacement certs from. +#[derive(Clone)] +pub struct TlsState { + /// Atomically swappable `ServerConfig` used by every new handshake. + pub config: Arc>, + /// Original CLI config so SIGHUP can re-read cert/key from disk. + pub cli: TlsCliConfig, + /// Unix timestamp (seconds) at which the active cert expires. Updated + /// on every successful reload so the Prometheus gauge stays accurate. + pub expiry_unix: Arc, +} + +impl TlsState { + /// Build a fresh `TlsState` from operator-supplied paths. + pub fn from_paths(cli: TlsCliConfig) -> Result { + let config = build_server_config(&cli)?; + let expiry = read_cert_expiry(&cli.cert_path)?; + Ok(Self { + config: Arc::new(ArcSwap::from_pointee(config)), + cli, + expiry_unix: Arc::new(std::sync::atomic::AtomicI64::new(expiry)), + }) + } + + /// Re-read cert/key from disk and atomically swap the active config. + /// + /// Returns the new expiry timestamp so callers can update the + /// Prometheus gauge. The previous config remains active if the + /// reload fails, mirroring the rules-reload contract. + pub fn reload(&self) -> Result { + let new_config = build_server_config(&self.cli)?; + let new_expiry = read_cert_expiry(&self.cli.cert_path)?; + self.config.store(Arc::new(new_config)); + self.expiry_unix + .store(new_expiry, std::sync::atomic::Ordering::Relaxed); + Ok(new_expiry) + } +} + +/// Errors that can be produced while loading or parsing TLS material. 
+#[derive(Debug)] +pub enum TlsError { + Io(io::Error, PathBuf), + NoCertificates(PathBuf), + NoPrivateKey(PathBuf), + EncryptedKeyUnsupported(PathBuf), + Rustls(rustls::Error), + InvalidClientCa(PathBuf, String), + InvalidCertificate(PathBuf, String), +} + +impl std::fmt::Display for TlsError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Io(e, p) => write!(f, "I/O error reading {}: {e}", p.display()), + Self::NoCertificates(p) => write!(f, "no certificates found in {}", p.display()), + Self::NoPrivateKey(p) => write!(f, "no private key found in {}", p.display()), + Self::EncryptedKeyUnsupported(p) => write!( + f, + "encrypted private key in {} is not supported yet; decrypt with `openssl rsa -in key.pem -out key-decrypted.pem` first", + p.display() + ), + Self::Rustls(e) => write!(f, "rustls error: {e}"), + Self::InvalidClientCa(p, e) => { + write!(f, "invalid client CA bundle {}: {e}", p.display()) + } + Self::InvalidCertificate(p, e) => { + write!(f, "invalid certificate {}: {e}", p.display()) + } + } + } +} + +impl std::error::Error for TlsError {} + +/// Build a `rustls::ServerConfig` from the CLI config. +fn build_server_config(cli: &TlsCliConfig) -> Result { + if cli.key_password.is_some() { + return Err(TlsError::EncryptedKeyUnsupported(cli.key_path.clone())); + } + + let certs = load_certs(&cli.cert_path)?; + let key = load_private_key(&cli.key_path)?; + + // Pin the aws-lc-rs provider for consistency with NATS client TLS + // and to inherit upstream FIPS-mode work. 
+ let provider = Arc::new(rustls::crypto::aws_lc_rs::default_provider()); + + let protocol_versions: &[&rustls::SupportedProtocolVersion] = match cli.min_version { + TlsMinVersion::V1_2 => rustls::ALL_VERSIONS, + TlsMinVersion::V1_3 => &[&rustls::version::TLS13], + }; + + let builder = ServerConfig::builder_with_provider(provider) + .with_protocol_versions(protocol_versions) + .map_err(TlsError::Rustls)?; + + let builder = if let Some(ca_path) = cli.client_ca_path.as_ref() { + let roots = load_client_ca_roots(ca_path)?; + let verifier = WebPkiClientVerifier::builder(Arc::new(roots)) + .build() + .map_err(|e| TlsError::InvalidClientCa(ca_path.clone(), e.to_string()))?; + builder.with_client_cert_verifier(verifier) + } else { + builder.with_no_client_auth() + }; + + let mut config = builder + .with_single_cert(certs, key) + .map_err(TlsError::Rustls)?; + + // Advertise both HTTP/2 (for OTLP/gRPC and modern HTTP/2 clients) and + // HTTP/1.1 (for legacy REST clients and OTLP/HTTP/1.1 agents). + config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + + Ok(config) +} + +/// Read a PEM bundle of one or more certificates. +fn load_certs(path: &Path) -> Result>, TlsError> { + let file = fs::File::open(path).map_err(|e| TlsError::Io(e, path.to_path_buf()))?; + let mut reader = BufReader::new(file); + let certs: Result, _> = rustls_pemfile::certs(&mut reader).collect(); + let certs = certs.map_err(|e| TlsError::Io(e, path.to_path_buf()))?; + if certs.is_empty() { + return Err(TlsError::NoCertificates(path.to_path_buf())); + } + Ok(certs) +} + +/// Read a PEM-encoded private key (PKCS#8, RSA, or SEC1/EC). +fn load_private_key(path: &Path) -> Result, TlsError> { + let file = fs::File::open(path).map_err(|e| TlsError::Io(e, path.to_path_buf()))?; + let mut reader = BufReader::new(file); + let key = rustls_pemfile::private_key(&mut reader) + .map_err(|e| TlsError::Io(e, path.to_path_buf()))? 
+ .ok_or_else(|| TlsError::NoPrivateKey(path.to_path_buf()))?; + Ok(key) +} + +/// Load a PEM bundle of trusted CA certificates for mTLS verification. +fn load_client_ca_roots(path: &Path) -> Result { + let file = fs::File::open(path).map_err(|e| TlsError::Io(e, path.to_path_buf()))?; + let mut reader = BufReader::new(file); + let certs: Result, _> = rustls_pemfile::certs(&mut reader).collect(); + let certs = certs.map_err(|e| TlsError::Io(e, path.to_path_buf()))?; + if certs.is_empty() { + return Err(TlsError::NoCertificates(path.to_path_buf())); + } + let mut roots = RootCertStore::empty(); + for (idx, cert) in certs.into_iter().enumerate() { + roots.add(cert).map_err(|e| { + TlsError::InvalidClientCa(path.to_path_buf(), format!("cert #{idx}: {e}")) + })?; + } + Ok(roots) +} + +/// Read the leaf certificate from `path` and return its `not_after` as a +/// Unix timestamp. +pub fn read_cert_expiry(path: &Path) -> Result { + let certs = load_certs(path)?; + let leaf = certs + .first() + .ok_or_else(|| TlsError::NoCertificates(path.to_path_buf()))?; + let (_, parsed) = x509_parser::certificate::X509Certificate::from_der(leaf.as_ref()) + .map_err(|e| TlsError::InvalidCertificate(path.to_path_buf(), e.to_string()))?; + Ok(parsed.validity().not_after.timestamp()) +} + +/// Decide whether the operator may bind plaintext on `addr` without TLS. +/// +/// Loopback addresses (`127.0.0.0/8`, `::1`) are always allowed for local +/// development. Public binds require an explicit `--allow-plaintext` +/// opt-in so a careless `--api-addr 0.0.0.0:9090` never silently ships +/// detection events over cleartext. +pub fn enforce_plaintext_policy(addr: SocketAddr, allow_plaintext: bool) -> Result<(), String> { + if is_loopback(addr) || allow_plaintext { + return Ok(()); + } + Err(format!( + "refusing to bind plaintext on non-loopback address {addr}; \ + pass --tls-cert/--tls-key to enable TLS or --allow-plaintext to opt out \ + (e.g. 
when terminating TLS at a sidecar reverse proxy)" + )) +} + +fn is_loopback(addr: SocketAddr) -> bool { + addr.ip().is_loopback() +} + +/// An `axum::serve::Listener` adapter that performs a TLS handshake on +/// every accepted TCP connection. +/// +/// Handshake failures are logged and ignored, after which the listener goes +/// back to polling the underlying `TcpListener`. Handshakes are awaited inline, so a +/// stalled one delays later accepts. The active `ServerConfig` is loaded from the shared +/// `ArcSwap` on every new connection so SIGHUP-triggered cert rotation +/// takes effect on the next handshake without dropping inflight TLS +/// connections. +pub struct RustlsListener { + tcp: TcpListener, + config: Arc>, + active_connections: Arc, +} + +impl RustlsListener { + pub fn new( + tcp: TcpListener, + config: Arc>, + active_connections: Arc, + ) -> Self { + Self { + tcp, + config, + active_connections, + } + } +} + +impl axum::serve::Listener for RustlsListener { + type Io = TrackedTlsStream; + type Addr = SocketAddr; + + async fn accept(&mut self) -> (Self::Io, Self::Addr) { + loop { + let (tcp, peer) = match self.tcp.accept().await { + Ok(pair) => pair, + Err(e) => { + tracing::warn!(error = %e, "TCP accept failed, retrying after backoff"); + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + continue; + } + }; + let cfg = self.config.load_full(); + let acceptor = TlsAcceptor::from(cfg); + match acceptor.accept(tcp).await { + Ok(tls) => { + self.active_connections.inc(); + return ( + TrackedTlsStream { + inner: tls, + counter: self.active_connections.clone(), + }, + peer, + ); + } + Err(e) => { + tracing::warn!(peer = %peer, error = %e, "TLS handshake failed"); + } + } + } + } + + fn local_addr(&self) -> io::Result { + self.tcp.local_addr() + } +} + +/// `TlsStream` wrapper that decrements the active-connection +/// gauge on drop. The gauge sits on the hot path so we use a cheap +/// `IntGauge` rather than a histogram.
+pub struct TrackedTlsStream { + inner: TlsStream, + counter: Arc, +} + +impl Drop for TrackedTlsStream { + fn drop(&mut self) { + self.counter.dec(); + } +} + +impl tokio::io::AsyncRead for TrackedTlsStream { + fn poll_read( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> std::task::Poll> { + std::pin::Pin::new(&mut self.inner).poll_read(cx, buf) + } +} + +impl tokio::io::AsyncWrite for TrackedTlsStream { + fn poll_write( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll> { + std::pin::Pin::new(&mut self.inner).poll_write(cx, buf) + } + + fn poll_flush( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::pin::Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_shutdown( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::pin::Pin::new(&mut self.inner).poll_shutdown(cx) + } + + fn poll_write_vectored( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + bufs: &[io::IoSlice<'_>], + ) -> std::task::Poll> { + std::pin::Pin::new(&mut self.inner).poll_write_vectored(cx, bufs) + } + + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; + + #[test] + fn parse_min_version() { + assert_eq!("1.2".parse::().unwrap(), TlsMinVersion::V1_2); + assert_eq!("1.3".parse::().unwrap(), TlsMinVersion::V1_3); + assert!("1.1".parse::().is_err()); + } + + #[test] + fn loopback_bypasses_plaintext_check() { + let addr_v4 = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 0); + assert!(enforce_plaintext_policy(addr_v4, false).is_ok()); + + let addr_v6 = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), 0); + assert!(enforce_plaintext_policy(addr_v6, false).is_ok()); + } + + #[test] + fn 
public_bind_requires_explicit_opt_in() { + let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::UNSPECIFIED), 9090); + let err = enforce_plaintext_policy(addr, false).unwrap_err(); + assert!(err.contains("refusing to bind plaintext")); + assert!(err.contains("--allow-plaintext")); + + assert!(enforce_plaintext_policy(addr, true).is_ok()); + } + + #[test] + fn missing_cert_file_is_clear_error() { + let cli = TlsCliConfig { + cert_path: PathBuf::from("/nonexistent/cert.pem"), + key_path: PathBuf::from("/nonexistent/key.pem"), + key_password: None, + client_ca_path: None, + min_version: TlsMinVersion::V1_3, + }; + let err = TlsState::from_paths(cli).err().expect("expected an error"); + assert!(matches!(err, TlsError::Io(_, _))); + } + + #[test] + fn encrypted_key_is_rejected_with_guidance() { + let cli = TlsCliConfig { + cert_path: PathBuf::from("/nonexistent/cert.pem"), + key_path: PathBuf::from("/nonexistent/key.pem"), + key_password: Some("hunter2".to_string()), + client_ca_path: None, + min_version: TlsMinVersion::V1_3, + }; + let err = TlsState::from_paths(cli).err().expect("expected an error"); + assert!(matches!(err, TlsError::EncryptedKeyUnsupported(_))); + assert!(err.to_string().contains("openssl")); + } +} From 9ebcab53368ffcf129c3e8bed8e6e359faac3c36 Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 13:13:59 +0200 Subject: [PATCH 03/11] test(daemon): cli_daemon_tls integration test suite with rcgen Adds 10 E2E tests covering the new server-side TLS surface: - Plaintext refusal: `0.0.0.0` bind without TLS or `--allow-plaintext` fails to start with a clear stderr message; loopback keeps working; `--allow-plaintext` flips the opt-in. - HTTPS happy path: `/healthz` and `POST /api/v1/events` succeed over a TLS connection negotiated against a self-signed CA minted with rcgen at test setup. - mTLS: a client that does not present a certificate is rejected at the TLS handshake; a client signed by the same CA succeeds. 
- Metrics: the `/metrics` endpoint exposes both `rsigma_tls_certificate_expiry_seconds` and `rsigma_tls_active_connections` after TLS is configured. - Misconfiguration: a missing cert file surfaces a clear startup error; `--tls-key-password` is rejected with a hint pointing at `openssl rsa` for offline decryption. The TLS module also gains a small fix: `WebPkiClientVerifier` is now built with an explicit aws-lc-rs `CryptoProvider`, which avoids a panic when both `ring` and `aws-lc-rs` end up in the dependency tree (via reqwest, jsonschema, etc). Companion `spawn_expect_failure` helper in tests/common/mod.rs lets other tests assert that a misconfigured daemon refuses to start with a specific stderr line. Implements the verification matrix from #128. --- crates/rsigma-cli/src/commands/daemon.rs | 11 +- crates/rsigma-cli/src/daemon/reload.rs | 5 +- crates/rsigma-cli/src/daemon/tls.rs | 13 +- crates/rsigma-cli/tests/cli_daemon_tls.rs | 503 ++++++++++++++++++++++ crates/rsigma-cli/tests/common/mod.rs | 50 +++ 5 files changed, 570 insertions(+), 12 deletions(-) create mode 100644 crates/rsigma-cli/tests/cli_daemon_tls.rs diff --git a/crates/rsigma-cli/src/commands/daemon.rs b/crates/rsigma-cli/src/commands/daemon.rs index f57816b7..e8aa7552 100644 --- a/crates/rsigma-cli/src/commands/daemon.rs +++ b/crates/rsigma-cli/src/commands/daemon.rs @@ -319,7 +319,11 @@ pub(crate) struct DaemonArgs { /// Default is `1.3`. Use `1.2` only for compatibility with legacy /// agents that cannot negotiate TLS 1.3. #[cfg(feature = "daemon-tls")] - #[arg(long = "tls-min-version", value_name = "VERSION", default_value = "1.3")] + #[arg( + long = "tls-min-version", + value_name = "VERSION", + default_value = "1.3" + )] pub tls_min_version: String, /// Allow the daemon to bind a non-loopback `--api-addr` without TLS. @@ -740,10 +744,7 @@ pub(crate) fn parse_input_format(format_str: &str, syslog_tz: &str) -> rsigma_ru /// requested. 
Exits with `CONFIG_ERROR` on validation failure so the /// operator sees the problem before the daemon spins up. #[cfg(feature = "daemon-tls")] -fn build_tls_state( - args: &TlsCliArgs, - addr: std::net::SocketAddr, -) -> Option { +fn build_tls_state(args: &TlsCliArgs, addr: std::net::SocketAddr) -> Option { use daemon::tls::{TlsCliConfig, TlsMinVersion, TlsState, enforce_plaintext_policy}; match (args.cert.as_ref(), args.key.as_ref()) { diff --git a/crates/rsigma-cli/src/daemon/reload.rs b/crates/rsigma-cli/src/daemon/reload.rs index a2d871b2..46492aa5 100644 --- a/crates/rsigma-cli/src/daemon/reload.rs +++ b/crates/rsigma-cli/src/daemon/reload.rs @@ -109,10 +109,7 @@ pub async fn sighup_listener( Ok(new_expiry) => { super::server::update_tls_metrics(&tls_metrics, new_expiry); super::server::warn_if_cert_expiring_soon(new_expiry); - tracing::info!( - not_after = new_expiry, - "TLS certificate hot-reloaded" - ); + tracing::info!(not_after = new_expiry, "TLS certificate hot-reloaded"); } Err(e) => { tracing::error!( diff --git a/crates/rsigma-cli/src/daemon/tls.rs b/crates/rsigma-cli/src/daemon/tls.rs index e980b50f..46154398 100644 --- a/crates/rsigma-cli/src/daemon/tls.rs +++ b/crates/rsigma-cli/src/daemon/tls.rs @@ -174,9 +174,16 @@ fn build_server_config(cli: &TlsCliConfig) -> Result { let builder = if let Some(ca_path) = cli.client_ca_path.as_ref() { let roots = load_client_ca_roots(ca_path)?; - let verifier = WebPkiClientVerifier::builder(Arc::new(roots)) - .build() - .map_err(|e| TlsError::InvalidClientCa(ca_path.clone(), e.to_string()))?; + // Pass the aws-lc-rs provider explicitly so the builder does not + // try (and fail, when both `ring` and `aws-lc-rs` are in the + // dependency tree) to discover the process-level + // `CryptoProvider`. 
+ let verifier = WebPkiClientVerifier::builder_with_provider( + Arc::new(roots), + Arc::new(rustls::crypto::aws_lc_rs::default_provider()), + ) + .build() + .map_err(|e| TlsError::InvalidClientCa(ca_path.clone(), e.to_string()))?; builder.with_client_cert_verifier(verifier) } else { builder.with_no_client_auth() diff --git a/crates/rsigma-cli/tests/cli_daemon_tls.rs b/crates/rsigma-cli/tests/cli_daemon_tls.rs new file mode 100644 index 00000000..1c7ab619 --- /dev/null +++ b/crates/rsigma-cli/tests/cli_daemon_tls.rs @@ -0,0 +1,503 @@ +//! E2E tests for the `daemon-tls` feature. +//! +//! Each test mints a self-signed CA and leaf certificate on the fly with +//! `rcgen`, spawns `rsigma engine daemon` with TLS termination enabled, +//! and asserts that the HTTPS handshake (and, where applicable, the mTLS +//! client-cert verification) behaves as expected. + +#![cfg(feature = "daemon-tls")] + +mod common; + +use std::io::Write; +use std::sync::Arc; +use std::time::Duration; + +use common::{DaemonProcess, SIMPLE_RULE, spawn_expect_failure, temp_file}; +use rcgen::{ + CertificateParams, DnType, ExtendedKeyUsagePurpose, IsCa, Issuer, KeyPair, KeyUsagePurpose, +}; +use rustls::pki_types::ServerName; +use rustls::{ClientConfig, RootCertStore}; +use rustls_pki_types::pem::PemObject; +use tempfile::NamedTempFile; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::TcpStream; +use tokio_rustls::TlsConnector; + +/// Materialized cert/key files produced by `mint_ca_and_leaf`. Holding the +/// `NamedTempFile`s and the CA key keeps the on-disk PEM material and the +/// signing material alive for the lifetime of the test, so the test can +/// later mint client certs from the same CA without re-parsing PEM. 
+struct TlsFixture { + _ca_file: NamedTempFile, + _cert_file: NamedTempFile, + _key_file: NamedTempFile, + ca_path: String, + cert_path: String, + key_path: String, + root_store: Arc, + /// Held so `mint_client_cert` can re-use the CA without re-parsing the + /// PEM (rcgen 0.14's `Issuer::from_ca_cert_pem` is gated behind an + /// optional feature we do not pull in for tests). + ca_issuer: Issuer<'static, KeyPair>, +} + +/// Mint a self-signed CA, then sign a leaf certificate for `127.0.0.1` +/// suitable for both `serverAuth` and `clientAuth`. Returns paths plus a +/// `RootCertStore` clients can use to verify the server. +fn mint_ca_and_leaf() -> TlsFixture { + let mut ca_params = CertificateParams::new(Vec::::new()).unwrap(); + ca_params + .distinguished_name + .push(DnType::CommonName, "rsigma-test-ca"); + ca_params.is_ca = IsCa::Ca(rcgen::BasicConstraints::Unconstrained); + ca_params.key_usages = vec![ + KeyUsagePurpose::DigitalSignature, + KeyUsagePurpose::KeyCertSign, + KeyUsagePurpose::CrlSign, + ]; + let ca_key = KeyPair::generate().unwrap(); + let ca_cert = ca_params.self_signed(&ca_key).unwrap(); + let ca_pem = ca_cert.pem(); + let ca_issuer = Issuer::new(ca_params, ca_key); + + let mut leaf_params = CertificateParams::new(vec!["localhost".to_string()]).unwrap(); + leaf_params + .subject_alt_names + .push(rcgen::SanType::IpAddress(std::net::IpAddr::from([ + 127, 0, 0, 1, + ]))); + leaf_params + .distinguished_name + .push(DnType::CommonName, "rsigma-test-server"); + leaf_params.extended_key_usages = vec![ + ExtendedKeyUsagePurpose::ServerAuth, + ExtendedKeyUsagePurpose::ClientAuth, + ]; + let leaf_key = KeyPair::generate().unwrap(); + let leaf_cert = leaf_params.signed_by(&leaf_key, &ca_issuer).unwrap(); + + let ca_file = temp_file(".pem", &ca_pem); + let cert_file = temp_file(".pem", &leaf_cert.pem()); + let key_file = temp_file(".pem", &leaf_key.serialize_pem()); + + let mut store = RootCertStore::empty(); + for cert in 
rustls::pki_types::CertificateDer::pem_slice_iter(ca_pem.as_bytes()) { + store.add(cert.unwrap()).unwrap(); + } + + TlsFixture { + ca_path: ca_file.path().to_str().unwrap().to_string(), + cert_path: cert_file.path().to_str().unwrap().to_string(), + key_path: key_file.path().to_str().unwrap().to_string(), + root_store: Arc::new(store), + ca_issuer, + _ca_file: ca_file, + _cert_file: cert_file, + _key_file: key_file, + } +} + +/// Mint a client certificate signed by the supplied CA issuer for mTLS +/// positive-path tests. +fn mint_client_cert(issuer: &Issuer<'_, KeyPair>) -> (NamedTempFile, NamedTempFile) { + let mut client_params = CertificateParams::new(Vec::::new()).unwrap(); + client_params + .distinguished_name + .push(DnType::CommonName, "rsigma-test-client"); + client_params.extended_key_usages = vec![ExtendedKeyUsagePurpose::ClientAuth]; + let client_key = KeyPair::generate().unwrap(); + let client_cert = client_params.signed_by(&client_key, issuer).unwrap(); + + let cert_file = temp_file(".pem", &client_cert.pem()); + let key_file = temp_file(".pem", &client_key.serialize_pem()); + (cert_file, key_file) +} + +/// Build a rustls client config that trusts only the server's CA. +fn client_config(roots: Arc) -> ClientConfig { + ClientConfig::builder_with_provider(Arc::new(rustls::crypto::aws_lc_rs::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .unwrap() + .with_root_certificates(roots) + .with_no_client_auth() +} + +/// Build a rustls client config with client auth. 
+fn client_config_with_auth( + roots: Arc, + cert_pem_path: &str, + key_pem_path: &str, +) -> ClientConfig { + let cert_chain: Vec<_> = rustls::pki_types::CertificateDer::pem_file_iter(cert_pem_path) + .unwrap() + .collect::>() + .unwrap(); + let key = rustls::pki_types::PrivateKeyDer::from_pem_file(key_pem_path).unwrap(); + ClientConfig::builder_with_provider(Arc::new(rustls::crypto::aws_lc_rs::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .unwrap() + .with_root_certificates(roots) + .with_client_auth_cert(cert_chain, key) + .unwrap() +} + +/// Synchronous HTTPS GET. Returns (status, body). Connect, handshake, and I/O +/// failures are returned as `Err`; HTTP error statuses are returned in `Ok`. +fn https_get( + addr: &str, + path: &str, + config: ClientConfig, +) -> Result<(u16, String), Box> { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + rt.block_on(async move { + let host = addr.split(':').next().unwrap_or("127.0.0.1"); + let socket: std::net::SocketAddr = addr.parse()?; + let tcp = TcpStream::connect(socket).await?; + let connector = TlsConnector::from(Arc::new(config)); + let server_name = ServerName::try_from(host.to_string()).unwrap(); + let mut tls = connector.connect(server_name, tcp).await?; + let req = format!("GET {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n"); + tls.write_all(req.as_bytes()).await?; + let mut buf = Vec::new(); + tls.read_to_end(&mut buf).await?; + let response = String::from_utf8_lossy(&buf).into_owned(); + let mut lines = response.splitn(2, "\r\n"); + let status_line = lines.next().unwrap_or(""); + let status: u16 = status_line + .split_whitespace() + .nth(1) + .unwrap_or("0") + .parse() + .unwrap_or(0); + let body = response + .split_once("\r\n\r\n") + .map(|(_, b)| b.to_string()) + .unwrap_or_default(); + Ok((status, body)) + }) +} + +// --------------------------------------------------------------------------- +// Plaintext refusal policy +//
--------------------------------------------------------------------------- + +#[test] +fn public_bind_without_tls_refuses_to_start() { + let rule = temp_file(".yml", SIMPLE_RULE); + let stderr = spawn_expect_failure( + &[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "0.0.0.0:0", + ], + Duration::from_secs(5), + ); + assert!( + stderr.contains("refusing to bind plaintext"), + "expected plaintext refusal in stderr, got: {stderr}" + ); + assert!( + stderr.contains("--allow-plaintext"), + "stderr should mention the opt-out flag, got: {stderr}" + ); +} + +#[test] +fn loopback_keeps_plaintext_without_flag() { + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn_http(rule.path().to_str().unwrap()); + let (status, body) = common::http_get(&daemon.url("/healthz")); + assert_eq!(status, 200); + let v: serde_json::Value = serde_json::from_str(&body).unwrap(); + assert_eq!(v["status"], "ok"); +} + +#[test] +fn public_bind_with_allow_plaintext_starts() { + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "0.0.0.0:0", + "--allow-plaintext", + ]); + let (status, _) = common::http_get(&daemon.url("/healthz")); + assert_eq!(status, 200); +} + +// --------------------------------------------------------------------------- +// HTTPS happy path +// --------------------------------------------------------------------------- + +#[test] +fn https_healthz_succeeds_with_trusted_ca() { + let fixture = mint_ca_and_leaf(); + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + ]); + + let (status, body) = https_get( + daemon.api_addr(), + 
"/healthz", + client_config(fixture.root_store), + ) + .expect("https handshake to /healthz failed"); + assert_eq!(status, 200, "body was: {body}"); + assert!(body.contains("\"ok\""), "body was: {body}"); +} + +#[test] +fn https_post_events_triggers_detection() { + let fixture = mint_ca_and_leaf(); + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + ]); + + // Manually POST via raw HTTPS (the test crate's ureq helper is plaintext). + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let posted = rt.block_on(async { + let socket: std::net::SocketAddr = daemon.api_addr().parse().unwrap(); + let tcp = TcpStream::connect(socket).await.unwrap(); + let connector = TlsConnector::from(Arc::new(client_config(fixture.root_store.clone()))); + let server_name = ServerName::try_from("localhost").unwrap(); + let mut tls = connector.connect(server_name, tcp).await.unwrap(); + let body = r#"{"CommandLine":"malware.exe"}"#; + let req = format!( + "POST /api/v1/events HTTP/1.1\r\nHost: localhost\r\nContent-Length: {}\r\nContent-Type: application/json\r\nConnection: close\r\n\r\n{body}", + body.len() + ); + tls.write_all(req.as_bytes()).await.unwrap(); + let mut buf = Vec::new(); + tls.read_to_end(&mut buf).await.unwrap(); + String::from_utf8_lossy(&buf).into_owned() + }); + assert!( + posted.starts_with("HTTP/1.1 200"), + "expected 200 OK for /api/v1/events, got: {posted}" + ); + assert!(posted.contains("\"accepted\":1")); +} + +// --------------------------------------------------------------------------- +// mTLS verification +// --------------------------------------------------------------------------- + +#[test] +fn mtls_rejects_client_without_certificate() { + let fixture = mint_ca_and_leaf(); + let rule = 
temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + "--tls-client-ca", + &fixture.ca_path, + ]); + + let err = https_get( + daemon.api_addr(), + "/healthz", + client_config(fixture.root_store.clone()), + ) + .expect_err("handshake without client cert should be rejected"); + let msg = err.to_string(); + assert!( + msg.to_lowercase().contains("certificate") + || msg.to_lowercase().contains("handshake") + || msg.to_lowercase().contains("tls") + || msg.to_lowercase().contains("eof"), + "expected TLS-level rejection, got: {msg}" + ); +} + +#[test] +fn mtls_accepts_client_with_valid_certificate() { + let fixture = mint_ca_and_leaf(); + let (client_cert_file, client_key_file) = mint_client_cert(&fixture.ca_issuer); + + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + "--tls-client-ca", + &fixture.ca_path, + ]); + + let config = client_config_with_auth( + fixture.root_store, + client_cert_file.path().to_str().unwrap(), + client_key_file.path().to_str().unwrap(), + ); + let (status, body) = + https_get(daemon.api_addr(), "/healthz", config).expect("mTLS handshake should succeed"); + assert_eq!(status, 200, "body was: {body}"); +} + +// --------------------------------------------------------------------------- +// Metrics +// --------------------------------------------------------------------------- + +#[test] +fn tls_certificate_expiry_metric_exposed() { + let fixture = mint_ca_and_leaf(); + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + 
rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + ]); + let (status, body) = https_get( + daemon.api_addr(), + "/metrics", + client_config(fixture.root_store), + ) + .unwrap(); + assert_eq!(status, 200); + assert!( + body.contains("rsigma_tls_certificate_expiry_seconds"), + "metrics should expose the expiry gauge; body: {body}" + ); + assert!( + body.contains("rsigma_tls_active_connections"), + "metrics should expose the active-connection gauge; body: {body}" + ); +} + +// --------------------------------------------------------------------------- +// Misconfiguration +// --------------------------------------------------------------------------- + +#[test] +fn missing_cert_file_refuses_to_start() { + let rule = temp_file(".yml", SIMPLE_RULE); + // Write a valid key so the failure is specifically the missing cert. + let key = KeyPair::generate().unwrap(); + let key_file = temp_file(".pem", &key.serialize_pem()); + + let stderr = spawn_expect_failure( + &[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + "/nonexistent/cert.pem", + "--tls-key", + key_file.path().to_str().unwrap(), + ], + Duration::from_secs(5), + ); + assert!( + stderr.contains("Failed to initialize TLS") || stderr.contains("/nonexistent/cert.pem"), + "expected TLS init failure in stderr, got: {stderr}" + ); +} + +#[test] +fn encrypted_key_password_is_rejected_with_guidance() { + let fixture = mint_ca_and_leaf(); + let rule = temp_file(".yml", SIMPLE_RULE); + let stderr = spawn_expect_failure( + &[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + "--tls-key-password", + "hunter2", + ], + Duration::from_secs(5), + ); + assert!( + stderr.contains("openssl"), 
+ "stderr should point at openssl for decryption, got: {stderr}" + ); +} + +// Avoid unused-write warning on rcgen's keypair pem helpers across cfg +// permutations. +#[allow(dead_code)] +fn _touch_write() { + let _ = std::io::sink().write_all(b""); +} diff --git a/crates/rsigma-cli/tests/common/mod.rs b/crates/rsigma-cli/tests/common/mod.rs index b28fe97c..6f20a07c 100644 --- a/crates/rsigma-cli/tests/common/mod.rs +++ b/crates/rsigma-cli/tests/common/mod.rs @@ -174,6 +174,11 @@ impl DaemonProcess { format!("http://{}{path}", self.api_addr) } + /// Convenience helper that returns an `https://...` URL. + pub fn https_url(&self, path: &str) -> String { + format!("https://{}{path}", self.api_addr) + } + pub fn api_addr(&self) -> &str { &self.api_addr } @@ -184,6 +189,51 @@ impl DaemonProcess { } } +/// Spawn the daemon with `args`, expecting startup to fail, and return the +/// stderr lines collected until it exits or `deadline` elapses, joined by newlines. +/// +/// Use this when a test wants to assert that a misconfigured invocation +/// (e.g. plaintext bind on `0.0.0.0` without `--allow-plaintext`) refuses +/// to start with a specific error message.
+pub fn spawn_expect_failure(args: &[&str], deadline: Duration) -> String { + let mut child = StdCommand::new(rsigma_bin()) + .args(args) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .expect("failed to spawn rsigma engine daemon"); + + let stderr = child.stderr.take().unwrap(); + let (tx, rx) = std::sync::mpsc::channel::<String>(); + std::thread::spawn(move || { + for line in BufReader::new(stderr).lines() { + let Ok(line) = line else { return }; + let _ = tx.send(line); + } + }); + + let end = Instant::now() + deadline; + let mut collected = Vec::new(); + while Instant::now() < end { + let remaining = end + .checked_duration_since(Instant::now()) + .unwrap_or(Duration::ZERO); + if let Ok(Some(_)) = child.try_wait() { + break; + } + match rx.recv_timeout(remaining.min(Duration::from_millis(200))) { + Ok(line) => { + collected.push(line); + } + Err(_) => continue, + } + } + let _ = child.kill(); + let _ = child.wait(); + collected.join("\n") +} + impl Drop for DaemonProcess { fn drop(&mut self) { self.kill(); From b5a0081ddefac6bf5abfe8f573c02c2acd86e897 Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 13:18:32 +0200 Subject: [PATCH 04/11] docs: document daemon TLS flags, hot-reload, metrics, and security model Adds the user-facing surface for the new `daemon-tls` feature: - `docs/cli/engine/daemon.md`: full TLS flag table and a mutual-TLS example. - `docs/reference/security.md`: replaces the "future work" pointer to #128 with the TLS termination section (in-process termination, mTLS, hot-reload, expiry observability, ACME and encrypted-key scope notes). - `docs/guide/streaming-detection.md`: the HTTP API section calls out the TLS-or-`--allow-plaintext` startup check; the production checklist now reflects the in-process TLS option.
- `docs/guide/otlp-integration.md`: replaces the reverse-proxy-only TLS section with the new `--tls-*` flags plus Alloy, Vector, Fluent Bit, and OpenTelemetry Collector mTLS recipes. - `docs/getting-started/quick-start.md`: short note about the loopback bypass and how to enable TLS for a public bind. - `docs/reference/feature-flags.md`: new `daemon-tls` row in the feature matrix and CI matrix. - `docs/reference/metrics.md`: documents `rsigma_tls_certificate_expiry_seconds` and `rsigma_tls_active_connections`, plus two recommended alerts. - `crates/rsigma-cli/README.md`: TLS flag table and mTLS example block. Verified with `mkdocs build --strict`. --- crates/rsigma-cli/README.md | 22 +++++++++++++++ docs/cli/engine/daemon.md | 32 +++++++++++++++++++++ docs/getting-started/quick-start.md | 2 ++ docs/guide/otlp-integration.md | 24 ++++++++++++++-- docs/guide/streaming-detection.md | 4 +-- docs/reference/feature-flags.md | 2 ++ docs/reference/metrics.md | 21 ++++++++++++++ docs/reference/security.md | 43 +++++++++++++++++++++++++---- 8 files changed, 140 insertions(+), 10 deletions(-) diff --git a/crates/rsigma-cli/README.md b/crates/rsigma-cli/README.md index 2a5c62db..a4fc82bd 100644 --- a/crates/rsigma-cli/README.md +++ b/crates/rsigma-cli/README.md @@ -202,6 +202,17 @@ Unlike `engine eval`, the daemon stays alive after stdin reaches EOF and support | `--nats-tls-key` | path | none | Client private key for mutual TLS (requires `--nats-tls-cert`) | | `--nats-require-tls` | flag | `false` | Require TLS on NATS connections | +**TLS flags** (require `daemon-tls` feature): + +| Argument | Type | Default | Description | +|----------|------|---------|-------------| +| `--tls-cert` | path | none | PEM-encoded leaf certificate (chain) for the API listener. Requires `--tls-key`. | +| `--tls-key` | path | none | PEM-encoded private key (PKCS#8, PKCS#1, or SEC1). Requires `--tls-cert`. 
| +| `--tls-key-password` | string | none | Password for an encrypted `--tls-key` (env: `RSIGMA_TLS_KEY_PASSWORD`). Currently rejected with a clear hint pointing at `openssl rsa` for offline decryption. | +| `--tls-client-ca` | path | none | PEM bundle of trusted CAs for inbound client certificate verification (mTLS). | +| `--tls-min-version` | string | `"1.3"` | Minimum TLS protocol version: `1.2` or `1.3`. | +| `--allow-plaintext` | flag | `false` | Permit plaintext on a non-loopback `--api-addr`. Loopback always allows plaintext. | + \* Feature-gated: `logfmt` requires the `logfmt` feature, `cef` requires the `cef` feature. **Usage:** @@ -246,6 +257,17 @@ rsigma engine daemon -r rules/ --input nats://localhost:4222/events.> --replay-f # Consumer groups for horizontal scaling rsigma engine daemon -r rules/ --input nats://localhost:4222/events.> --consumer-group detection-workers +# Terminate TLS in-process (requires the daemon-tls feature) +rsigma engine daemon -r rules/ --input http --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key + +# Mutual TLS: every agent must present a CA-signed client cert +rsigma engine daemon -r rules/ --input http --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key \ + --tls-client-ca /etc/rsigma/tls/clients-ca.crt + # With SQLite state persistence (correlation state survives restarts) hel run | rsigma engine daemon -r rules/ -p ecs.yml --state-db ./rsigma-state.db diff --git a/docs/cli/engine/daemon.md b/docs/cli/engine/daemon.md index 0cf93ce5..ca7828c1 100644 --- a/docs/cli/engine/daemon.md +++ b/docs/cli/engine/daemon.md @@ -65,6 +65,25 @@ The enrichers file accepts `max_concurrent_enrichments: ` at the top level (d |------|---------|-------------| | `--api-addr ` | `0.0.0.0:9090` | Bind address for `/healthz`, `/readyz`, `/metrics`, `/api/v1/*`, and (with the `daemon-otlp` feature) `/v1/logs`. 
| +### TLS (requires the `daemon-tls` build feature) + +When TLS is configured, the daemon terminates TLS in-process for every protocol on `--api-addr` (HTTP REST API, `/metrics`, OTLP/HTTP, OTLP/gRPC). The negotiation advertises both `h2` and `http/1.1` via ALPN so legacy REST clients and modern gRPC clients share one socket. + +| Flag | Env | Default | Description | +|------|-----|---------|-------------| +| `--tls-cert ` | unset | unset | PEM-encoded leaf certificate (with any intermediates) for the API listener. Requires `--tls-key`. | +| `--tls-key ` | unset | unset | PEM-encoded private key. PKCS#8, PKCS#1 (RSA), and SEC1 (EC) formats are accepted. Requires `--tls-cert`. | +| `--tls-key-password ` | `RSIGMA_TLS_KEY_PASSWORD` | unset | Password for an encrypted `--tls-key`. Currently rejected at startup with a clear error; decrypt with `openssl rsa -in key.pem -out key-decrypted.pem` first. | +| `--tls-client-ca ` | unset | unset | PEM bundle of trusted CA certificates used to verify inbound client certificates. Enables mutual TLS: clients without a cert signed by one of the listed CAs are rejected during the handshake. | +| `--tls-min-version <1.2\|1.3>` | unset | `1.3` | Minimum TLS protocol version. Drop to `1.2` only for legacy agents that cannot negotiate TLS 1.3. | +| `--allow-plaintext` | unset | off | Permit plaintext on a non-loopback `--api-addr`. Without this flag (and without `--tls-cert`/`--tls-key`) the daemon refuses to start on any public address. Loopback (`127.0.0.0/8`, `::1`) always allows plaintext for local development. | + +Hot-reload: `SIGHUP` re-reads the certificate and key from disk and atomically swaps the active `rustls::ServerConfig` for new handshakes; inflight TLS connections are unaffected. Failed reloads keep the previous certificate active and log an error. The hot-reload path piggy-backs on the existing rules-reload signal, so a single `kill -HUP ` rotates rules, pipelines, and TLS material together. 
+ +Observability: the `/metrics` endpoint exposes `rsigma_tls_certificate_expiry_seconds` (signed; negative once the cert has expired) and `rsigma_tls_active_connections`. A single WARN is logged at startup (and after every successful reload) if the active certificate expires within 30 days. + +See [TLS deployment](../../reference/security.md#tls-termination-for-the-api-listener) for a deeper dive, including the sidecar reverse proxy alternative and the ACME scope note. + ### Correlation behavior | Flag | Default | Description | @@ -167,6 +186,19 @@ rsigma engine daemon -r rules/ \ --output "nats://nats.internal:4222/detections.urgent" ``` +### HTTPS with mutual TLS + +```bash +rsigma engine daemon -r rules/ \ + --input http \ + --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key \ + --tls-client-ca /etc/rsigma/tls/clients-ca.crt +``` + +Clients connecting to `https://daemon:9090/v1/logs` (OTLP/HTTP) or `https://daemon:9090/api/v1/events` (REST) must present a certificate signed by `clients-ca.crt` or the handshake is rejected. Rotate the server cert with `cp new.crt /etc/rsigma/tls/server.crt && kill -HUP $(pidof rsigma)`.
See [TLS termination](../reference/security.md#tls-termination-for-the-api-listener) for the full story. + The daemon logs structured JSON to stderr while it starts. Detections are written to stdout as they fire. In another terminal (or after `&` returns control), send an event: ```bash diff --git a/docs/guide/otlp-integration.md b/docs/guide/otlp-integration.md index 3a97899c..9c2001c0 100644 --- a/docs/guide/otlp-integration.md +++ b/docs/guide/otlp-integration.md @@ -208,13 +208,31 @@ Verified with `otelcol-contrib` v0.152. The previous aliases (`filelog`, `otlpht ## TLS -The daemon's `--api-addr` listener does not terminate TLS by itself. For production, put a reverse proxy in front (Caddy, nginx, Envoy) and configure the agents to point at the proxy. The proxy speaks TLS outbound and forwards HTTP/2 + HTTP/1.1 to the daemon's plain socket on a private network. +Build the daemon with the `daemon-tls` feature and pass `--tls-cert`/`--tls-key` to have it terminate TLS in-process for OTLP/HTTP, OTLP/gRPC, and the rest of the API on the same socket. ALPN advertises `h2` and `http/1.1`, so the same listener handles legacy REST clients and modern HTTP/2 gRPC clients. -A future feature could add direct TLS at the daemon, but the reverse-proxy approach lets you reuse your existing TLS automation and cert management. +```bash +rsigma engine daemon -r rules/ \ + --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key +``` + +For agent-to-daemon pinning, add `--tls-client-ca` and require every agent to present a CA-signed client cert. See [TLS termination](../reference/security.md#tls-termination-for-the-api-listener) for the full reference, including SIGHUP-triggered hot-reload of the certificate. + +When `daemon-tls` is not compiled in (or the operator prefers an external TLS terminator), put a reverse proxy in front of the daemon: Caddy, nginx, Envoy, and Traefik all work. 
The proxy speaks TLS outbound and forwards HTTP/2 + HTTP/1.1 to the daemon's plain socket on a private network. + +### Agent recipes with TLS + +Each of the four agents documented above can be pointed at an `https://` endpoint: + +- **Grafana Alloy**: add a `tls { ca_pem = file("/etc/alloy/rsigma-ca.pem") }` block to the `otelcol.exporter.otlphttp` `client` argument. For mTLS, set `cert_pem` and `key_pem` to the agent's client cert. +- **Vector**: under the `opentelemetry` sink, add `tls { ca_file = "/etc/vector/rsigma-ca.pem", verify_certificate = true }`. For mTLS, also set `crt_file` and `key_file`. +- **Fluent Bit**: set `tls On`, `tls.ca_file /etc/fluent-bit/rsigma-ca.pem`. For mTLS, add `tls.crt_file` and `tls.key_file`. +- **OpenTelemetry Collector**: under the `otlphttp/rsigma` exporter, add `tls: { ca_file: /etc/otelcol/rsigma-ca.pem }`. For mTLS, add `cert_file` and `key_file`. ## Authentication -OTLP/HTTP supports standard `Authorization` headers, and the agents above can all set custom headers. The daemon does not validate them currently. If you need authentication, again terminate at a reverse proxy that enforces the header check before forwarding. +OTLP/HTTP supports standard `Authorization` headers, and the agents above can all set custom headers. The daemon does not currently validate bearer or basic auth headers; the recommended authentication path is mutual TLS via `--tls-client-ca`, which pins every agent to a CA-signed identity at the handshake before any HTTP request body is parsed. ## Observability diff --git a/docs/guide/streaming-detection.md b/docs/guide/streaming-detection.md index 13071b6c..868231f8 100644 --- a/docs/guide/streaming-detection.md +++ b/docs/guide/streaming-detection.md @@ -155,7 +155,7 @@ See [NATS Streaming](nats-streaming.md) for the full replay matrix. ## HTTP API -The daemon binds an Axum HTTP server on `--api-addr` (default `0.0.0.0:9090`). 
It serves both REST and Prometheus endpoints, plus OTLP/gRPC and OTLP/HTTP when the feature is enabled. The full reference is in [HTTP API](../reference/http-api.md). Key endpoints: +The daemon binds an Axum HTTP server on `--api-addr` (default `0.0.0.0:9090`). It serves both REST and Prometheus endpoints, plus OTLP/gRPC and OTLP/HTTP when the feature is enabled. With the optional `daemon-tls` build feature and `--tls-cert`/`--tls-key`, the same listener terminates HTTPS for every protocol on one socket (ALPN negotiates `h2` and `http/1.1`). When `daemon-tls` is built in, the daemon refuses to start on a non-loopback `--api-addr` without TLS or `--allow-plaintext`; loopback always allows plaintext for local development. See the [TLS reference](../reference/security.md#tls-termination-for-the-api-listener) for the flag table and hot-reload semantics. The full HTTP reference is in [HTTP API](../reference/http-api.md). Key endpoints: | Path | Method | Purpose | |------|--------|---------| @@ -206,7 +206,7 @@ If the drain timeout expires before the queue empties, the daemon force-exits wi | `--pipeline` references either a builtin (`ecs_windows`, `sysmon`) or a versioned file in the same directory. | Same. | | `--state-db` is set and points to durable storage. | Correlation state survives restarts. | | `--dlq` is configured. | Parse errors and sink failures land somewhere you can audit. | -| `--api-addr` is bound to an internal interface (or behind a proxy). | The management API has no auth. Never expose it on the public internet. | +| `--api-addr` is bound to an internal interface, behind a TLS-terminating proxy, or paired with `--tls-cert`/`--tls-key` (and `--tls-client-ca` for agent pinning). | The management API has no bearer-token auth; rely on mTLS or network isolation, never expose plaintext to the public internet. | | The container runs read-only with capabilities dropped. | See the [Docker guide](../deployment/docker.md). | | Prometheus scrapes `/metrics`. 
| Detect back-pressure, parse errors, DLQ events. | | `/readyz` is wired to the orchestrator's startup probe. | Avoid sending traffic to a daemon that has not loaded rules yet. | diff --git a/docs/reference/feature-flags.md b/docs/reference/feature-flags.md index 70c710b4..a4776ee8 100644 --- a/docs/reference/feature-flags.md +++ b/docs/reference/feature-flags.md @@ -13,6 +13,7 @@ The crate that produces the `rsigma` binary. | `daemon` | yes | `rsigma-runtime`, `tokio`, `axum`, `prometheus`, `notify`, `rusqlite`, `tower-http` | `engine daemon`, the HTTP API server, `/metrics`, hot-reload, SQLite state persistence. The default; disable only for a minimal `engine eval` / `rule *` build. | | `daemon-nats` | no | `daemon` + `async-nats`, `tokio-stream`, `time`, `rsigma-runtime/nats` | NATS JetStream as `--input` and `--output` (and DLQ). All `--nats-*` flags. `RSIGMA_CONSUMER_GROUP`. See [NATS Streaming](../guide/nats-streaming.md). | | `daemon-otlp` | no | `daemon` + `prost`, `tonic`, `flate2`, `rsigma-runtime/otlp` | OTLP/HTTP and OTLP/gRPC receivers on `/v1/logs`. See [OTLP Integration](../guide/otlp-integration.md). | +| `daemon-tls` | no | `daemon` + `rustls` (aws-lc-rs), `tokio-rustls`, `rustls-pemfile`, `rustls-pki-types`, `x509-parser`, `hyper`/`hyper-util` | Server-side TLS termination for the API listener (HTTP REST, `/metrics`, OTLP/HTTP, OTLP/gRPC) with optional mTLS client verification, SIGHUP-triggered cert hot-reload, and two extra Prometheus metrics. See [TLS termination](security.md#tls-termination-for-the-api-listener). | | `logfmt` | no | `rsigma-runtime/logfmt` | `--input-format logfmt` for the daemon and `engine eval`. | | `cef` | no | `rsigma-runtime/cef` | `--input-format cef` for ArcSight-style logs. | | `evtx` | no | `rsigma-runtime/evtx` (dep on the `evtx` crate) | Native `.evtx` file input via `engine eval -e @file.evtx`. See [Input Formats](../guide/input-formats.md#evtx-windows-event-log-feature-gated). 
| @@ -77,6 +78,7 @@ The repo's `ci.yml` matrix tests these combinations on every push: - default (`daemon` on, no extras) - `daemon-nats` - `daemon-otlp` +- `daemon-tls` - `logfmt`, `cef`, `evtx`, `daachorse-index` individually - `--all-features` (the release shape) diff --git a/docs/reference/metrics.md b/docs/reference/metrics.md index a499fc3a..5d832ed7 100644 --- a/docs/reference/metrics.md +++ b/docs/reference/metrics.md @@ -78,6 +78,15 @@ Exposed when the daemon is built with `daemon-otlp` and an OTLP receiver is acti | `rsigma_otlp_log_records_total` | counter | — | Log records ingested via OTLP. | | `rsigma_otlp_errors_total` | counter | `transport`, `reason` (`unsupported_content_type`, `decompression`, `decode`, `channel_closed`) | OTLP request errors. | +## TLS (2 metrics) + +Exposed when the daemon is built with `daemon-tls`. Both metrics render with their `# HELP` and `# TYPE` lines as soon as TLS is configured, even before the first handshake. + +| Metric | Type | Labels | Description | +|--------|------|--------|-------------| +| `rsigma_tls_certificate_expiry_seconds` | gauge | — | Seconds until the active TLS server certificate's `not_after`. Signed: negative once expired. Updated at startup and after every successful SIGHUP-triggered reload. | +| `rsigma_tls_active_connections` | gauge | — | Currently active TLS-terminated connections on the API listener. Decrements on connection close (including handshake failure). | + ## Scrape configuration Minimum Prometheus scrape config: @@ -136,6 +145,18 @@ groups: ) > 1 for: 10m labels: {severity: warning} + + # TLS certificate expires within 14 days. + - alert: RsigmaTlsCertExpiring + expr: rsigma_tls_certificate_expiry_seconds < 14 * 86400 + for: 5m + labels: {severity: warning} + + # TLS certificate has already expired. 
+ - alert: RsigmaTlsCertExpired + expr: rsigma_tls_certificate_expiry_seconds < 0 + for: 1m + labels: {severity: critical} ``` ## Histograms: bucket guidance diff --git a/docs/reference/security.md b/docs/reference/security.md index c31353d6..99904b55 100644 --- a/docs/reference/security.md +++ b/docs/reference/security.md @@ -80,15 +80,48 @@ Custom identifiers passed through `-O table=...` or pipeline `set_state` are val ## Daemon network exposure -The `engine daemon` HTTP and gRPC listeners are unauthenticated today. The recommended deployment shape is one of: +The `engine daemon` HTTP and gRPC listeners share one socket. With the optional `daemon-tls` build feature the daemon terminates TLS in-process; without it a sidecar reverse proxy is the recommended path. The recommended deployment shape is one of: +- Build with `daemon-tls` and pass `--tls-cert`/`--tls-key` to terminate TLS in-process for HTTP REST, OTLP/HTTP, and OTLP/gRPC on the same `--api-addr`. Add `--tls-client-ca` to require mTLS for agent-to-daemon pinning. See [TLS termination](#tls-termination-for-the-api-listener). - Bind to loopback (`--api-addr 127.0.0.1:9090`) and access via a reverse proxy that adds TLS and authentication. Nginx, Caddy, and Traefik all work; an example is documented in [Docker deployment](../deployment/docker.md). - Bind to a private network segment that the SOC controls. -- Future: in-process TLS termination including mTLS for OTLP agents. Tracked at [issue #128](https://github.com/timescale/rsigma/issues/128). + +To prevent accidental cleartext exposure when `daemon-tls` is built in, the daemon refuses to start on a non-loopback `--api-addr` unless either `--tls-cert`/`--tls-key` or `--allow-plaintext` is supplied. Loopback (`127.0.0.0/8`, `::1`) always allows plaintext. NATS connections from the daemon (source, sink, DLQ) support five auth methods (creds file, token, user+password, NKey, mTLS) and TLS-required mode. 
See [NATS Streaming: authentication](../guide/nats-streaming.md#authentication). -OTLP receiver authentication is the upstream agent's responsibility today (TLS terminates upstream of rsigma in any deployment that needs it). +### TLS termination for the API listener + +Pass `--tls-cert` and `--tls-key` together to enable in-process TLS: + +```bash +rsigma engine daemon -r rules/ \ + --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key +``` + +ALPN advertises both `h2` and `http/1.1` so the same listener serves OTLP/gRPC (HTTP/2 framing) and the REST API (HTTP/1.1) without splitting ports. + +For mutual TLS (every agent must present a CA-signed client cert): + +```bash +rsigma engine daemon -r rules/ \ + --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key \ + --tls-client-ca /etc/rsigma/tls/clients-ca.crt +``` + +Use `--tls-min-version 1.2` only when a legacy agent cannot negotiate TLS 1.3. The provider is `aws-lc-rs`, matching the NATS client TLS path and inheriting upstream FIPS-mode work. + +Hot-reload: `SIGHUP` re-reads the certificate and key from disk and atomically swaps the rustls `ServerConfig` for new handshakes via `Arc<ArcSwap<ServerConfig>>`. Inflight TLS connections are not dropped. Failed reloads keep the previous certificate active and log an error so a typo in the cert path cannot black-hole the listener. The same SIGHUP also reloads rules, pipelines, and enrichers, so cert rotation typically piggy-backs on a routine reload. + +Observability: `/metrics` exposes `rsigma_tls_certificate_expiry_seconds` (signed; negative once expired) and `rsigma_tls_active_connections`. A single WARN is logged at startup (and on every successful reload) when the active cert expires within 30 days; wire that line into the existing log-based alerting. + +Out of scope for this feature today: ACME / Let's Encrypt automation.
Operators point `--tls-cert` and `--tls-key` at renewed files (cert-manager, certbot, Vault PKI, ...) and send SIGHUP. Encrypted private keys are also out of scope; the flag (`--tls-key-password` / `RSIGMA_TLS_KEY_PASSWORD`) is reserved for a future release and currently rejects with a clear `openssl rsa` hint. + +OTLP receiver authentication is the upstream agent's responsibility. The recommended pattern is mTLS (`--tls-client-ca`) so every OpenTelemetry agent pins to a known CA without rsigma needing a bearer-token authn layer. ## Filesystem footprint @@ -118,14 +151,14 @@ See [`.github/workflows/docker.yml`](https://github.com/timescale/rsigma/blob/ma ## Threat model summary -In one paragraph: rsigma assumes a trusted operator providing rules, pipelines, and source declarations on disk, plus an event stream from a trusted upstream agent. The hardening here exists to defend against malformed input, unbounded resource consumption (an attacker-controlled JSON event, a rule that recurses without bound, a dynamic source serving 100 GiB of garbage), and supply-chain attacks against dependencies. The daemon HTTP listeners are NOT a hardened public surface; deploy them behind a reverse proxy. The NATS and OTLP entry points support authentication, but mTLS termination for the rsigma HTTP API itself is on the roadmap (issue #128). +In one paragraph: rsigma assumes a trusted operator providing rules, pipelines, and source declarations on disk, plus an event stream from a trusted upstream agent. The hardening here exists to defend against malformed input, unbounded resource consumption (an attacker-controlled JSON event, a rule that recurses without bound, a dynamic source serving 100 GiB of garbage), and supply-chain attacks against dependencies. Daemon HTTP and OTLP listeners can be hardened in-process by building with the `daemon-tls` feature and pairing `--tls-cert`/`--tls-key` with `--tls-client-ca` for mTLS; without that, deploy behind a reverse proxy. 
NATS connections (source, sink, DLQ) support five auth methods plus TLS-required mode. ## See also - [`SECURITY.md`](../security-policy.md) for the disclosure policy. - [Dynamic Pipeline Sources: resource limits](dynamic-sources.md#resource-limits) for the per-source enforcement table. - [NATS Streaming: authentication](../guide/nats-streaming.md#authentication) for the five NATS auth methods and TLS. -- [Issue #128](https://github.com/timescale/rsigma/issues/128) for the planned in-process TLS for the daemon API and OTLP endpoints. +- [`engine daemon` TLS flags](../cli/engine/daemon.md#tls-requires-the-daemon-tls-build-feature) for the user-facing flag table. - [Prometheus metrics: dynamic pipeline sources](metrics.md#dynamic-pipeline-sources-5-metrics) for observability of limit hits. - [`rsigma_runtime::sources`](https://github.com/timescale/rsigma/tree/main/crates/rsigma-runtime/src/sources) for the implementation of the resource limits. - [`rsigma-eval` README: constants and limits](https://github.com/timescale/rsigma/blob/main/crates/rsigma-eval/README.md#constants-and-limits) for the engine-side enforcement. From 17e37c29674816855bd39d01cb73651301062ecb Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 18:14:37 +0200 Subject: [PATCH 05/11] docs(changelog): document #128 server-side TLS for the daemon API Adds the Unreleased entry covering the new `daemon-tls` feature: flag table, plaintext refusal policy, unified serving path, SIGHUP-triggered cert hot-reload, the two new Prometheus metrics, and pointers to the updated reference / guide / quick-start pages. 
--- CHANGELOG.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30832096..47120f94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,40 @@ Each entry corresponds to a [GitHub Release](https://github.com/timescale/rsigma ## [Unreleased] +### Server-side TLS for the daemon API listener (#128) + +The `engine daemon` API listener now terminates TLS in-process for every protocol that already shares `--api-addr`: the Axum HTTP REST API (`/healthz`, `/readyz`, `/metrics`, `/api/v1/*`), OTLP/HTTP on `POST /v1/logs`, and OTLP/gRPC via `LogsService/Export`. Operators can drop the sidecar reverse proxy they previously needed for confidentiality, integrity, and agent-to-daemon pinning. + +**New Cargo feature.** `daemon-tls` on `rsigma-cli` gates the TLS surface and pulls in `rustls` (with the `aws-lc-rs` provider, matching the NATS client TLS path and inheriting upstream FIPS-mode work), `tokio-rustls`, `rustls-pemfile`, `rustls-pki-types`, `x509-parser`, and `hyper`/`hyper-util`. The default build is unchanged. + +**Six new flags on `rsigma engine daemon`.** + +| Flag | Env | Default | Purpose | +|------|-----|---------|---------| +| `--tls-cert ` | -- | -- | PEM-encoded leaf certificate (chain). Requires `--tls-key`. | +| `--tls-key ` | -- | -- | PEM-encoded private key (PKCS#8, PKCS#1, or SEC1). Requires `--tls-cert`. | +| `--tls-key-password ` | `RSIGMA_TLS_KEY_PASSWORD` | -- | Password for an encrypted `--tls-key`. Currently rejected with a clear hint pointing at `openssl rsa` for offline decryption; reserved for a future release. | +| `--tls-client-ca ` | -- | -- | PEM bundle of trusted CAs. Enables mutual TLS: clients without a CA-signed cert are rejected during the handshake. | +| `--tls-min-version <1.2\|1.3>` | -- | `1.3` | Minimum negotiated TLS protocol version. | +| `--allow-plaintext` | -- | off | Opt-in for plaintext on a non-loopback `--api-addr`. 
| + +**Plaintext refusal policy.** When `daemon-tls` is built in, the daemon refuses to start on any non-loopback address unless either `--tls-cert`/`--tls-key` or `--allow-plaintext` is supplied. Loopback (`127.0.0.0/8`, `::1`) always allows plaintext to keep local development friction-free. + +**Unified serving path.** The implementation collapses the previous split between `axum::serve` (for plaintext non-OTLP) and `tonic::transport::Server::serve_with_incoming_shutdown` (for OTLP) into a single `axum::Router` built via `tonic::service::Routes::into_axum_router`. For TLS, a small custom `axum::serve::Listener` wraps the `TcpListener` and performs the `tokio-rustls` handshake on every accepted connection. ALPN advertises both `h2` and `http/1.1`, so the same socket continues to serve REST + Prometheus + OTLP/HTTP + gRPC after TLS termination. + +**SIGHUP cert hot-reload.** The existing rules-reload signal also re-reads the certificate and key from disk and atomically swaps the active `rustls::ServerConfig` via `Arc<ArcSwap<ServerConfig>>`. Inflight TLS connections are not dropped; failed reloads keep the previous certificate active and log an error so a typo in the path cannot black-hole the listener. Encrypted-key support and ACME/Let's Encrypt automation are intentionally out of scope; operators rotate cert files (cert-manager, certbot, Vault PKI, ...) and send `kill -HUP`. + +**Two new Prometheus metrics.** + +| Metric | Type | Description | +|--------|------|-------------| +| `rsigma_tls_certificate_expiry_seconds` | gauge | Seconds until the active TLS server certificate's `not_after`. Signed: negative once expired. Updated at startup and after every successful reload. | +| `rsigma_tls_active_connections` | gauge | Currently active TLS-terminated connections on the API listener. Decrements on connection close (including handshake failure).
| + +A single WARN is logged at startup (and after every successful reload) when the active cert expires within 30 days, so operators can plug the line into existing log-based alerting alongside the longer-horizon Prometheus alert on `rsigma_tls_certificate_expiry_seconds`. + +**Docs.** Full reference under "TLS termination for the API listener" in `docs/reference/security.md`; flag table in `docs/cli/engine/daemon.md`; agent recipes (Grafana Alloy, Vector, Fluent Bit, OpenTelemetry Collector) with `tls`/mTLS blocks in `docs/guide/otlp-integration.md`; quick-start note in `docs/getting-started/quick-start.md`; new row in `docs/reference/feature-flags.md`; two new alerts in `docs/reference/metrics.md`. + ### Deprecated CLI aliases hidden from `--help` (#125) The 12 flat top-level CLI aliases (`eval`, `daemon`, `parse`, `validate`, `lint`, `fields`, `condition`, `stdin`, `convert`, `list-targets`, `list-formats`, `resolve`) introduced as visible-deprecated forwarders in v0.12.0 (PR #124) are now hidden from `rsigma --help` via `#[command(hide = true)]`. The dispatch arms and the `deprecation_warn` helper are otherwise unchanged, so: From 348da7606058138d7ca84fdfcd995b3d7c9431ae Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 18:22:07 +0200 Subject: [PATCH 06/11] chore(deps): drop unmaintained rustls-pemfile in favor of rustls-pki-types `rustls-pemfile` was flagged as unmaintained by RUSTSEC-2025-0134 on 2025-11-28 (repo archived in August 2025). The advisory itself notes that the crate is "in fact a thin wrapper around the same code used in rustls-pki-types" and points consumers at the `PemObject` trait. 
Migrates the daemon TLS module to call `CertificateDer::pem_file_iter` and `PrivateKeyDer::from_pem_file` directly, bumps the `rustls-pki-types` minimum to 1.9 (where the `pem` module landed), and removes `rustls-pemfile` from both the `daemon-tls` feature dependency list and the dev-dependencies (the tests already used `rustls-pki-types::pem::PemObject`). `cargo tree -i rustls-pemfile` now reports the crate is no longer in the dependency graph; `cargo audit` no longer reports RUSTSEC-2025-0134. --- Cargo.lock | 10 ------- crates/rsigma-cli/Cargo.toml | 7 ++--- crates/rsigma-cli/src/daemon/tls.rs | 46 +++++++++++++++++------------ 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0198efc6..d4c5859e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3857,7 +3857,6 @@ dependencies = [ "rsigma-runtime", "rusqlite", "rustls", - "rustls-pemfile", "rustls-pki-types", "serde", "serde_json", @@ -4090,15 +4089,6 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" version = "1.14.1" diff --git a/crates/rsigma-cli/Cargo.toml b/crates/rsigma-cli/Cargo.toml index a48b5635..f949884a 100644 --- a/crates/rsigma-cli/Cargo.toml +++ b/crates/rsigma-cli/Cargo.toml @@ -18,7 +18,6 @@ daemon-tls = [ "daemon", "dep:rustls", "dep:tokio-rustls", - "dep:rustls-pemfile", "dep:rustls-pki-types", "dep:x509-parser", "dep:hyper", @@ -72,8 +71,7 @@ flate2 = { version = "1", optional = true } # daemon-tls dependencies rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs", "std", "tls12", "logging"], optional = true } tokio-rustls = { version = "0.26", default-features = false, features = ["aws_lc_rs", "tls12", "logging"], optional = true } -rustls-pemfile = { 
version = "2", optional = true } -rustls-pki-types = { version = "1", optional = true } +rustls-pki-types = { version = "1.9", features = ["std"], optional = true } x509-parser = { version = "0.18", optional = true } hyper = { version = "1", features = ["server", "http1", "http2"], optional = true } hyper-util = { version = "0.1", features = ["server-auto", "tokio", "service"], optional = true } @@ -98,6 +96,5 @@ wiremock = "0.6" flate2 = "1" rcgen = { version = "0.14", default-features = false, features = ["aws_lc_rs", "pem"] } rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs", "std", "tls12"] } -rustls-pemfile = "2" -rustls-pki-types = "1" +rustls-pki-types = { version = "1.9", features = ["std"] } tokio-rustls = { version = "0.26", default-features = false, features = ["aws_lc_rs", "tls12"] } diff --git a/crates/rsigma-cli/src/daemon/tls.rs b/crates/rsigma-cli/src/daemon/tls.rs index 46154398..9c640937 100644 --- a/crates/rsigma-cli/src/daemon/tls.rs +++ b/crates/rsigma-cli/src/daemon/tls.rs @@ -15,13 +15,13 @@ #![cfg(feature = "daemon-tls")] -use std::fs; -use std::io::{self, BufReader}; +use std::io; use std::net::SocketAddr; use std::path::{Path, PathBuf}; use std::sync::Arc; use arc_swap::ArcSwap; +use rustls::pki_types::pem::PemObject; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use rustls::server::WebPkiClientVerifier; use rustls::{RootCertStore, ServerConfig}; @@ -201,11 +201,17 @@ fn build_server_config(cli: &TlsCliConfig) -> Result { } /// Read a PEM bundle of one or more certificates. +/// +/// Uses the `PemObject` API from `rustls-pki-types` directly. The +/// `rustls-pemfile` crate is unmaintained as of RUSTSEC-2025-0134 and +/// was always a thin wrapper around this same code; consuming it +/// straight from `rustls-pki-types` avoids the advisory without +/// changing behavior. 
fn load_certs(path: &Path) -> Result>, TlsError> { - let file = fs::File::open(path).map_err(|e| TlsError::Io(e, path.to_path_buf()))?; - let mut reader = BufReader::new(file); - let certs: Result, _> = rustls_pemfile::certs(&mut reader).collect(); - let certs = certs.map_err(|e| TlsError::Io(e, path.to_path_buf()))?; + let certs: Vec> = CertificateDer::pem_file_iter(path) + .map_err(|e| pem_error_to_tls(e, path))? + .collect::, _>>() + .map_err(|e| pem_error_to_tls(e, path))?; if certs.is_empty() { return Err(TlsError::NoCertificates(path.to_path_buf())); } @@ -214,23 +220,15 @@ fn load_certs(path: &Path) -> Result>, TlsError> { /// Read a PEM-encoded private key (PKCS#8, RSA, or SEC1/EC). fn load_private_key(path: &Path) -> Result, TlsError> { - let file = fs::File::open(path).map_err(|e| TlsError::Io(e, path.to_path_buf()))?; - let mut reader = BufReader::new(file); - let key = rustls_pemfile::private_key(&mut reader) - .map_err(|e| TlsError::Io(e, path.to_path_buf()))? - .ok_or_else(|| TlsError::NoPrivateKey(path.to_path_buf()))?; - Ok(key) + PrivateKeyDer::from_pem_file(path).map_err(|e| match e { + rustls::pki_types::pem::Error::NoItemsFound => TlsError::NoPrivateKey(path.to_path_buf()), + other => pem_error_to_tls(other, path), + }) } /// Load a PEM bundle of trusted CA certificates for mTLS verification. 
fn load_client_ca_roots(path: &Path) -> Result { - let file = fs::File::open(path).map_err(|e| TlsError::Io(e, path.to_path_buf()))?; - let mut reader = BufReader::new(file); - let certs: Result, _> = rustls_pemfile::certs(&mut reader).collect(); - let certs = certs.map_err(|e| TlsError::Io(e, path.to_path_buf()))?; - if certs.is_empty() { - return Err(TlsError::NoCertificates(path.to_path_buf())); - } + let certs = load_certs(path)?; let mut roots = RootCertStore::empty(); for (idx, cert) in certs.into_iter().enumerate() { roots.add(cert).map_err(|e| { @@ -240,6 +238,16 @@ fn load_client_ca_roots(path: &Path) -> Result { Ok(roots) } +/// Translate a `rustls-pki-types` PEM error to our `TlsError` variant, +/// preserving the source path so the operator-facing message names the +/// file that failed. +fn pem_error_to_tls(err: rustls::pki_types::pem::Error, path: &Path) -> TlsError { + match err { + rustls::pki_types::pem::Error::Io(io_err) => TlsError::Io(io_err, path.to_path_buf()), + other => TlsError::InvalidCertificate(path.to_path_buf(), other.to_string()), + } +} + /// Read the leaf certificate from `path` and return its `not_after` as a /// Unix timestamp. pub fn read_cert_expiry(path: &Path) -> Result { From 95f1d2eea168167034f15e5b8f48e0061dbb7ec8 Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 18:50:46 +0200 Subject: [PATCH 07/11] fix(daemon-tls): silence dead-code lint on non-Unix targets `TlsState::cli` and `TlsState::reload` are only exercised from the SIGHUP listener in `daemon::reload`, which lives behind `#[cfg(unix)]`. On Windows the non-Unix stub variant of `sighup_listener` accepts the TLS state but never invokes `reload`, so the dead-code lint flags both the field and the method. 
Scopes the existing fix-by-design (keep the public type platform-agnostic so the rest of the daemon code doesn't need extra cfg gates) with a targeted `#[cfg_attr(not(unix), allow(dead_code))]` on just the two items the lint complains about, leaving the rest of the struct under normal dead-code analysis. Verified with `cargo clippy -p rsigma --all-features --all-targets --target x86_64-pc-windows-gnu -- -D warnings`. --- crates/rsigma-cli/src/daemon/tls.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/crates/rsigma-cli/src/daemon/tls.rs b/crates/rsigma-cli/src/daemon/tls.rs index 9c640937..cb9d1ff8 100644 --- a/crates/rsigma-cli/src/daemon/tls.rs +++ b/crates/rsigma-cli/src/daemon/tls.rs @@ -81,6 +81,11 @@ pub struct TlsState { /// Atomically swappable `ServerConfig` used by every new handshake. pub config: Arc>, /// Original CLI config so SIGHUP can re-read cert/key from disk. + /// Only read by the SIGHUP handler in [`super::reload::sighup_listener`], + /// which is `#[cfg(unix)]`. On Windows the field is intentionally kept + /// in the struct so the public type stays platform-agnostic, hence the + /// scoped `allow(dead_code)`. + #[cfg_attr(not(unix), allow(dead_code))] pub cli: TlsCliConfig, /// Unix timestamp (seconds) at which the active cert expires. Updated /// on every successful reload so the Prometheus gauge stays accurate. @@ -104,6 +109,10 @@ impl TlsState { /// Returns the new expiry timestamp so callers can update the /// Prometheus gauge. The previous config remains active if the /// reload fails, mirroring the rules-reload contract. + /// + /// Only invoked from the `#[cfg(unix)]` SIGHUP path; Windows daemons + /// rotate certificates by restarting the process. 
+ #[cfg_attr(not(unix), allow(dead_code))] pub fn reload(&self) -> Result { let new_config = build_server_config(&self.cli)?; let new_expiry = read_cert_expiry(&self.cli.cert_path)?; From 9dc9c5410b9fde708b176f0c72d8651599f9588e Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 19:09:50 +0200 Subject: [PATCH 08/11] refactor(daemon-tls): fold cert reload into the central reload task Drops the platform-specific TLS reload path that previously lived in the SIGHUP listener and folds it into the same reload task that already handles rules + enrichers. Every reload trigger (file watcher, SIGHUP, `POST /api/v1/reload`) now funnels through one debounced task, so: - Windows daemons can rotate certificates via the HTTP endpoint, with the exact same semantics as a Unix `kill -HUP`. This was the motivating gap; previously the `cfg(unix)` SIGHUP path was the only way to trigger `TlsState::reload`. - The dead-code workaround on `TlsState::cli` and `TlsState::reload` goes away: both are now exercised cross-platform. - A failing reload bumps `rsigma_reloads_failed_total`, matching the enrichers contract: the previous certificate stays active so a typo in the cert path cannot black-hole the listener. The SIGHUP handler now just routes the signal into `reload_tx` like the file watcher and HTTP handler do, with no TLS-specific knowledge. Two new integration tests in `cli_daemon_tls`: - `http_reload_endpoint_rotates_tls_certificate` mints a 30-day leaf, overwrites the cert/key files in place with a 365-day leaf signed by the same CA, POSTs `/api/v1/reload`, and asserts that `rsigma_tls_certificate_expiry_seconds` jumps from ~30 to ~365 days via `/metrics` polling. - `http_reload_with_invalid_cert_keeps_previous_one` corrupts the cert file on disk and confirms a subsequent HTTPS GET still succeeds because the previous chain stayed live. 
`time = "0.3"` is added to dev-dependencies so the reload test can set explicit `not_before` / `not_after` on the rcgen-minted certificates. Verified with: - `cargo test -p rsigma --no-default-features --features daemon-tls --test cli_daemon_tls --test-threads=1` (12 passed) - `cargo clippy -p rsigma --all-features --all-targets -- -D warnings` - `cargo clippy -p rsigma --all-features --all-targets --target x86_64-pc-windows-gnu -- -D warnings` --- crates/rsigma-cli/Cargo.toml | 1 + crates/rsigma-cli/src/daemon/reload.rs | 36 +--- crates/rsigma-cli/src/daemon/server.rs | 51 +++-- crates/rsigma-cli/src/daemon/tls.rs | 18 +- crates/rsigma-cli/tests/cli_daemon_tls.rs | 248 ++++++++++++++++++++-- 5 files changed, 281 insertions(+), 73 deletions(-) diff --git a/crates/rsigma-cli/Cargo.toml b/crates/rsigma-cli/Cargo.toml index f949884a..66accd00 100644 --- a/crates/rsigma-cli/Cargo.toml +++ b/crates/rsigma-cli/Cargo.toml @@ -98,3 +98,4 @@ rcgen = { version = "0.14", default-features = false, features = ["aws_lc_rs", " rustls = { version = "0.23", default-features = false, features = ["aws_lc_rs", "std", "tls12"] } rustls-pki-types = { version = "1.9", features = ["std"] } tokio-rustls = { version = "0.26", default-features = false, features = ["aws_lc_rs", "tls12"] } +time = "0.3" diff --git a/crates/rsigma-cli/src/daemon/reload.rs b/crates/rsigma-cli/src/daemon/reload.rs index 46492aa5..1122fc3f 100644 --- a/crates/rsigma-cli/src/daemon/reload.rs +++ b/crates/rsigma-cli/src/daemon/reload.rs @@ -73,17 +73,20 @@ pub fn spawn_file_watcher( Some(watcher) } -/// Set up a SIGHUP handler that sends reload signals and source re-resolution -/// triggers, and (when `daemon-tls` is built in) also re-reads the configured -/// TLS certificate and key from disk and atomically swaps the rustls -/// `ServerConfig` so new handshakes pick up the rotated material without -/// dropping inflight connections. 
+/// Set up a SIGHUP handler that signals the central reload task and +/// kicks off source re-resolution. +/// +/// SIGHUP just routes the signal into the same `reload_tx` channel the +/// file watcher and `POST /api/v1/reload` use, so every reload trigger +/// funnels through the one debounced reload task in `server::run_daemon`. +/// That task is what actually re-reads rules, enrichers, and (when +/// `daemon-tls` is built in) the TLS certificate and key. The Windows +/// build has no SIGHUP equivalent; operators rotate certs via +/// `POST /api/v1/reload` instead. #[cfg(unix)] pub async fn sighup_listener( reload_tx: mpsc::Sender<()>, sources_trigger_tx: Option>, - #[cfg(feature = "daemon-tls")] tls_state: Option, - #[cfg(feature = "daemon-tls")] tls_metrics: std::sync::Arc, ) { use tokio::signal::unix::{SignalKind, signal}; @@ -102,23 +105,6 @@ pub async fn sighup_listener( if let Some(tx) = &sources_trigger_tx { let _ = tx.try_send(rsigma_runtime::sources::refresh::RefreshTrigger::All); } - - #[cfg(feature = "daemon-tls")] - if let Some(ref state) = tls_state { - match state.reload() { - Ok(new_expiry) => { - super::server::update_tls_metrics(&tls_metrics, new_expiry); - super::server::warn_if_cert_expiring_soon(new_expiry); - tracing::info!(not_after = new_expiry, "TLS certificate hot-reloaded"); - } - Err(e) => { - tracing::error!( - error = %e, - "Failed to reload TLS certificate; keeping previous one active" - ); - } - } - } } } @@ -126,8 +112,6 @@ pub async fn sighup_listener( pub async fn sighup_listener( _reload_tx: mpsc::Sender<()>, _sources_trigger_tx: Option>, - #[cfg(feature = "daemon-tls")] _tls_state: Option, - #[cfg(feature = "daemon-tls")] _tls_metrics: std::sync::Arc, ) { std::future::pending::<()>().await; } diff --git a/crates/rsigma-cli/src/daemon/server.rs b/crates/rsigma-cli/src/daemon/server.rs index bc5bef03..d4f80743 100644 --- a/crates/rsigma-cli/src/daemon/server.rs +++ b/crates/rsigma-cli/src/daemon/server.rs @@ -488,27 +488,19 @@ 
pub async fn run_daemon(config: DaemonConfig) { tracing::info!(addr = %actual_addr, "API server listening"); } - // Spawn SIGHUP listener (triggers rule reload, source re-resolution, - // and TLS cert reload when daemon-tls is built in). + // Spawn SIGHUP listener (Unix-only; routes the signal into the + // same `reload_tx` channel the file watcher and HTTP endpoint + // use, so every reload trigger funnels through one task). let sighup_reload_tx = reload_tx.clone(); let sighup_sources_tx = sources_trigger_tx_val.clone(); - #[cfg(feature = "daemon-tls")] - let sighup_tls = tls_state.clone(); - #[cfg(feature = "daemon-tls")] - let sighup_tls_metrics = metrics.clone(); tokio::spawn(async move { - reload::sighup_listener( - sighup_reload_tx, - sighup_sources_tx, - #[cfg(feature = "daemon-tls")] - sighup_tls, - #[cfg(feature = "daemon-tls")] - sighup_tls_metrics, - ) - .await; + reload::sighup_listener(sighup_reload_tx, sighup_sources_tx).await; }); - // Spawn reload handler — uses LogProcessor::reload_rules for atomic hot-reload + // Spawn reload handler — uses LogProcessor::reload_rules for atomic hot-reload. + // Also re-reads enricher config and (when `daemon-tls` is built in) the TLS + // certificate / key so a single `POST /api/v1/reload`, SIGHUP, or file-watcher + // event rotates every hot-reloadable component in one debounced pass. 
let reload_processor = processor.clone(); let reload_metrics = metrics.clone(); let reload_health = health.clone(); @@ -516,6 +508,10 @@ pub async fn run_daemon(config: DaemonConfig) { let reload_enrichers_path = config.enrichers_path.clone(); let reload_enrichment_metrics = enrichment_metrics.clone(); let reload_source_cache = initial_source_cache.clone(); + #[cfg(feature = "daemon-tls")] + let reload_tls_state = tls_state.clone(); + #[cfg(feature = "daemon-tls")] + let reload_tls_metrics = metrics.clone(); tokio::spawn(async move { while reload_rx.recv().await.is_some() { // Debounce: batch rapid file changes @@ -585,6 +581,29 @@ pub async fn run_daemon(config: DaemonConfig) { } } } + + // Reload TLS certificate / key from disk when daemon-tls is + // built in and configured. Failures keep the previous + // certificate active so a typo in the cert path cannot + // black-hole the listener; the operator sees the error in + // the daemon log and via `rsigma_reloads_failed_total`. + #[cfg(feature = "daemon-tls")] + if let Some(ref state) = reload_tls_state { + match state.reload() { + Ok(new_expiry) => { + update_tls_metrics(&reload_tls_metrics, new_expiry); + warn_if_cert_expiring_soon(new_expiry); + tracing::info!(not_after = new_expiry, "TLS certificate hot-reloaded"); + } + Err(e) => { + tracing::error!( + error = %e, + "Failed to reload TLS certificate; keeping previous one active" + ); + reload_metrics.reloads_failed.inc(); + } + } + } } }); diff --git a/crates/rsigma-cli/src/daemon/tls.rs b/crates/rsigma-cli/src/daemon/tls.rs index cb9d1ff8..0a00b6b7 100644 --- a/crates/rsigma-cli/src/daemon/tls.rs +++ b/crates/rsigma-cli/src/daemon/tls.rs @@ -80,12 +80,10 @@ impl std::str::FromStr for TlsMinVersion { pub struct TlsState { /// Atomically swappable `ServerConfig` used by every new handshake. pub config: Arc>, - /// Original CLI config so SIGHUP can re-read cert/key from disk. 
- /// Only read by the SIGHUP handler in [`super::reload::sighup_listener`], - /// which is `#[cfg(unix)]`. On Windows the field is intentionally kept - /// in the struct so the public type stays platform-agnostic, hence the - /// scoped `allow(dead_code)`. - #[cfg_attr(not(unix), allow(dead_code))] + /// Original CLI config so the reload path can re-read cert/key + /// from disk on every `POST /api/v1/reload`, file-watcher event, + /// or SIGHUP (the three triggers all funnel through the daemon's + /// central reload task). pub cli: TlsCliConfig, /// Unix timestamp (seconds) at which the active cert expires. Updated /// on every successful reload so the Prometheus gauge stays accurate. @@ -108,11 +106,9 @@ impl TlsState { /// /// Returns the new expiry timestamp so callers can update the /// Prometheus gauge. The previous config remains active if the - /// reload fails, mirroring the rules-reload contract. - /// - /// Only invoked from the `#[cfg(unix)]` SIGHUP path; Windows daemons - /// rotate certificates by restarting the process. - #[cfg_attr(not(unix), allow(dead_code))] + /// reload fails, mirroring the rules-reload contract. Invoked + /// cross-platform from the central reload task on every reload + /// trigger (SIGHUP, file-watcher, `POST /api/v1/reload`). pub fn reload(&self) -> Result { let new_config = build_server_config(&self.cli)?; let new_expiry = read_cert_expiry(&self.cli.cert_path)?; diff --git a/crates/rsigma-cli/tests/cli_daemon_tls.rs b/crates/rsigma-cli/tests/cli_daemon_tls.rs index 1c7ab619..9e1a3ef7 100644 --- a/crates/rsigma-cli/tests/cli_daemon_tls.rs +++ b/crates/rsigma-cli/tests/cli_daemon_tls.rs @@ -47,6 +47,10 @@ struct TlsFixture { /// suitable for both `serverAuth` and `clientAuth`. Returns paths plus a /// `RootCertStore` clients can use to verify the server. 
fn mint_ca_and_leaf() -> TlsFixture { + mint_ca_and_leaf_with_validity(time::Duration::days(30)) +} + +fn mint_ca_and_leaf_with_validity(validity: time::Duration) -> TlsFixture { let mut ca_params = CertificateParams::new(Vec::::new()).unwrap(); ca_params .distinguished_name @@ -62,25 +66,11 @@ fn mint_ca_and_leaf() -> TlsFixture { let ca_pem = ca_cert.pem(); let ca_issuer = Issuer::new(ca_params, ca_key); - let mut leaf_params = CertificateParams::new(vec!["localhost".to_string()]).unwrap(); - leaf_params - .subject_alt_names - .push(rcgen::SanType::IpAddress(std::net::IpAddr::from([ - 127, 0, 0, 1, - ]))); - leaf_params - .distinguished_name - .push(DnType::CommonName, "rsigma-test-server"); - leaf_params.extended_key_usages = vec![ - ExtendedKeyUsagePurpose::ServerAuth, - ExtendedKeyUsagePurpose::ClientAuth, - ]; - let leaf_key = KeyPair::generate().unwrap(); - let leaf_cert = leaf_params.signed_by(&leaf_key, &ca_issuer).unwrap(); + let leaf = mint_leaf_pem(&ca_issuer, validity); let ca_file = temp_file(".pem", &ca_pem); - let cert_file = temp_file(".pem", &leaf_cert.pem()); - let key_file = temp_file(".pem", &leaf_key.serialize_pem()); + let cert_file = temp_file(".pem", &leaf.cert); + let key_file = temp_file(".pem", &leaf.key); let mut store = RootCertStore::empty(); for cert in rustls::pki_types::CertificateDer::pem_slice_iter(ca_pem.as_bytes()) { @@ -99,6 +89,39 @@ fn mint_ca_and_leaf() -> TlsFixture { } } +struct LeafPem { + cert: String, + key: String, +} + +/// Sign a fresh leaf certificate suitable for the test daemon +/// (`localhost` + `127.0.0.1`, `serverAuth` + `clientAuth`) with an +/// explicit validity window. 
+fn mint_leaf_pem(issuer: &Issuer<'_, KeyPair>, validity: time::Duration) -> LeafPem { + let mut leaf_params = CertificateParams::new(vec!["localhost".to_string()]).unwrap(); + leaf_params + .subject_alt_names + .push(rcgen::SanType::IpAddress(std::net::IpAddr::from([ + 127, 0, 0, 1, + ]))); + leaf_params + .distinguished_name + .push(DnType::CommonName, "rsigma-test-server"); + leaf_params.extended_key_usages = vec![ + ExtendedKeyUsagePurpose::ServerAuth, + ExtendedKeyUsagePurpose::ClientAuth, + ]; + let now = time::OffsetDateTime::now_utc(); + leaf_params.not_before = now; + leaf_params.not_after = now + validity; + let leaf_key = KeyPair::generate().unwrap(); + let leaf_cert = leaf_params.signed_by(&leaf_key, issuer).unwrap(); + LeafPem { + cert: leaf_cert.pem(), + key: leaf_key.serialize_pem(), + } +} + /// Mint a client certificate signed by the supplied CA issuer for mTLS /// positive-path tests. fn mint_client_cert(issuer: &Issuer<'_, KeyPair>) -> (NamedTempFile, NamedTempFile) { @@ -149,10 +172,34 @@ fn https_get( addr: &str, path: &str, config: ClientConfig, +) -> Result<(u16, String), Box> { + https_request("GET", addr, path, None, config) +} + +/// Synchronous HTTPS POST with an optional body. Returns (status, body). 
+fn https_post( + addr: &str, + path: &str, + body: &str, + config: ClientConfig, +) -> Result<(u16, String), Box> { + https_request("POST", addr, path, Some(body), config) +} + +fn https_request( + method: &str, + addr: &str, + path: &str, + body: Option<&str>, + config: ClientConfig, ) -> Result<(u16, String), Box> { let rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build()?; + let method = method.to_string(); + let addr = addr.to_string(); + let path = path.to_string(); + let body = body.map(|s| s.to_string()); rt.block_on(async move { let host = addr.split(':').next().unwrap_or("127.0.0.1"); let socket: std::net::SocketAddr = addr.parse()?; @@ -160,13 +207,20 @@ fn https_get( let connector = TlsConnector::from(Arc::new(config)); let server_name = ServerName::try_from(host.to_string()).unwrap(); let mut tls = connector.connect(server_name, tcp).await?; - let req = format!("GET {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n"); + let req = match body.as_ref() { + Some(b) => format!( + "{method} {path} HTTP/1.1\r\nHost: {host}\r\nContent-Length: {}\r\nContent-Type: application/json\r\nConnection: close\r\n\r\n{b}", + b.len() + ), + None => format!( + "{method} {path} HTTP/1.1\r\nHost: {host}\r\nConnection: close\r\n\r\n" + ), + }; tls.write_all(req.as_bytes()).await?; let mut buf = Vec::new(); tls.read_to_end(&mut buf).await?; let response = String::from_utf8_lossy(&buf).into_owned(); - let mut lines = response.splitn(2, "\r\n"); - let status_line = lines.next().unwrap_or(""); + let status_line = response.split("\r\n").next().unwrap_or(""); let status: u16 = status_line .split_whitespace() .nth(1) @@ -181,6 +235,20 @@ fn https_get( }) } +/// Read the `rsigma_tls_certificate_expiry_seconds` gauge value (a +/// signed float, in seconds) from a Prometheus text-format scrape body. 
+fn parse_expiry_metric(body: &str) -> Option { + for line in body.lines() { + if line.starts_with('#') { + continue; + } + if let Some(rest) = line.strip_prefix("rsigma_tls_certificate_expiry_seconds ") { + return rest.trim().parse::().ok(); + } + } + None +} + // --------------------------------------------------------------------------- // Plaintext refusal policy // --------------------------------------------------------------------------- @@ -495,6 +563,146 @@ fn encrypted_key_password_is_rejected_with_guidance() { ); } +// --------------------------------------------------------------------------- +// Cross-platform cert hot-reload +// --------------------------------------------------------------------------- + +#[test] +fn http_reload_endpoint_rotates_tls_certificate() { + // Mint a CA + an initial leaf with a ~30 day validity, spawn the + // daemon, then overwrite the cert/key files on disk with a freshly + // signed leaf that has a deliberately longer validity. POSTing to + // `/api/v1/reload` (which works on every platform, including + // Windows, unlike SIGHUP) should atomically pick up the new + // material and bump `rsigma_tls_certificate_expiry_seconds` + // accordingly. + let fixture = mint_ca_and_leaf_with_validity(time::Duration::days(30)); + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + ]); + + // Initial expiry should be around 30 days (2_592_000 seconds). 
+ let (status, body) = https_get( + daemon.api_addr(), + "/metrics", + client_config(fixture.root_store.clone()), + ) + .expect("initial /metrics scrape failed"); + assert_eq!(status, 200); + let initial_expiry = + parse_expiry_metric(&body).expect("expiry gauge missing from initial scrape"); + assert!( + initial_expiry > 25.0 * 86_400.0 && initial_expiry < 31.0 * 86_400.0, + "initial expiry should be ~30 days, got {initial_expiry} seconds" + ); + + // Sign a fresh leaf with the same CA but a 365-day validity and + // overwrite the cert/key files in place. + let new_leaf = mint_leaf_pem(&fixture.ca_issuer, time::Duration::days(365)); + std::fs::write(&fixture.cert_path, &new_leaf.cert).unwrap(); + std::fs::write(&fixture.key_path, &new_leaf.key).unwrap(); + + // Trigger the cross-platform reload path. The handler queues the + // reload; the actual cert swap happens after the 500 ms debounce + // in the central reload task. + let (status, body) = https_post( + daemon.api_addr(), + "/api/v1/reload", + "", + client_config(fixture.root_store.clone()), + ) + .expect("reload POST failed"); + assert!( + status == 200 || status == 429, + "reload POST should return 200 or 429, got {status} ({body})" + ); + + // Wait for the debounced reload, then scrape /metrics again and + // confirm the gauge moved to roughly 365 days. We use a generous + // poll window because the reload task sleeps 500 ms before + // draining; macOS file watchers also fire while we wait. + let new_expiry = common::poll_until(std::time::Duration::from_secs(10), || { + let (s, b) = https_get( + daemon.api_addr(), + "/metrics", + client_config(fixture.root_store.clone()), + ) + .ok()?; + if s != 200 { + return None; + } + let v = parse_expiry_metric(&b)?; + // 365 days is ~31_536_000 seconds; anything past 60 days proves + // the rotation took effect. 
+ (v > 60.0 * 86_400.0).then_some(v) + }) + .expect("expiry gauge never reflected rotated certificate within 10s"); + assert!( + new_expiry > 360.0 * 86_400.0 && new_expiry < 366.0 * 86_400.0, + "post-reload expiry should be ~365 days, got {new_expiry} seconds" + ); +} + +#[test] +fn http_reload_with_invalid_cert_keeps_previous_one() { + // After a reload with a broken cert file, the previous chain stays + // live and the daemon keeps serving HTTPS without an interruption. + let fixture = mint_ca_and_leaf_with_validity(time::Duration::days(30)); + let rule = temp_file(".yml", SIMPLE_RULE); + let daemon = DaemonProcess::spawn(&[ + "engine", + "daemon", + "-r", + rule.path().to_str().unwrap(), + "--input", + "http", + "--api-addr", + "127.0.0.1:0", + "--tls-cert", + &fixture.cert_path, + "--tls-key", + &fixture.key_path, + ]); + + // Corrupt the cert file in place. + std::fs::write(&fixture.cert_path, b"not a pem certificate\n").unwrap(); + + // Best-effort reload trigger; status may be 200 (queued) or 429 + // (already pending from the on-startup file-watcher event). + let _ = https_post( + daemon.api_addr(), + "/api/v1/reload", + "", + client_config(fixture.root_store.clone()), + ); + + // Wait past the 500 ms debounce so the reload task definitely + // attempted (and rejected) the broken material. + std::thread::sleep(std::time::Duration::from_millis(1_500)); + + // The original certificate is still trusted, so a fresh HTTPS GET + // against `/healthz` must still succeed. + let (status, body) = https_get( + daemon.api_addr(), + "/healthz", + client_config(fixture.root_store.clone()), + ) + .expect("HTTPS should still succeed after a failed reload"); + assert_eq!(status, 200, "body: {body}"); +} + // Avoid unused-write warning on rcgen's keypair pem helpers across cfg // permutations. 
#[allow(dead_code)] From f45b9cdea3970713789067fa0a4893a5b4049e44 Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 19:13:41 +0200 Subject: [PATCH 09/11] docs: cert hot-reload is cross-platform via POST /api/v1/reload Updates the TLS docs (CLI reference, security reference, CHANGELOG) to reflect that cert rotation no longer requires SIGHUP. Every reload trigger (HTTP POST, SIGHUP on Unix, file watcher) funnels through the same debounced task, so the HTTP endpoint works as the Windows-friendly equivalent of `kill -HUP`. Adds an explicit Windows-compatible rotation recipe to the mutual-TLS example, and points the failed-reload note at \`rsigma_reloads_failed_total\` so operators can wire an alert. --- CHANGELOG.md | 2 +- docs/cli/engine/daemon.md | 4 ++-- docs/reference/security.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47120f94..20b65929 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,7 +26,7 @@ The `engine daemon` API listener now terminates TLS in-process for every protoco **Unified serving path.** The implementation collapses the previous split between `axum::serve` (for plaintext non-OTLP) and `tonic::transport::Server::serve_with_incoming_shutdown` (for OTLP) into a single `axum::Router` built via `tonic::service::Routes::into_axum_router`. For TLS, a small custom `axum::serve::Listener` wraps the `TcpListener` and performs the `tokio-rustls` handshake on every accepted connection. ALPN advertises both `h2` and `http/1.1`, so the same socket continues to serve REST + Prometheus + OTLP/HTTP + gRPC after TLS termination. -**SIGHUP cert hot-reload.** The existing rules-reload signal also re-reads the certificate and key from disk and atomically swaps the active `rustls::ServerConfig` via `Arc<ArcSwap<ServerConfig>>`. Inflight TLS connections are not dropped; failed reloads keep the previous certificate active and log an error so a typo in the path cannot black-hole the listener. 
Encrypted-key support and ACME/Let's Encrypt automation are intentionally out of scope; operators rotate cert files (cert-manager, certbot, Vault PKI, ...) and send `kill -HUP`. +**Cross-platform cert hot-reload.** Cert rotation funnels through the daemon's central debounced reload task, which is triggered by `POST /api/v1/reload` (works on every platform, including Windows), `SIGHUP` (Unix), or a YAML change picked up by the file watcher. All three paths re-read the certificate and key from disk and atomically swap the active `rustls::ServerConfig` via `Arc<ArcSwap<ServerConfig>>`. Inflight TLS connections are not dropped; failed reloads keep the previous certificate active, bump `rsigma_reloads_failed_total`, and log an error so a typo in the cert path cannot black-hole the listener. Encrypted-key support and ACME/Let's Encrypt automation are intentionally out of scope; operators rotate cert files (cert-manager, certbot, Vault PKI, ...) and trigger a reload. **Two new Prometheus metrics.** diff --git a/docs/cli/engine/daemon.md b/docs/cli/engine/daemon.md index ca7828c1..9396450b 100644 --- a/docs/cli/engine/daemon.md +++ b/docs/cli/engine/daemon.md @@ -78,7 +78,7 @@ When TLS is configured, the daemon terminates TLS in-process for every protocol | `--tls-min-version <1.2\|1.3>` | unset | `1.3` | Minimum TLS protocol version. Drop to `1.2` only for legacy agents that cannot negotiate TLS 1.3. | | `--allow-plaintext` | unset | off | Permit plaintext on a non-loopback `--api-addr`. Without this flag (and without `--tls-cert`/`--tls-key`) the daemon refuses to start on any public address. Loopback (`127.0.0.0/8`, `::1`) always allows plaintext for local development. | -Hot-reload: `SIGHUP` re-reads the certificate and key from disk and atomically swaps the active `rustls::ServerConfig` for new handshakes; inflight TLS connections are unaffected. Failed reloads keep the previous certificate active and log an error. 
The hot-reload path piggy-backs on the existing rules-reload signal, so a single `kill -HUP ` rotates rules, pipelines, and TLS material together. +Hot-reload: every reload trigger funnels through the daemon's central debounced reload task, so a single `POST /api/v1/reload` (cross-platform, including Windows), `kill -HUP ` (Unix), or a YAML file change picked up by the file watcher rotates rules, pipelines, enrichers, and the TLS certificate in one pass. The active `rustls::ServerConfig` is swapped atomically via `Arc>`, so new handshakes pick up the rotated material without dropping inflight TLS connections. Failed reloads keep the previous certificate active, bump `rsigma_reloads_failed_total`, and log an error so a typo in the cert path cannot black-hole the listener. Observability: the `/metrics` endpoint exposes `rsigma_tls_certificate_expiry_seconds` (signed; negative once the cert has expired) and `rsigma_tls_active_connections`. A single WARN is logged at startup (and after every reload) if the active certificate expires within 30 days. @@ -197,7 +197,7 @@ rsigma engine daemon -r rules/ \ --tls-client-ca /etc/rsigma/tls/clients-ca.crt ``` -Clients connecting to `https://daemon:9090/v1/logs` (OTLP/HTTP) or `https://daemon:9090/api/v1/events` (REST) must present a certificate signed by `clients-ca.crt` or the handshake is rejected. Rotate the server cert with `cp new.crt /etc/rsigma/tls/server.crt && kill -HUP $(pidof rsigma)`. +Clients connecting to `https://daemon:9090/v1/logs` (OTLP/HTTP) or `https://daemon:9090/api/v1/events` (REST) must present a certificate signed by `clients-ca.crt` or the handshake is rejected. Rotate the server cert with `cp new.crt /etc/rsigma/tls/server.crt && kill -HUP $(pidof rsigma)` on Unix, or `cp new.crt … && curl -X POST https://daemon:9090/api/v1/reload` on any platform (including Windows, where SIGHUP does not exist). 
### Forensic replay from a NATS sequence diff --git a/docs/reference/security.md b/docs/reference/security.md index 99904b55..4896bad7 100644 --- a/docs/reference/security.md +++ b/docs/reference/security.md @@ -115,7 +115,7 @@ rsigma engine daemon -r rules/ \ Use `--tls-min-version 1.2` only when a legacy agent cannot negotiate TLS 1.3. The provider is `aws-lc-rs`, matching the NATS client TLS path and inheriting upstream FIPS-mode work. -Hot-reload: `SIGHUP` re-reads the certificate and key from disk and atomically swaps the rustls `ServerConfig` for new handshakes via `Arc>`. Inflight TLS connections are not dropped. Failed reloads keep the previous certificate active and log an error so a typo in the cert path cannot black-hole the listener. The same SIGHUP also reloads rules, pipelines, and enrichers, so cert rotation typically piggy-backs on a routine reload. +Hot-reload: cert rotation funnels through the daemon's central debounced reload task, which is triggered by `POST /api/v1/reload` (works on every platform, including Windows), `SIGHUP` (Unix), or a YAML change picked up by the file watcher. All three paths re-read the certificate and key from disk and atomically swap the rustls `ServerConfig` via `Arc>`. Inflight TLS connections are not dropped. Failed reloads keep the previous certificate active, bump `rsigma_reloads_failed_total`, and log an error so a typo in the cert path cannot black-hole the listener. The same trigger also reloads rules, pipelines, and enrichers, so cert rotation typically piggy-backs on a routine reload. Observability: `/metrics` exposes `rsigma_tls_certificate_expiry_seconds` (signed; negative once expired) and `rsigma_tls_active_connections`. A single WARN is logged at startup (and on every successful reload) when the active cert expires within 30 days; wire that line into the existing log-based alerting. 
From 67a91c453a59713f8ca24925057c41e0a006784a Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 19:18:41 +0200 Subject: [PATCH 10/11] docs(readme): surface daemon-tls in top-level and CLI READMEs Top-level README: - Adds in-process TLS termination (with cross-platform cert hot-reload) to the feature list so it shows up in the at-a-glance bullet section. - New "TLS Termination" subsection alongside NATS / OTLP with a plain HTTPS and an mTLS recipe, plus a note about the cert hot-reload path and the two new Prometheus gauges. CLI README: - After the TLS flag table, documents the plaintext refusal policy, the shared HTTP/1 + HTTP/2 socket via ALPN, the aws-lc-rs provider, and the cross-platform hot-reload story. - Updates the "Hot-reload triggers" section to call out that `POST /api/v1/reload` is the cross-platform recommendation (especially for Windows where SIGHUP does not exist) and that every trigger now also rotates the TLS certificate alongside rules, pipelines, and enrichers. - Adds the two new TLS metrics (`rsigma_tls_certificate_expiry_seconds` and `rsigma_tls_active_connections`) to the Prometheus metrics table. - Broadens the `/api/v1/reload` endpoint description from "manual rule reload" to the full set of components it now rotates. The runtime crate README is intentionally unchanged: the `daemon-tls` feature lives on `rsigma-cli`, and the runtime crate has no awareness of TLS configuration. Verified with `mkdocs build --strict`. 
--- README.md | 20 ++++++++++++++++++++ crates/rsigma-cli/README.md | 10 ++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3170b0ef..3482cb5e 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,7 @@ For rule quality and editor integration, a built-in linter validates rules again * Convert rules into backend-native query strings via a pluggable backend trait (PostgreSQL/TimescaleDB SQL, LynxDB) * Optional eval prefilters for large rule sets: bloom filter for substring matchers (`--bloom-prefilter`) and cross-rule Aho-Corasick index for whole-rule pruning (`--cross-rule-ac`, requires `daachorse-index` feature) * Run as a streaming detection daemon with hot-reload, Prometheus metrics, and HTTP/NATS/OTLP input +* In-process TLS termination for the daemon API listener (HTTP REST, `/metrics`, OTLP/HTTP, OTLP/gRPC) with optional mutual TLS, `aws-lc-rs` crypto, and cross-platform certificate hot-reload * NATS JetStream support with authentication (credentials, mTLS), replay, consumer groups, and dead-letter queues * OTLP support for any OpenTelemetry-compatible agent (Grafana Alloy, Vector, Fluent Bit, OTel Collector) via HTTP or gRPC * Built-in linter with 66 checks, four severity levels, a full suppression system, and auto-fix (`--fix`) for 13 safe rules @@ -176,6 +177,25 @@ rsigma engine daemon -r rules/ --input nats://localhost:4222/events.> --consumer rsigma engine daemon -r rules/ --input nats://localhost:4222/events.> --dlq file:///var/log/rsigma-dlq.ndjson ``` +### TLS Termination + +Optional in-process TLS termination for the daemon's API listener (HTTP REST, `/metrics`, OTLP/HTTP, OTLP/gRPC), all on one socket. Requires the `daemon-tls` build feature. ALPN advertises both `h2` and `http/1.1` so legacy REST clients and modern gRPC clients share the listener. The daemon refuses to start on a non-loopback `--api-addr` without TLS or `--allow-plaintext`; loopback always allows plaintext for local development. 
+ +```bash +# HTTPS for every protocol on --api-addr +rsigma engine daemon -r rules/ --input http --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key + +# Mutual TLS: every agent must present a CA-signed client cert +rsigma engine daemon -r rules/ --input http --api-addr 0.0.0.0:9090 \ + --tls-cert /etc/rsigma/tls/server.crt \ + --tls-key /etc/rsigma/tls/server.key \ + --tls-client-ca /etc/rsigma/tls/clients-ca.crt +``` + +Certificate hot-reload is cross-platform: `POST /api/v1/reload`, `SIGHUP` (Unix), or a YAML file change picked up by the file watcher all trigger the central reload task, which re-reads the cert/key from disk and atomically swaps the active `rustls::ServerConfig` via `Arc>` without dropping inflight connections. Two extra Prometheus gauges (`rsigma_tls_certificate_expiry_seconds` and `rsigma_tls_active_connections`) make the rotation observable. + ### Input Formats and Pipelines Events are parsed with auto-detection by default (JSON, syslog, plain text). Feature-gated formats: `logfmt`, `cef`, `evtx`. Processing pipelines handle field mapping between source schemas and Sigma field names. diff --git a/crates/rsigma-cli/README.md b/crates/rsigma-cli/README.md index a4fc82bd..a920803c 100644 --- a/crates/rsigma-cli/README.md +++ b/crates/rsigma-cli/README.md @@ -213,6 +213,8 @@ Unlike `engine eval`, the daemon stays alive after stdin reaches EOF and support | `--tls-min-version` | string | `"1.3"` | Minimum TLS protocol version: `1.2` or `1.3`. | | `--allow-plaintext` | flag | `false` | Permit plaintext on a non-loopback `--api-addr`. Loopback always allows plaintext. | +When the `daemon-tls` feature is built in, the daemon refuses to start on a non-loopback `--api-addr` without `--tls-cert`/`--tls-key` or an explicit `--allow-plaintext` opt-in. 
With TLS configured, the same socket serves HTTP REST, `/metrics`, OTLP/HTTP, and OTLP/gRPC over a single TLS connection via ALPN (advertises both `h2` and `http/1.1`). Crypto provider is `aws-lc-rs`, matching the NATS client TLS path. Certificate hot-reload is cross-platform: any of the hot-reload triggers below re-reads the cert/key from disk and atomically swaps the active `rustls::ServerConfig` via `Arc>` without dropping inflight TLS connections. + \* Feature-gated: `logfmt` requires the `logfmt` feature, `cef` requires the `cef` feature. **Usage:** @@ -304,7 +306,7 @@ rsigma engine daemon \ | `/metrics` | GET | Prometheus metrics (events processed, matches, latency, rules loaded, etc.) | | `/api/v1/status` | GET | Full daemon status (rules, state entries, counters, uptime) | | `/api/v1/rules` | GET | Rule counts and rules path | -| `/api/v1/reload` | POST | Trigger a manual rule reload | +| `/api/v1/reload` | POST | Trigger a manual reload of rules, pipelines, enrichers, and (with `daemon-tls`) the TLS certificate. Cross-platform alternative to `SIGHUP`. | | `/api/v1/events` | POST | Ingest events (NDJSON body, one event per line). 
Only available with `--input http` | | `/api/v1/sources` | GET | List dynamic sources and their resolution status | | `/api/v1/sources/resolve` | POST | Trigger re-resolution of all dynamic sources (or specific ones via request body) | @@ -344,10 +346,12 @@ curl -X POST http://localhost:9090/v1/logs \ - File system changes to `.yml`/`.yaml` files in the rules directory (debounced 500ms) - `SIGHUP` signal (Unix only) -- triggers both rule reload and dynamic source re-resolution -- `POST /api/v1/reload` +- `POST /api/v1/reload` -- cross-platform; the recommended cert-rotation path on Windows - `POST /api/v1/sources/resolve` -- re-resolves dynamic sources without reloading rules - NATS control subject `rsigma.control.resolve` (when using NATS sources) -- payload can be empty (resolve all) or `{"source_id": "..."}` (resolve one) +The first three triggers all funnel through one debounced reload task that re-reads rules, pipelines, enrichers, and (when `daemon-tls` is built in) the TLS certificate and key. A failed reload of any component bumps `rsigma_reloads_failed_total`, logs an error, and leaves the previous in-memory state active so a typo on disk cannot black-hole the daemon. + **Prometheus metrics:** | Metric | Type | Labels | Description | @@ -379,6 +383,8 @@ curl -X POST http://localhost:9090/v1/logs \ | `rsigma_otlp_requests_total` | counter | `transport`, `encoding` | OTLP export requests received (requires `daemon-otlp`) | | `rsigma_otlp_log_records_total` | counter | | Log records ingested via OTLP (requires `daemon-otlp`) | | `rsigma_otlp_errors_total` | counter | `transport`, `reason` | OTLP request errors (requires `daemon-otlp`) | +| `rsigma_tls_certificate_expiry_seconds` | gauge | | Seconds until the active TLS server certificate's `not_after` (signed; negative once expired). 
Requires `daemon-tls` | +| `rsigma_tls_active_connections` | gauge | | Currently active TLS-terminated connections on the API listener (requires `daemon-tls`) | The per-rule labeled counters (`_by_rule_total`) enable per-rule alerting in Grafana or other Prometheus-based tools. A single PromQL query like `increase(rsigma_detection_matches_by_rule_total[5m]) > 0` produces separate alert instances for each `{rule_title, level}` combination. The aggregate counters (`_total`) remain for lightweight total-throughput monitoring. From 26c3e65d1f9f4b04c9c25a13436b01034603174f Mon Sep 17 00:00:00 2001 From: Mostafa Moradian Date: Fri, 22 May 2026 19:58:37 +0200 Subject: [PATCH 11/11] test(common): rewrite wildcard bind addr to loopback before probing `public_bind_with_allow_plaintext_starts` failed on Windows because the daemon binds to `0.0.0.0:0` and the test then tries to TCP-connect back to the captured `0.0.0.0:`. Linux and macOS silently route that to loopback; Windows returns `WSAEADDRNOTAVAIL`, so the readiness probe (and any subsequent `http_get` via `daemon.url(...)`) times out. Adds a small `rewrite_wildcard_to_loopback` helper in the shared test harness that converts `0.0.0.0:PORT` -> `127.0.0.1:PORT` and `[::]:PORT` -> `[::1]:PORT` before storing the address. The daemon is listening on every interface anyway, so loopback is always reachable. No production code changes. 
Verified with: - `cargo test -p rsigma --no-default-features --features daemon-tls --test cli_daemon_tls --test-threads=1` (12 passed) - `cargo clippy -p rsigma --all-features --all-targets --target x86_64-pc-windows-gnu -- -D warnings` --- crates/rsigma-cli/tests/common/mod.rs | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/crates/rsigma-cli/tests/common/mod.rs b/crates/rsigma-cli/tests/common/mod.rs index 6f20a07c..a236849e 100644 --- a/crates/rsigma-cli/tests/common/mod.rs +++ b/crates/rsigma-cli/tests/common/mod.rs @@ -138,6 +138,15 @@ impl DaemonProcess { } } + // The daemon may log a wildcard bind address like `0.0.0.0:PORT` + // (or `[::]:PORT`). Connecting to a wildcard address returns + // `WSAEADDRNOTAVAIL` on Windows. Linux and macOS silently treat + // it as loopback, so the same test was green there. Rewrite the + // recorded address to the loopback equivalent before probing + // and before exposing it via `url()`; the daemon listens on + // every interface so loopback is always reachable. + let api_addr = rewrite_wildcard_to_loopback(api_addr); + let socket: std::net::SocketAddr = api_addr .parse() .unwrap_or_else(|e| panic!("invalid api_addr {api_addr:?}: {e}")); @@ -248,6 +257,24 @@ fn extract_addr(line: &str) -> Option<String> { .and_then(|v| v["fields"]["addr"].as_str().map(|s| s.to_string())) } +/// Rewrite a wildcard bind address (`0.0.0.0:PORT` or `[::]:PORT`) to the +/// loopback equivalent. Connecting to a wildcard works on Linux/macOS +/// (silently routed to loopback) but fails with `WSAEADDRNOTAVAIL` on +/// Windows, which made `public_bind_with_allow_plaintext_starts` flake +/// only on Windows CI before this rewrite.
+fn rewrite_wildcard_to_loopback(addr: String) -> String { + match addr.parse::<std::net::SocketAddr>() { + Ok(parsed) if parsed.ip().is_unspecified() => { + let port = parsed.port(); + match parsed { + std::net::SocketAddr::V4(_) => format!("127.0.0.1:{port}"), + std::net::SocketAddr::V6(_) => format!("[::1]:{port}"), + } + } + _ => addr, + } +} + // --------------------------------------------------------------------------- // HTTP and polling helpers // ---------------------------------------------------------------------------