diff --git a/Cargo.lock b/Cargo.lock index dfc0310..b46e978 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,7 +6,6 @@ version = 4 name = "NVRC" version = "0.1.1" dependencies = [ - "anyhow", "cfg-if", "kernlog", "libc", @@ -19,12 +18,6 @@ dependencies = [ "tempfile", ] -[[package]] -name = "anyhow" -version = "1.0.100" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" - [[package]] name = "autocfg" version = "1.4.0" diff --git a/Cargo.toml b/Cargo.toml index 3d11cf1..e13fdf3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.1" edition = "2021" [dependencies] -anyhow = "1.0.100" nix = { version = "0.30.1", features = ["fs", "mount", "user", "process", "reboot", "signal", "mman", "poll", "ioctl"] } cfg-if = "1.0.4" log = "0.4.29" diff --git a/src/daemon.rs b/src/daemon.rs index bbe4f7d..7016303 100644 --- a/src/daemon.rs +++ b/src/daemon.rs @@ -1,9 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (c) NVIDIA CORPORATION -use anyhow::{Context, Result}; - use crate::execute::background; +use crate::macros::ResultExt; use crate::nvrc::NVRC; use std::fs; @@ -41,62 +40,58 @@ impl NVRC { /// nvidia-persistenced keeps GPU state warm between container invocations, /// reducing cold-start latency. UVM persistence mode enables unified memory /// optimizations. Enabled by default since most workloads benefit from it. - pub fn nvidia_persistenced(&mut self) -> Result<()> { + pub fn nvidia_persistenced(&mut self) { self.spawn_persistenced("/var/run/nvidia-persistenced", "/bin/nvidia-persistenced") } - fn spawn_persistenced(&mut self, run_dir: &str, bin: &str) -> Result<()> { - fs::create_dir_all(run_dir).with_context(|| format!("create_dir_all {}", run_dir))?; + fn spawn_persistenced(&mut self, run_dir: &str, bin: &str) { + fs::create_dir_all(run_dir).or_panic(format_args!("create_dir_all {run_dir}")); let uvm_enabled = self.uvm_persistence_mode.unwrap_or(true); let args = persistenced_args(uvm_enabled); - let child = background(bin, &args)?; + let child = background(bin, &args); self.track_daemon("nvidia-persistenced", child); - Ok(()) } /// nv-hostengine is the DCGM backend daemon. Only started when DCGM monitoring /// is explicitly requested - not needed for basic GPU workloads. - pub fn nv_hostengine(&mut self) -> Result<()> { + pub fn nv_hostengine(&mut self) { self.spawn_hostengine("/bin/nv-hostengine") } - fn spawn_hostengine(&mut self, bin: &str) -> Result<()> { + fn spawn_hostengine(&mut self, bin: &str) { if !self.dcgm_enabled.unwrap_or(false) { - return Ok(()); + return; } - let child = background(bin, hostengine_args())?; + let child = background(bin, hostengine_args()); self.track_daemon("nv-hostengine", child); - Ok(()) } /// dcgm-exporter exposes GPU metrics for Prometheus. Only started when DCGM /// is enabled - adds overhead so disabled by default. - pub fn dcgm_exporter(&mut self) -> Result<()> { + pub fn dcgm_exporter(&mut self) { self.spawn_dcgm_exporter("/bin/dcgm-exporter") } - fn spawn_dcgm_exporter(&mut self, bin: &str) -> Result<()> { + fn spawn_dcgm_exporter(&mut self, bin: &str) { if !self.dcgm_enabled.unwrap_or(false) { - return Ok(()); + return; } - let child = background(bin, dcgm_exporter_args())?; + let child = background(bin, dcgm_exporter_args()); self.track_daemon("dcgm-exporter", child); - Ok(()) } /// NVSwitch fabric manager is only needed for multi-GPU NVLink topologies. /// Disabled by default since most VMs have single GPUs. - pub fn nv_fabricmanager(&mut self) -> Result<()> { + pub fn nv_fabricmanager(&mut self) { self.spawn_fabricmanager("/bin/nv-fabricmanager") } - fn spawn_fabricmanager(&mut self, bin: &str) -> Result<()> { + fn spawn_fabricmanager(&mut self, bin: &str) { if !self.fabricmanager_enabled.unwrap_or(false) { - return Ok(()); + return; } - let child = background(bin, fabricmanager_args())?; + let child = background(bin, fabricmanager_args()); self.track_daemon("nv-fabricmanager", child); - Ok(()) } } @@ -152,20 +147,20 @@ mod tests { fn test_nv_hostengine_skipped_by_default() { // DCGM disabled by default - should be a no-op, no daemon spawned let mut nvrc = NVRC::default(); - assert!(nvrc.nv_hostengine().is_ok()); - assert!(nvrc.check_daemons().is_ok()); + nvrc.nv_hostengine(); + nvrc.check_daemons(); } #[test] fn test_dcgm_exporter_skipped_by_default() { let mut nvrc = NVRC::default(); - assert!(nvrc.dcgm_exporter().is_ok()); + nvrc.dcgm_exporter(); } #[test] fn test_nv_fabricmanager_skipped_by_default() { let mut nvrc = NVRC::default(); - assert!(nvrc.nv_fabricmanager().is_ok()); + nvrc.nv_fabricmanager(); } #[test] @@ -174,14 +169,13 @@ mod tests { let run_dir = tmpdir.path().join("nvidia-persistenced"); let mut nvrc = NVRC::default(); - let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true"); - assert!(result.is_ok()); + nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true"); // Directory should be created assert!(run_dir.exists()); // Daemon should be tracked and exit cleanly - assert!(nvrc.check_daemons().is_ok()); + nvrc.check_daemons(); } #[test] @@ -191,48 +185,48 @@ mod tests { let mut nvrc = NVRC::default(); nvrc.uvm_persistence_mode = Some(false); // Tests the else branch for args - let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true"); - assert!(result.is_ok()); + nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true"); } #[test] fn test_spawn_hostengine_success() { let mut nvrc = NVRC::default(); nvrc.dcgm_enabled = Some(true); - let result = nvrc.spawn_hostengine("/bin/true"); - assert!(result.is_ok()); - assert!(nvrc.check_daemons().is_ok()); + nvrc.spawn_hostengine("/bin/true"); + nvrc.check_daemons(); } #[test] fn test_spawn_dcgm_exporter_success() { let mut nvrc = NVRC::default(); nvrc.dcgm_enabled = Some(true); - let result = nvrc.spawn_dcgm_exporter("/bin/true"); - assert!(result.is_ok()); + nvrc.spawn_dcgm_exporter("/bin/true"); } #[test] fn test_spawn_fabricmanager_success() { let mut nvrc = NVRC::default(); nvrc.fabricmanager_enabled = Some(true); - let result = nvrc.spawn_fabricmanager("/bin/true"); - assert!(result.is_ok()); + nvrc.spawn_fabricmanager("/bin/true"); } #[test] fn test_spawn_persistenced_binary_not_found() { + use std::panic; + let tmpdir = TempDir::new().unwrap(); let run_dir = tmpdir.path().join("nvidia-persistenced"); - let mut nvrc = NVRC::default(); - let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/nonexistent/binary"); + let result = panic::catch_unwind(|| { + let mut nvrc = NVRC::default(); + nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/nonexistent/binary"); + }); assert!(result.is_err()); } #[test] fn test_check_daemons_empty() { let mut nvrc = NVRC::default(); - assert!(nvrc.check_daemons().is_ok()); + nvrc.check_daemons(); } } diff --git a/src/execute.rs b/src/execute.rs index 3522689..3ad79c7 100644 --- a/src/execute.rs +++ b/src/execute.rs @@ -1,81 +1,81 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (c) NVIDIA CORPORATION -use anyhow::{anyhow, Context, Result}; use std::process::{Child, Command, Stdio}; use crate::kmsg::kmsg; +use crate::macros::ResultExt; /// Run a command and block until completion. Output goes to kmsg so it appears /// in dmesg/kernel log - the only reliable log destination in minimal VMs. /// Used for setup commands that must succeed before continuing (nvidia-smi, modprobe). -pub fn foreground(command: &str, args: &[&str]) -> Result<()> { +pub fn foreground(command: &str, args: &[&str]) { debug!("{} {}", command, args.join(" ")); - let kmsg_file = kmsg().context("Failed to open kmsg device")?; + let kmsg_file = kmsg(); let status = Command::new(command) .args(args) .stdout(Stdio::from(kmsg_file.try_clone().unwrap())) .stderr(Stdio::from(kmsg_file)) .status() - .context(format!("failed to execute {command}"))?; + .or_panic(format_args!("execute {command}")); if !status.success() { - return Err(anyhow!("{} failed with status: {}", command, status)); + panic!("{command} failed with status: {status}"); } - Ok(()) } /// Spawn a daemon without waiting. Returns Child so caller can track it later. /// Used for long-running services (nvidia-persistenced, fabricmanager) that run /// alongside kata-agent. Output to kmsg for visibility in kernel log. -pub fn background(command: &str, args: &[&str]) -> Result { +pub fn background(command: &str, args: &[&str]) -> Child { debug!("{} {}", command, args.join(" ")); - let kmsg_file = kmsg().context("Failed to open kmsg device")?; + let kmsg_file = kmsg(); Command::new(command) .args(args) .stdout(Stdio::from(kmsg_file.try_clone().unwrap())) .stderr(Stdio::from(kmsg_file)) .spawn() - .with_context(|| format!("Failed to start {}", command)) + .or_panic(format_args!("start {command}")) } #[cfg(test)] mod tests { use super::*; + use std::panic; // ==================== foreground tests ==================== #[test] fn test_foreground_success() { - let result = foreground("/bin/true", &[]); - assert!(result.is_ok()); + foreground("/bin/true", &[]); } #[test] fn test_foreground_failure_exit_code() { - // Command runs but exits non-zero - let result = foreground("/bin/false", &[]); + // Command runs but exits non-zero - should panic + let result = panic::catch_unwind(|| { + foreground("/bin/false", &[]); + }); assert!(result.is_err()); - let err = result.unwrap_err().to_string(); - assert!(err.contains("failed")); } #[test] fn test_foreground_not_found() { - // Command doesn't exist - triggers .context() error path - let result = foreground("/nonexistent/command", &[]); + // Command doesn't exist - should panic + let result = panic::catch_unwind(|| { + foreground("/nonexistent/command", &[]); + }); assert!(result.is_err()); - let err = result.unwrap_err().to_string(); - assert!(err.contains("execute")); } #[test] fn test_foreground_with_args() { - let result = foreground("/bin/sh", &["-c", "exit 0"]); - assert!(result.is_ok()); + foreground("/bin/sh", &["-c", "exit 0"]); - let result = foreground("/bin/sh", &["-c", "exit 42"]); + let result = panic::catch_unwind(|| { + foreground("/bin/sh", &["-c", "exit 42"]); + }); assert!(result.is_err()); } @@ -83,27 +83,23 @@ mod tests { #[test] fn test_background_spawns() { - let result = background("/bin/sleep", &["0.01"]); - assert!(result.is_ok()); - let mut child = result.unwrap(); + let mut child = background("/bin/sleep", &["0.01"]); let status = child.wait().unwrap(); assert!(status.success()); } #[test] fn test_background_not_found() { - // Command doesn't exist - triggers .with_context() error path - let result = background("/nonexistent/command", &[]); + // Command doesn't exist - should panic + let result = panic::catch_unwind(|| { + background("/nonexistent/command", &[]); + }); assert!(result.is_err()); - let err = result.unwrap_err().to_string(); - assert!(err.contains("start"), "error should mention start: {}", err); } #[test] fn test_background_check_later() { - let result = background("/bin/sh", &["-c", "exit 7"]); - assert!(result.is_ok()); - let mut child = result.unwrap(); + let mut child = background("/bin/sh", &["-c", "exit 7"]); let status = child.wait().unwrap(); assert!(!status.success()); assert_eq!(status.code(), Some(7)); diff --git a/src/kata_agent.rs b/src/kata_agent.rs index 3c3060b..0bce567 100644 --- a/src/kata_agent.rs +++ b/src/kata_agent.rs @@ -1,8 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (c) NVIDIA CORPORATION -use anyhow::{anyhow, Context, Result}; -use log::{debug, error}; +use log::debug; use nix::unistd::{fork, ForkResult}; use rlimit::{setrlimit, Resource}; use std::fs; @@ -23,49 +22,45 @@ const KATA_AGENT_OOM_SCORE_ADJ: &str = "-997"; /// kata-agent needs high file descriptor limits for container workloads and /// must survive OOM conditions to maintain VM stability -fn agent_setup() -> Result<()> { +fn agent_setup() { let nofile = 1024 * 1024; - setrlimit(Resource::NOFILE, nofile, nofile).context("setrlimit RLIMIT_NOFILE")?; + setrlimit(Resource::NOFILE, nofile, nofile).expect("setrlimit RLIMIT_NOFILE"); fs::write( "/proc/self/oom_score_adj", KATA_AGENT_OOM_SCORE_ADJ.as_bytes(), ) - .context("write /proc/self/oom_score_adj")?; - let lim = rlimit::getrlimit(Resource::NOFILE)?; + .expect("write /proc/self/oom_score_adj"); + let lim = rlimit::getrlimit(Resource::NOFILE).expect("getrlimit RLIMIT_NOFILE"); debug!("kata-agent RLIMIT_NOFILE: {:?}", lim); - Ok(()) } /// exec() replaces this process with kata-agent, so it only returns on failure. /// We want kata-agent to become PID 1's child for proper process hierarchy. -fn exec_agent(cmd: &str) -> Result<()> { +fn exec_agent(cmd: &str) { let err = Command::new(cmd).exec(); - Err(anyhow!("exec {} failed: {err}", cmd)) + panic!("exec {cmd} failed: {err}"); } /// Path parameter enables testing with /bin/true instead of real kata-agent -fn kata_agent(path: &str) -> Result<()> { - agent_setup()?; - exec_agent(path) +fn kata_agent(path: &str) { + agent_setup(); + exec_agent(path); } /// Guest VMs lack a syslog daemon, so we poll /dev/log to drain messages /// and forward them to kmsg. Timeout enables testing without infinite loops. -fn syslog_loop(timeout_secs: u32) -> Result<()> { +fn syslog_loop(timeout_secs: u32) { let iterations = (timeout_secs as u64) * 2; // 500ms per iteration for _ in 0..iterations { sleep(Duration::from_millis(500)); - if let Err(e) = crate::syslog::poll() { - return Err(anyhow!("poll syslog: {e}")); - } + crate::syslog::poll(); } - Ok(()) } /// Parent execs kata-agent (becoming it), child stays as syslog poller. /// This way kata-agent inherits our PID and becomes the main guest process. /// Timeout parameter allows tests to verify the fork/syslog logic exits cleanly -pub fn fork_agent(timeout_secs: u32) -> Result<()> { +pub fn fork_agent(timeout_secs: u32) { // SAFETY: fork() is safe here because: // 1. We are PID 1 with no other threads (single-threaded process) // 2. Parent immediately execs kata-agent (no shared state issues) @@ -73,15 +68,12 @@ pub fn fork_agent(timeout_secs: u32) -> Result<()> { // 4. No locks or mutexes exist that could deadlock in child match unsafe { fork() }.expect("fork agent") { ForkResult::Parent { .. } => { - kata_agent(KATA_AGENT_PATH).context("kata-agent parent")?; + kata_agent(KATA_AGENT_PATH); } ForkResult::Child => { - if let Err(e) = syslog_loop(timeout_secs) { - error!("{e}"); - } + syslog_loop(timeout_secs); } } - Ok(()) } #[cfg(test)] @@ -89,14 +81,24 @@ mod tests { use super::*; use crate::test_utils::require_root; use nix::sys::wait::{waitpid, WaitStatus}; + use std::panic; + + /// Install a panic hook that exits with code 1. + /// Required in forked children because Rust's test harness catches panics + /// and exits with 0, which breaks our "panic = failure" assertions. + fn set_test_panic_hook() { + panic::set_hook(Box::new(|info| { + eprintln!("panic: {info}"); + std::process::exit(1); + })); + } #[test] fn test_agent_setup() { require_root(); // agent_setup sets rlimit and writes oom_score_adj - let result = agent_setup(); - assert!(result.is_ok(), "agent_setup failed: {:?}", result); + agent_setup(); // Verify rlimit was set let (soft, hard) = rlimit::getrlimit(Resource::NOFILE).unwrap(); @@ -110,31 +112,31 @@ mod tests { #[test] fn test_exec_agent_not_found() { - // exec_agent with nonexistent command returns error (doesn't exec) - let result = exec_agent("/nonexistent/command"); + // exec_agent with nonexistent command panics (doesn't exec) + let result = panic::catch_unwind(|| { + exec_agent("/nonexistent/command"); + }); assert!(result.is_err()); - let err = result.unwrap_err().to_string(); - assert!(err.contains("exec"), "error should mention exec: {}", err); } #[test] fn test_kata_agent_not_found() { require_root(); - // kata_agent with nonexistent path - setup succeeds, exec fails + // kata_agent with nonexistent path - setup succeeds, exec panics // SAFETY: Test forks to isolate agent_setup() and exec failure. // Single-threaded test process with no shared state. match unsafe { fork() }.expect("fork") { ForkResult::Parent { child } => { - assert!(matches!( - waitpid(child, None).expect("waitpid"), - WaitStatus::Exited(_, 1) - )); + // Child exits abnormally due to panic + let status = waitpid(child, None).expect("waitpid"); + assert!(!matches!(status, WaitStatus::Exited(_, 0))); } ForkResult::Child => { - // Setup succeeds, exec fails - verify and exit with expected code - assert!(kata_agent("/nonexistent/agent").is_err()); - std::process::exit(1); + set_test_panic_hook(); + // Setup succeeds, exec panics + kata_agent("/nonexistent/agent"); + std::process::exit(0); // Won't reach here } } } @@ -144,10 +146,10 @@ mod tests { // syslog_loop with 1 second timeout runs up to 2 iterations (500ms each). // Two possible outcomes: // 1. poll() works: runs full 2 iterations (~1000ms) - // 2. poll() fails: exits early after 1st iteration (~500ms) due to missing /dev/log - // Either way, verifies the loop terminates properly. + // 2. poll() panics: test fails due to missing /dev/log + // We catch_unwind to handle missing /dev/log gracefully in test env let start = std::time::Instant::now(); - let _ = syslog_loop(1); // May error if /dev/log not bound, that's fine + let _ = panic::catch_unwind(|| syslog_loop(1)); let elapsed = start.elapsed(); // Lower bound: at least 1 sleep cycle (500ms) runs before poll @@ -158,25 +160,25 @@ mod tests { #[test] fn test_fork_agent_with_timeout() { - // Double fork: outer fork isolates the test, inner fork (inside fork_agent_with_timeout) - // does the real work. This lets us actually call fork_agent_with_timeout() directly. + require_root(); + + // Double fork: outer fork isolates the test, inner fork (inside fork_agent) + // does the real work. This lets us actually call fork_agent() directly. // SAFETY: Outer fork isolates the test in a child process. // Single-threaded test with no shared state. match unsafe { fork() }.expect("outer fork") { ForkResult::Parent { child } => { - // Wrapper exits 1 because kata_agent() fails (no binary) - assert!(matches!( - waitpid(child, None).expect("waitpid"), - WaitStatus::Exited(_, 1) - )); + // Wrapper exits abnormally because kata_agent() panics (no binary) + let status = waitpid(child, None).expect("waitpid"); + assert!(!matches!(status, WaitStatus::Exited(_, 0))); } ForkResult::Child => { - // This child calls fork_agent_with_timeout, which forks again internally. - // - Inner parent (us): kata_agent() fails, returns Err + set_test_panic_hook(); + // This child calls fork_agent, which forks again internally. + // - Inner parent (us): kata_agent() panics // - Inner child: runs syslog_loop(1), exits after ~1 second - let result = fork_agent(1); - // We're the inner parent, so we get the error from kata_agent() - std::process::exit(if result.is_err() { 1 } else { 0 }); + fork_agent(1); + std::process::exit(0); // Won't reach here due to panic } } } diff --git a/src/kernel_params.rs b/src/kernel_params.rs index cc94075..9c38221 100644 --- a/src/kernel_params.rs +++ b/src/kernel_params.rs @@ -1,4 +1,3 @@ -use anyhow::{Context, Result}; use log::{debug, warn}; use std::fs; @@ -21,59 +20,55 @@ impl NVRC { /// Parse kernel command line parameters to configure NVRC behavior. /// Using kernel params allows configuration without userspace tools—critical /// for a minimal init where no config files or environment variables exist. - pub fn process_kernel_params(&mut self, cmdline: Option<&str>) -> Result<()> { + pub fn process_kernel_params(&mut self, cmdline: Option<&str>) { let content = match cmdline { Some(c) => c.to_owned(), - None => fs::read_to_string("/proc/cmdline").context("read /proc/cmdline")?, + None => fs::read_to_string("/proc/cmdline").expect("read /proc/cmdline"), }; for (k, v) in content.split_whitespace().filter_map(|p| p.split_once('=')) { match k { - "nvrc.mode" => nvrc_mode(v, self)?, - "nvrc.log" => nvrc_log(v, self)?, - "nvrc.uvm.persistence.mode" => uvm_persistenced_mode(v, self)?, - "nvrc.dcgm" => nvrc_dcgm(v, self)?, - "nvrc.fabricmanager" => nvrc_fabricmanager(v, self)?, - "nvrc.smi.srs" => nvidia_smi_srs(v, self)?, - "nvrc.smi.lgc" => nvidia_smi_lgc(v, self)?, - "nvrc.smi.lmc" => nvidia_smi_lmc(v, self)?, - "nvrc.smi.pl" => nvidia_smi_pl(v, self)?, + "nvrc.mode" => nvrc_mode(v, self), + "nvrc.log" => nvrc_log(v, self), + "nvrc.uvm.persistence.mode" => uvm_persistenced_mode(v, self), + "nvrc.dcgm" => nvrc_dcgm(v, self), + "nvrc.fabricmanager" => nvrc_fabricmanager(v, self), + "nvrc.smi.srs" => nvidia_smi_srs(v, self), + "nvrc.smi.lgc" => nvidia_smi_lgc(v, self), + "nvrc.smi.lmc" => nvidia_smi_lmc(v, self), + "nvrc.smi.pl" => nvidia_smi_pl(v, self), _ => {} } } - Ok(()) } } /// Operation mode: "gpu" (default) or "cpu" (skip GPU management). /// Use nvrc.mode=cpu for CPU-only workloads that don't need GPU initialization. -fn nvrc_mode(value: &str, ctx: &mut NVRC) -> Result<()> { +fn nvrc_mode(value: &str, ctx: &mut NVRC) { ctx.mode = Some(value.to_lowercase()); debug!("nvrc.mode: {}", value); - Ok(()) } /// DCGM (Data Center GPU Manager) provides telemetry and health monitoring. /// Off by default—only enable when observability infrastructure expects it. -fn nvrc_dcgm(value: &str, ctx: &mut NVRC) -> Result<()> { +fn nvrc_dcgm(value: &str, ctx: &mut NVRC) { let dcgm = parse_boolean(value); ctx.dcgm_enabled = Some(dcgm); debug!("nvrc.dcgm: {dcgm}"); - Ok(()) } /// Fabric Manager enables NVLink/NVSwitch multi-GPU communication. /// Only needed for multi-GPU systems with NVLink topology. -fn nvrc_fabricmanager(value: &str, ctx: &mut NVRC) -> Result<()> { +fn nvrc_fabricmanager(value: &str, ctx: &mut NVRC) { let fabricmanager = parse_boolean(value); ctx.fabricmanager_enabled = Some(fabricmanager); debug!("nvrc.fabricmanager: {fabricmanager}"); - Ok(()) } /// Control log verbosity at runtime. Defaults to off to minimize noise. /// Enabling devkmsg allows kernel log output even in minimal init environments. -fn nvrc_log(value: &str, _ctx: &mut NVRC) -> Result<()> { +fn nvrc_log(value: &str, _ctx: &mut NVRC) { let lvl = match value.to_ascii_lowercase().as_str() { "off" | "0" | "" => log::LevelFilter::Off, "error" => log::LevelFilter::Error, @@ -86,52 +81,45 @@ fn nvrc_log(value: &str, _ctx: &mut NVRC) -> Result<()> { log::set_max_level(lvl); debug!("nvrc.log: {}", log::max_level()); - fs::write("/proc/sys/kernel/printk_devkmsg", b"on\n").context("printk_devkmsg")?; - - Ok(()) + fs::write("/proc/sys/kernel/printk_devkmsg", b"on\n").expect("printk_devkmsg"); } /// Secure Randomization Seed for GPU memory. Passed directly to nvidia-smi. -fn nvidia_smi_srs(value: &str, ctx: &mut NVRC) -> Result<()> { +fn nvidia_smi_srs(value: &str, ctx: &mut NVRC) { ctx.nvidia_smi_srs = Some(value.to_owned()); debug!("nvidia_smi_srs: {value}"); - Ok(()) } /// Lock GPU core clocks to a fixed frequency (MHz) for consistent performance. /// Eliminates thermal/power throttling variance in benchmarks and latency-sensitive workloads. -fn nvidia_smi_lgc(value: &str, ctx: &mut NVRC) -> Result<()> { - let mhz: u32 = value.parse().context("nvrc.smi.lgc: invalid frequency")?; +fn nvidia_smi_lgc(value: &str, ctx: &mut NVRC) { + let mhz: u32 = value.parse().expect("nvrc.smi.lgc: invalid frequency"); debug!("nvrc.smi.lgc: {} MHz (all GPUs)", mhz); ctx.nvidia_smi_lgc = Some(mhz); - Ok(()) } /// Lock memory clocks to a fixed frequency (MHz). /// Used alongside lgc for fully deterministic GPU behavior. -fn nvidia_smi_lmc(value: &str, ctx: &mut NVRC) -> Result<()> { - let mhz: u32 = value.parse().context("nvrc.smi.lmc: invalid frequency")?; +fn nvidia_smi_lmc(value: &str, ctx: &mut NVRC) { + let mhz: u32 = value.parse().expect("nvrc.smi.lmc: invalid frequency"); debug!("nvrc.smi.lmc: {} MHz (all GPUs)", mhz); ctx.nvidia_smi_lmc = Some(mhz); - Ok(()) } /// Set GPU power limit (Watts). Lower limits reduce heat/power, higher allows peak perf. /// Useful for power-constrained environments or thermal management. -fn nvidia_smi_pl(value: &str, ctx: &mut NVRC) -> Result<()> { - let watts: u32 = value.parse().context("nvrc.smi.pl: invalid wattage")?; +fn nvidia_smi_pl(value: &str, ctx: &mut NVRC) { + let watts: u32 = value.parse().expect("nvrc.smi.pl: invalid wattage"); debug!("nvrc.smi.pl: {} W (all GPUs)", watts); ctx.nvidia_smi_pl = Some(watts); - Ok(()) } /// UVM persistence mode keeps unified memory state across CUDA context teardowns. /// Reduces initialization overhead for short-lived CUDA applications. -fn uvm_persistenced_mode(value: &str, ctx: &mut NVRC) -> Result<()> { +fn uvm_persistenced_mode(value: &str, ctx: &mut NVRC) { let enabled = parse_boolean(value); ctx.uvm_persistence_mode = Some(enabled); debug!("nvrc.uvm.persistence.mode: {enabled}"); - Ok(()) } #[cfg(test)] @@ -139,6 +127,7 @@ mod tests { use super::*; use crate::test_utils::require_root; use serial_test::serial; + use std::panic; use std::sync::{LazyLock, Once}; static LOG: LazyLock = LazyLock::new(Once::new); @@ -156,7 +145,7 @@ mod tests { log_setup(); let mut c = NVRC::default(); - nvrc_log("debug", &mut c).unwrap(); + nvrc_log("debug", &mut c); assert!(log_enabled!(log::Level::Debug)); } @@ -169,8 +158,7 @@ mod tests { init.process_kernel_params(Some( "nvidia.smi.lgc=1500 nvrc.log=debug nvidia.smi.lgc=1500", - )) - .unwrap(); + )); assert_eq!(log::max_level(), log::LevelFilter::Debug); assert!(!log_enabled!(log::Level::Trace)); @@ -185,8 +173,7 @@ mod tests { init.process_kernel_params(Some( "nvidia.smi.lgc=1500 nvrc.log=info nvidia.smi.lgc=1500", - )) - .unwrap(); + )); assert_eq!(log::max_level(), log::LevelFilter::Info); assert!(!log_enabled!(log::Level::Debug)); @@ -199,8 +186,7 @@ mod tests { log_setup(); let mut init = NVRC::default(); - init.process_kernel_params(Some("nvidia.smi.lgc=1500 nvrc.log=0 nvidia.smi.lgc=1500")) - .unwrap(); + init.process_kernel_params(Some("nvidia.smi.lgc=1500 nvrc.log=0 nvidia.smi.lgc=1500")); assert_eq!(log::max_level(), log::LevelFilter::Off); } @@ -211,8 +197,7 @@ mod tests { log_setup(); let mut init = NVRC::default(); - init.process_kernel_params(Some("nvidia.smi.lgc=1500 nvrc.log= ")) - .unwrap(); + init.process_kernel_params(Some("nvidia.smi.lgc=1500 nvrc.log= ")); assert_eq!(log::max_level(), log::LevelFilter::Off); } @@ -223,7 +208,7 @@ mod tests { log_setup(); let mut init = NVRC::default(); - init.process_kernel_params(Some("nvrc.log=trace")).unwrap(); + init.process_kernel_params(Some("nvrc.log=trace")); assert_eq!(log::max_level(), log::LevelFilter::Trace); } @@ -235,8 +220,7 @@ mod tests { let mut init = NVRC::default(); // Unknown log level should default to Off - init.process_kernel_params(Some("nvrc.log=garbage")) - .unwrap(); + init.process_kernel_params(Some("nvrc.log=garbage")); assert_eq!(log::max_level(), log::LevelFilter::Off); } @@ -245,26 +229,26 @@ mod tests { let mut c = NVRC::default(); // Test various "on" values - nvrc_dcgm("on", &mut c).unwrap(); + nvrc_dcgm("on", &mut c); assert_eq!(c.dcgm_enabled, Some(true)); - nvrc_dcgm("true", &mut c).unwrap(); + nvrc_dcgm("true", &mut c); assert_eq!(c.dcgm_enabled, Some(true)); - nvrc_dcgm("1", &mut c).unwrap(); + nvrc_dcgm("1", &mut c); assert_eq!(c.dcgm_enabled, Some(true)); - nvrc_dcgm("yes", &mut c).unwrap(); + nvrc_dcgm("yes", &mut c); assert_eq!(c.dcgm_enabled, Some(true)); // Test "off" values - nvrc_dcgm("off", &mut c).unwrap(); + nvrc_dcgm("off", &mut c); assert_eq!(c.dcgm_enabled, Some(false)); - nvrc_dcgm("false", &mut c).unwrap(); + nvrc_dcgm("false", &mut c); assert_eq!(c.dcgm_enabled, Some(false)); - nvrc_dcgm("invalid", &mut c).unwrap(); + nvrc_dcgm("invalid", &mut c); assert_eq!(c.dcgm_enabled, Some(false)); } @@ -272,10 +256,10 @@ mod tests { fn test_nvrc_fabricmanager() { let mut c = NVRC::default(); - nvrc_fabricmanager("on", &mut c).unwrap(); + nvrc_fabricmanager("on", &mut c); assert_eq!(c.fabricmanager_enabled, Some(true)); - nvrc_fabricmanager("off", &mut c).unwrap(); + nvrc_fabricmanager("off", &mut c); assert_eq!(c.fabricmanager_enabled, Some(false)); } @@ -283,10 +267,10 @@ mod tests { fn test_nvidia_smi_srs() { let mut c = NVRC::default(); - nvidia_smi_srs("enabled", &mut c).unwrap(); + nvidia_smi_srs("enabled", &mut c); assert_eq!(c.nvidia_smi_srs, Some("enabled".to_owned())); - nvidia_smi_srs("disabled", &mut c).unwrap(); + nvidia_smi_srs("disabled", &mut c); assert_eq!(c.nvidia_smi_srs, Some("disabled".to_owned())); } @@ -294,13 +278,13 @@ mod tests { fn test_uvm_persistenced_mode() { let mut c = NVRC::default(); - uvm_persistenced_mode("on", &mut c).unwrap(); + uvm_persistenced_mode("on", &mut c); assert_eq!(c.uvm_persistence_mode, Some(true)); - uvm_persistenced_mode("OFF", &mut c).unwrap(); + uvm_persistenced_mode("OFF", &mut c); assert_eq!(c.uvm_persistence_mode, Some(false)); - uvm_persistenced_mode("True", &mut c).unwrap(); + uvm_persistenced_mode("True", &mut c); assert_eq!(c.uvm_persistence_mode, Some(true)); } @@ -326,50 +310,58 @@ mod tests { fn test_nvidia_smi_lgc() { let mut c = NVRC::default(); - nvidia_smi_lgc("1500", &mut c).unwrap(); + nvidia_smi_lgc("1500", &mut c); assert_eq!(c.nvidia_smi_lgc, Some(1500)); - nvidia_smi_lgc("2100", &mut c).unwrap(); + nvidia_smi_lgc("2100", &mut c); assert_eq!(c.nvidia_smi_lgc, Some(2100)); - // Invalid value should error - assert!(nvidia_smi_lgc("invalid", &mut c).is_err()); + // Invalid value should panic + let result = panic::catch_unwind(|| { + nvidia_smi_lgc("invalid", &mut NVRC::default()); + }); + assert!(result.is_err()); } #[test] fn test_nvidia_smi_lmc() { let mut c = NVRC::default(); - nvidia_smi_lmc("5001", &mut c).unwrap(); + nvidia_smi_lmc("5001", &mut c); assert_eq!(c.nvidia_smi_lmc, Some(5001)); - nvidia_smi_lmc("6000", &mut c).unwrap(); + nvidia_smi_lmc("6000", &mut c); assert_eq!(c.nvidia_smi_lmc, Some(6000)); - // Invalid value should error - assert!(nvidia_smi_lmc("not_a_number", &mut c).is_err()); + // Invalid value should panic + let result = panic::catch_unwind(|| { + nvidia_smi_lmc("not_a_number", &mut NVRC::default()); + }); + assert!(result.is_err()); } #[test] fn test_nvidia_smi_pl() { let mut c = NVRC::default(); - nvidia_smi_pl("300", &mut c).unwrap(); + nvidia_smi_pl("300", &mut c); assert_eq!(c.nvidia_smi_pl, Some(300)); - nvidia_smi_pl("450", &mut c).unwrap(); + nvidia_smi_pl("450", &mut c); assert_eq!(c.nvidia_smi_pl, Some(450)); - // Invalid value should error - assert!(nvidia_smi_pl("abc", &mut c).is_err()); + // Invalid value should panic + let result = panic::catch_unwind(|| { + nvidia_smi_pl("abc", &mut NVRC::default()); + }); + assert!(result.is_err()); } #[test] fn test_process_kernel_params_gpu_settings() { let mut c = NVRC::default(); - c.process_kernel_params(Some("nvrc.smi.lgc=1500 nvrc.smi.lmc=5001 nvrc.smi.pl=300")) - .unwrap(); + c.process_kernel_params(Some("nvrc.smi.lgc=1500 nvrc.smi.lmc=5001 nvrc.smi.pl=300")); assert_eq!(c.nvidia_smi_lgc, Some(1500)); assert_eq!(c.nvidia_smi_lmc, Some(5001)); @@ -382,8 +374,7 @@ mod tests { c.process_kernel_params(Some( "nvrc.smi.lgc=2100 nvrc.uvm.options=opt1=1,opt2=2 nvrc.dcgm=on nvrc.smi.pl=400", - )) - .unwrap(); + )); assert_eq!(c.nvidia_smi_lgc, Some(2100)); assert_eq!(c.nvidia_smi_pl, Some(400)); @@ -393,10 +384,9 @@ mod tests { #[test] fn test_process_kernel_params_from_proc_cmdline() { // Exercise the None path which reads /proc/cmdline. - // We can't control the contents but can verify it doesn't error. + // We can't control the contents but can verify it doesn't panic. let mut c = NVRC::default(); - let result = c.process_kernel_params(None); - assert!(result.is_ok()); + c.process_kernel_params(None); } #[test] @@ -405,8 +395,7 @@ mod tests { c.process_kernel_params(Some( "nvrc.fabricmanager=on nvrc.uvm.persistence.mode=true nvrc.smi.srs=enabled", - )) - .unwrap(); + )); assert_eq!(c.fabricmanager_enabled, Some(true)); assert_eq!(c.uvm_persistence_mode, Some(true)); @@ -417,22 +406,22 @@ mod tests { fn test_nvrc_mode() { let mut c = NVRC::default(); - nvrc_mode("cpu", &mut c).unwrap(); + nvrc_mode("cpu", &mut c); assert_eq!(c.mode, Some("cpu".to_owned())); - nvrc_mode("GPU", &mut c).unwrap(); + nvrc_mode("GPU", &mut c); assert_eq!(c.mode, Some("gpu".to_owned())); // normalized to lowercase - nvrc_mode("nvswitch-nvl4", &mut c).unwrap(); + nvrc_mode("nvswitch-nvl4", &mut c); assert_eq!(c.mode, Some("nvswitch-nvl4".to_owned())); - nvrc_mode("NVSWITCH-NVL4", &mut c).unwrap(); + nvrc_mode("NVSWITCH-NVL4", &mut c); assert_eq!(c.mode, Some("nvswitch-nvl4".to_owned())); // normalized to lowercase - nvrc_mode("nvswitch-nvl5", &mut c).unwrap(); + nvrc_mode("nvswitch-nvl5", &mut c); assert_eq!(c.mode, Some("nvswitch-nvl5".to_owned())); - nvrc_mode("NVSWITCH-NVL5", &mut c).unwrap(); + nvrc_mode("NVSWITCH-NVL5", &mut c); assert_eq!(c.mode, Some("nvswitch-nvl5".to_owned())); // normalized to lowercase } @@ -440,8 +429,7 @@ mod tests { fn test_process_kernel_params_with_mode() { let mut c = NVRC::default(); - c.process_kernel_params(Some("nvrc.mode=cpu nvrc.dcgm=on")) - .unwrap(); + c.process_kernel_params(Some("nvrc.mode=cpu nvrc.dcgm=on")); assert_eq!(c.mode, Some("cpu".to_owned())); assert_eq!(c.dcgm_enabled, Some(true)); @@ -451,8 +439,7 @@ mod tests { fn test_process_kernel_params_nvswitch_nvl4_mode() { let mut c = NVRC::default(); - c.process_kernel_params(Some("nvrc.mode=nvswitch-nvl4")) - .unwrap(); + c.process_kernel_params(Some("nvrc.mode=nvswitch-nvl4")); assert_eq!(c.mode, Some("nvswitch-nvl4".to_owned())); } @@ -461,8 +448,7 @@ mod tests { fn test_process_kernel_params_nvswitch_nvl5_mode() { let mut c = NVRC::default(); - c.process_kernel_params(Some("nvrc.mode=nvswitch-nvl5")) - .unwrap(); + c.process_kernel_params(Some("nvrc.mode=nvswitch-nvl5")); assert_eq!(c.mode, Some("nvswitch-nvl5".to_owned())); } diff --git a/src/kmsg.rs b/src/kmsg.rs index a4bc8f4..8f955d3 100644 --- a/src/kmsg.rs +++ b/src/kmsg.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (c) NVIDIA CORPORATION -use anyhow::{Context, Result}; +use crate::macros::ResultExt; use std::fs::{self, File, OpenOptions}; use std::sync::Once; @@ -15,7 +15,7 @@ const SOCKET_BUFFER_SIZE: &str = "16777216"; /// Initialize kernel logging and tune socket buffer sizes. /// Large buffers (16MB) prevent message loss during high-throughput GPU operations /// where drivers may emit bursts of diagnostic data. -pub fn kernlog_setup() -> Result<()> { +pub fn kernlog_setup() { KERNLOG_INIT.call_once(|| { let _ = kernlog::init(); }); @@ -26,16 +26,14 @@ pub fn kernlog_setup() -> Result<()> { "/proc/sys/net/core/rmem_max", "/proc/sys/net/core/wmem_max", ] { - fs::write(path, SOCKET_BUFFER_SIZE.as_bytes()) - .with_context(|| format!("write {}", path))?; + fs::write(path, SOCKET_BUFFER_SIZE.as_bytes()).or_panic(format_args!("write {path}")); } - Ok(()) } /// Get a file handle for kernel message output. /// Routes to /dev/kmsg when debug logging is enabled for visibility in dmesg, /// otherwise /dev/null to suppress noise in production. -pub fn kmsg() -> Result { +pub fn kmsg() -> File { kmsg_at(if log_enabled!(log::Level::Debug) { "/dev/kmsg" } else { @@ -44,11 +42,11 @@ pub fn kmsg() -> Result { } /// Internal: open the given path for writing. Extracted for testability. -fn kmsg_at(path: &str) -> Result { +fn kmsg_at(path: &str) -> File { OpenOptions::new() .write(true) .open(path) - .with_context(|| format!("open {}", path)) + .or_panic(format_args!("open {path}")) } #[cfg(test)] @@ -57,24 +55,21 @@ mod tests { use crate::test_utils::require_root; use serial_test::serial; use std::io::Write; + use std::panic; use tempfile::NamedTempFile; #[test] fn test_kmsg_at_dev_null() { // /dev/null is always writable, no root needed - let file = kmsg_at("/dev/null"); - assert!(file.is_ok()); + let _file = kmsg_at("/dev/null"); } #[test] fn test_kmsg_at_nonexistent() { - let err = kmsg_at("/nonexistent/path").unwrap_err(); - // Should contain the path in the error context - assert!( - err.to_string().contains("/nonexistent/path"), - "error should mention the path: {}", - err - ); + let result = panic::catch_unwind(|| { + kmsg_at("/nonexistent/path"); + }); + assert!(result.is_err()); } #[test] @@ -82,7 +77,7 @@ mod tests { // Create a temp file to verify we can write to it let temp = NamedTempFile::new().unwrap(); let path = temp.path().to_str().unwrap(); - let mut file = kmsg_at(path).unwrap(); + let mut file = kmsg_at(path); assert!(file.write_all(b"test").is_ok()); } @@ -91,8 +86,7 @@ mod tests { fn test_kmsg_routes_to_dev_null_when_log_off() { // Default log level is Off, so kmsg() should open /dev/null log::set_max_level(log::LevelFilter::Off); - let file = kmsg(); - assert!(file.is_ok()); + let _file = kmsg(); } #[test] @@ -101,8 +95,7 @@ mod tests { require_root(); // When debug is enabled, kmsg() should open /dev/kmsg log::set_max_level(log::LevelFilter::Debug); - let file = kmsg(); - assert!(file.is_ok()); + let _file = kmsg(); log::set_max_level(log::LevelFilter::Off); } @@ -134,7 +127,7 @@ mod tests { .collect(); let _restore = Restore(saved); - assert!(kernlog_setup().is_ok()); + kernlog_setup(); for &path in &PATHS { let v = fs::read_to_string(path).expect("should read sysctl"); diff --git a/src/lockdown.rs b/src/lockdown.rs index 83c5d7e..d221f8b 100644 --- a/src/lockdown.rs +++ b/src/lockdown.rs @@ -6,7 +6,7 @@ //! In production, panic triggers VM power-off. For tests, the shutdown //! action is configurable via `set_panic_hook_with()`. -use anyhow::{Context, Result}; +use crate::macros::ResultExt; use nix::sys::reboot::{reboot, RebootMode}; use nix::unistd::sync; use std::fs; @@ -22,20 +22,19 @@ fn power_off() { /// with potential data exposure. Power-off ensures clean termination—the host /// hypervisor will see the VM exit and can handle cleanup appropriately. /// sync() flushes pending writes before power-off to preserve any logs. -pub fn set_panic_hook() -> Result<()> { +pub fn set_panic_hook() { set_panic_hook_with(power_off) } /// Internal: panic handler with configurable shutdown (for unit tests). /// Production uses power_off(); tests inject a no-op to avoid rebooting. -fn set_panic_hook_with(shutdown: F) -> Result<()> { +fn set_panic_hook_with(shutdown: F) { panic::set_hook(Box::new(move |panic_info| { // fd 1,2 are always available from the kernel eprintln!("panic: {panic_info}"); sync(); shutdown(); })); - Ok(()) } /// Permanently disable kernel module loading for this boot. @@ -43,9 +42,9 @@ fn set_panic_hook_with(shutdown: F) -> Result<( /// module insertion—a security hardening measure for confidential VMs /// that blocks potential kernel-level attacks via malicious modules. /// This is a one-way operation: once set, it cannot be undone without reboot. -pub fn disable_modules_loading() -> Result<()> { +pub fn disable_modules_loading() { const PATH: &str = "/proc/sys/kernel/modules_disabled"; - fs::write(PATH, b"1\n").with_context(|| format!("disable module loading: {}", PATH)) + fs::write(PATH, b"1\n").or_panic(format_args!("disable module loading {PATH}")); } #[cfg(test)] @@ -61,7 +60,7 @@ mod tests { let called_clone = called.clone(); // Install hook with test closure - let _ = set_panic_hook_with(move || { + set_panic_hook_with(move || { called_clone.store(true, Ordering::SeqCst); }); @@ -77,8 +76,7 @@ mod tests { // This permanently disables module loading until reboot. // Only run on dedicated test runners! - let result = disable_modules_loading(); - assert!(result.is_ok()); + disable_modules_loading(); // Verify it was set let content = fs::read_to_string("/proc/sys/kernel/modules_disabled").unwrap(); @@ -95,7 +93,7 @@ mod tests { #[ignore] // Installs real power_off hook - run with --include-ignored on CI fn test_set_panic_hook() { // Installs the real hook (with power_off) - just don't trigger it! - let _ = set_panic_hook(); + set_panic_hook(); // If we got here, the hook was installed successfully } } diff --git a/src/macros.rs b/src/macros.rs index 6a0c7c0..7c11f73 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -1,49 +1,61 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (c) NVIDIA CORPORATION -//! Common macros for the init system. - -/// Unwrap a Result or panic with a descriptive init failure message. -/// Used for operations that must succeed for init to proceed. -#[macro_export] -macro_rules! must { - ($expr:expr) => { - if let Err(e) = $expr { - panic!("init failure: {} => {e}", stringify!($expr)); - } - }; - ($expr:expr, $msg:literal) => { - if let Err(e) = $expr { - panic!("init failure: {}: {e}", $msg); +//! Error handling extensions for fail-fast init semantics. +//! +//! NVRC runs as PID 1 in an ephemeral VM. On any error, we panic (which +//! triggers VM power-off via our panic hook). This trait provides a clean +//! `.or_panic(msg)` method instead of verbose `.unwrap_or_else(|e| panic!(...))`. + +use std::fmt::Display; + +/// Extension trait for Result types that panic on error with context. +pub trait ResultExt { + /// Unwrap the value or panic with the given message and error details. + /// Use with static strings or pre-formatted messages. + fn or_panic(self, msg: impl Display) -> T; +} + +impl ResultExt for Result { + #[cold] + #[inline(never)] + fn or_panic(self, msg: impl Display) -> T { + match self { + Ok(v) => v, + Err(e) => panic!("{msg}: {e}"), } - }; + } } #[cfg(test)] mod tests { + use super::*; use std::panic::catch_unwind; - /// Test must! macro with Ok result - should not panic - #[test] - fn test_must_ok() { - must!(Ok::<(), &str>(())); - } - - /// Test must! macro with custom message - should not panic on Ok #[test] - fn test_must_ok_with_message() { - must!(Ok::<(), &str>(()), "custom message"); + fn test_or_panic_ok() { + let result: Result = Ok(42); + assert_eq!(result.or_panic("should not panic"), 42); } - /// Test must! macro panics on Err #[test] - fn test_must_err_panics() { - assert!(catch_unwind(|| must!(Err::<(), _>("something went wrong"))).is_err()); + fn test_or_panic_err() { + let result = catch_unwind(|| { + let r: Result = Err("boom"); + r.or_panic("operation failed"); + }); + assert!(result.is_err()); } - /// Test must! macro with custom message panics on Err #[test] - fn test_must_err_with_message_panics() { - assert!(catch_unwind(|| must!(Err::<(), _>("boom"), "custom error")).is_err()); + fn test_or_panic_with_io_error() { + let result = catch_unwind(|| { + let r: Result<(), std::io::Error> = Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + "file missing", + )); + r.or_panic("read config"); + }); + assert!(result.is_err()); } } diff --git a/src/main.rs b/src/main.rs index 8caf5d9..cd4624f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,6 @@ mod kata_agent; mod kernel_params; mod kmsg; mod lockdown; -#[macro_use] mod macros; mod modprobe; mod mount; @@ -16,6 +15,8 @@ mod smi; mod syslog; mod toolkit; +pub use macros::ResultExt; + #[cfg(test)] mod test_utils; @@ -34,21 +35,21 @@ type ModeFn = fn(&mut NVRC); /// VMs with GPU passthrough need driver setup, clock tuning, /// and monitoring daemons before workloads can use the GPU. fn mode_gpu(init: &mut NVRC) { - must!(modprobe::load("nvidia")); - must!(modprobe::load("nvidia-uvm")); + modprobe::load("nvidia"); + modprobe::load("nvidia-uvm"); - must!(init.nvidia_smi_lmc()); - must!(init.nvidia_smi_lgc()); - must!(init.nvidia_smi_pl()); + init.nvidia_smi_lmc(); + init.nvidia_smi_lgc(); + init.nvidia_smi_pl(); - must!(init.nvidia_persistenced()); + init.nvidia_persistenced(); - must!(init.nv_hostengine()); - must!(init.dcgm_exporter()); - must!(init.nv_fabricmanager()); - must!(nvidia_ctk_cdi()); - must!(init.nvidia_smi_srs()); - must!(init.check_daemons()); + init.nv_hostengine(); + init.dcgm_exporter(); + init.nv_fabricmanager(); + nvidia_ctk_cdi(); + init.nvidia_smi_srs(); + init.check_daemons(); } /// NVSwitch NVL4 mode for HGX H100/H200/H800 systems (third-gen NVSwitch). @@ -59,9 +60,9 @@ fn mode_nvswitch_nvl4(init: &mut NVRC) { // Override kernel parameter: always enable fabricmanager for nvswitch mode init.fabricmanager_enabled = Some(true); - must!(modprobe::load("nvidia")); - must!(init.nv_fabricmanager()); - must!(init.check_daemons()); + modprobe::load("nvidia"); + init.nv_fabricmanager(); + init.check_daemons(); } /// NVSwitch NVL5 mode for HGX B200/B300/B100 systems (fourth-gen NVSwitch). @@ -75,9 +76,9 @@ fn mode_nvswitch_nvl5(init: &mut NVRC) { init.fabricmanager_enabled = Some(true); // Load InfiniBand user MAD module for CX7 bridge device access - must!(modprobe::load("ib_umad")); - must!(init.nv_fabricmanager()); - must!(init.check_daemons()); + modprobe::load("ib_umad"); + init.nv_fabricmanager(); + init.check_daemons(); } fn main() { @@ -89,13 +90,13 @@ fn main() { ("nvswitch-nvl5", mode_nvswitch_nvl5 as ModeFn), ]); - must!(lockdown::set_panic_hook()); + lockdown::set_panic_hook(); let mut init = NVRC::default(); - must!(mount::setup()); - must!(kmsg::kernlog_setup()); - must!(syslog::poll()); - must!(mount::readonly("/")); - must!(init.process_kernel_params(None)); + mount::setup(); + kmsg::kernlog_setup(); + syslog::poll(); + mount::readonly("/"); + init.process_kernel_params(None); // Kernel param nvrc.mode selects runtime behavior; GPU is the safe default // since most users expect full GPU functionality. @@ -103,6 +104,6 @@ fn main() { let setup = modes.get(mode).copied().unwrap_or(mode_gpu); setup(&mut init); - must!(lockdown::disable_modules_loading()); - must!(kata_agent::fork_agent(POLL_FOREVER)); + lockdown::disable_modules_loading(); + kata_agent::fork_agent(POLL_FOREVER); } diff --git a/src/modprobe.rs b/src/modprobe.rs index 68730a5..e089a86 100644 --- a/src/modprobe.rs +++ b/src/modprobe.rs @@ -1,18 +1,19 @@ use crate::execute::foreground; -use anyhow::Result; const MODPROBE: &str = "/sbin/modprobe"; /// Load a kernel module via modprobe. /// Used to load GPU drivers (nvidia, nvidia-uvm) during init. -pub fn load(module: &str) -> Result<()> { - foreground(MODPROBE, &[module]) +pub fn load(module: &str) { + foreground(MODPROBE, &[module]); } + #[cfg(test)] mod tests { use super::*; use crate::test_utils::require_root; use serial_test::serial; + use std::panic; // Kernel module loading must be serialized - parallel modprobe // calls can race and cause spurious failures. @@ -22,15 +23,16 @@ mod tests { fn test_load_loop() { require_root(); // 'loop' module is almost always available (loop devices) - assert!(load("loop").is_ok()); + load("loop"); } #[test] #[serial] fn test_load_nonexistent() { require_root(); - let err = load("nonexistent_module_xyz123").unwrap_err(); - // modprobe exits non-zero for missing modules - assert!(err.to_string().contains("modprobe")); + let result = panic::catch_unwind(|| { + load("nonexistent_module_xyz123"); + }); + assert!(result.is_err()); } } diff --git a/src/mount.rs b/src/mount.rs index 351ec9e..1d19e35 100644 --- a/src/mount.rs +++ b/src/mount.rs @@ -3,30 +3,24 @@ //! Filesystem setup for the minimal init environment. -use anyhow::{Context, Result}; +use crate::macros::ResultExt; use nix::mount::MsFlags; use std::fs; use std::path::Path; /// Mount a filesystem. Errors if mount fails. -fn mount( - source: &str, - target: &str, - fstype: &str, - flags: MsFlags, - data: Option<&str>, -) -> Result<()> { +fn mount(source: &str, target: &str, fstype: &str, flags: MsFlags, data: Option<&str>) { nix::mount::mount(Some(source), target, Some(fstype), flags, data) - .with_context(|| format!("mount {source} on {target}")) + .or_panic(format_args!("mount {source} on {target}")); } /// Remount a filesystem as read-only. /// Security hardening: prevents writes to the root filesystem after init, /// reducing attack surface in the confidential VM. -pub fn readonly(target: &str) -> Result<()> { +pub fn readonly(target: &str) { let flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_RDONLY | MsFlags::MS_REMOUNT; nix::mount::mount(None::<&str>, target, None::<&str>, flags, None::<&str>) - .with_context(|| format!("remount {target} readonly")) + .or_panic(format_args!("remount {target} readonly")); } /// Check if a filesystem type is available in the kernel. @@ -36,17 +30,10 @@ fn fs_available(filesystems: &str, fstype: &str) -> bool { /// Mount optional filesystem if the fstype is available AND the target exists. /// Used for securityfs and efivarfs that may not be present on all kernels. -fn mount_optional( - filesystems: &str, - source: &str, - target: &str, - fstype: &str, - flags: MsFlags, -) -> Result<()> { +fn mount_optional(filesystems: &str, source: &str, target: &str, fstype: &str, flags: MsFlags) { if fs_available(filesystems, fstype) && Path::new(target).exists() { - mount(source, target, fstype, flags, None)?; + mount(source, target, fstype, flags, None); } - Ok(()) } /// Set up the minimal filesystem hierarchy required for GPU initialization. @@ -54,15 +41,15 @@ fn mount_optional( /// devtmpfs automatically creates standard device nodes; symlinks /// (/dev/stdin, /dev/stdout, /dev/stderr, /dev/fd, /dev/core) are /// created later by kata-agent. -pub fn setup() -> Result<()> { +pub fn setup() { setup_at("") } /// Internal: setup with configurable root path (for testing with temp directories). -fn setup_at(root: &str) -> Result<()> { +fn setup_at(root: &str) { let common = MsFlags::MS_NOSUID | MsFlags::MS_NOEXEC | MsFlags::MS_NODEV | MsFlags::MS_RELATIME; - mount("proc", &format!("{root}/proc"), "proc", common, None)?; + mount("proc", &format!("{root}/proc"), "proc", common, None); // devtmpfs automatically creates /dev/null, /dev/zero, /dev/random, /dev/urandom // Symlinks (/dev/stdin, /dev/stdout, /dev/stderr, /dev/fd, /dev/core) are created by kata-agent @@ -73,19 +60,19 @@ fn setup_at(root: &str) -> Result<()> { "devtmpfs", dev_flags, Some("mode=0755"), - )?; + ); - mount("sysfs", &format!("{root}/sys"), "sysfs", common, None)?; + mount("sysfs", &format!("{root}/sys"), "sysfs", common, None); mount( "run", &format!("{root}/run"), "tmpfs", common, Some("mode=0755"), - )?; + ); let tmp_flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_RELATIME; - mount("tmpfs", &format!("{root}/tmp"), "tmpfs", tmp_flags, None)?; + mount("tmpfs", &format!("{root}/tmp"), "tmpfs", tmp_flags, None); // Read once for all optional mounts let filesystems = fs::read_to_string("/proc/filesystems").unwrap_or_default(); @@ -96,16 +83,14 @@ fn setup_at(root: &str) -> Result<()> { &format!("{root}/sys/kernel/security"), "securityfs", common, - )?; + ); mount_optional( &filesystems, "efivarfs", &format!("{root}/sys/firmware/efi/efivars"), "efivarfs", common, - )?; - - Ok(()) + ); } #[cfg(test)] @@ -136,43 +121,41 @@ mod tests { fn test_mount_optional_target_not_exists() { // When target path doesn't exist, should be no-op let filesystems = "nodev tmpfs\n"; - let result = mount_optional( + mount_optional( filesystems, "tmpfs", "/nonexistent/path", "tmpfs", MsFlags::empty(), ); - assert!(result.is_ok()); } // === Error path tests === #[test] fn test_mount_fails_nonexistent_target() { - let err = mount( - "tmpfs", - "/nonexistent/mount/point", - "tmpfs", - MsFlags::empty(), - None, - ) - .unwrap_err(); - assert!( - err.to_string().contains("/nonexistent/mount/point"), - "error should mention the path: {}", - err - ); + use std::panic; + + let result = panic::catch_unwind(|| { + mount( + "tmpfs", + "/nonexistent/mount/point", + "tmpfs", + MsFlags::empty(), + None, + ); + }); + assert!(result.is_err()); } #[test] fn test_readonly_fails_nonexistent() { - let err = readonly("/nonexistent/path").unwrap_err(); - assert!( - err.to_string().contains("/nonexistent/path"), - "error should mention the path: {}", - err - ); + use std::panic; + + let result = panic::catch_unwind(|| { + readonly("/nonexistent/path"); + }); + assert!(result.is_err()); } // === setup_at() tests with temp directory === @@ -193,8 +176,7 @@ mod tests { } // Run setup_at with temp root - let result = setup_at(root); - assert!(result.is_ok(), "setup_at failed: {:?}", result); + setup_at(root); // devtmpfs creates these automatically assert!(Path::new(&format!("{root}/dev/null")).exists()); diff --git a/src/nvrc.rs b/src/nvrc.rs index 1f6d3a1..09451ea 100644 --- a/src/nvrc.rs +++ b/src/nvrc.rs @@ -3,7 +3,6 @@ //! NVRC configuration state and daemon lifecycle management. -use anyhow::{anyhow, Result}; use std::process::Child; /// Central configuration state for the NVIDIA Runtime Container init. @@ -43,21 +42,21 @@ impl NVRC { /// Check all background daemons haven't failed. /// Exit status 0 is OK (daemon may fork and parent exits successfully). /// Non-zero exit means the daemon crashed—fail init before kata-agent starts. - pub fn check_daemons(&mut self) -> Result<()> { + pub fn check_daemons(&mut self) { for (name, child) in &mut self.children { if let Ok(Some(status)) = child.try_wait() { if !status.success() { - return Err(anyhow!("{} exited with status: {}", name, status)); + panic!("{} exited with status: {}", name, status); } } } - Ok(()) } } #[cfg(test)] mod tests { use super::*; + use std::panic; use std::process::Command; #[test] @@ -85,7 +84,7 @@ mod tests { let child = Command::new("/bin/true").spawn().unwrap(); nvrc.track_daemon("good-daemon", child); std::thread::sleep(std::time::Duration::from_millis(50)); - assert!(nvrc.check_daemons().is_ok()); + nvrc.check_daemons(); } #[test] @@ -95,9 +94,10 @@ mod tests { let child = Command::new("/bin/false").spawn().unwrap(); nvrc.track_daemon("bad-daemon", child); std::thread::sleep(std::time::Duration::from_millis(50)); - let result = nvrc.check_daemons(); + let result = panic::catch_unwind(panic::AssertUnwindSafe(|| { + nvrc.check_daemons(); + })); assert!(result.is_err()); - assert!(result.unwrap_err().to_string().contains("bad-daemon")); } #[test] @@ -107,7 +107,7 @@ mod tests { let child = Command::new("/bin/sleep").arg("1").spawn().unwrap(); nvrc.track_daemon("slow-daemon", child); // Check immediately while still running - assert!(nvrc.check_daemons().is_ok()); + nvrc.check_daemons(); // Clean up: kill the child to avoid orphaned process if let Some((_, ref mut child)) = nvrc.children.last_mut() { let _ = child.kill(); @@ -121,6 +121,6 @@ mod tests { nvrc.track_daemon("d1", Command::new("/bin/true").spawn().unwrap()); nvrc.track_daemon("d2", Command::new("/bin/true").spawn().unwrap()); std::thread::sleep(std::time::Duration::from_millis(50)); - assert!(nvrc.check_daemons().is_ok()); + nvrc.check_daemons(); } } diff --git a/src/smi.rs b/src/smi.rs index 630d941..3b8914f 100644 --- a/src/smi.rs +++ b/src/smi.rs @@ -1,132 +1,123 @@ //! nvidia-smi GPU configuration commands. //! //! These functions apply GPU settings via nvidia-smi before workloads run. -//! All are optional—if the kernel param isn't set, they return Ok immediately. +//! All are optional—if the kernel param isn't set, they return immediately. use crate::execute::foreground; use crate::nvrc::NVRC; -use anyhow::Result; const NVIDIA_SMI: &str = "/bin/nvidia-smi"; impl NVRC { /// Lock memory clocks to a specific frequency (MHz). /// Reduces memory clock jitter for latency-sensitive workloads. - pub fn nvidia_smi_lmc(&self) -> Result<()> { + pub fn nvidia_smi_lmc(&self) { let Some(mhz) = self.nvidia_smi_lmc else { - return Ok(()); + return; }; - foreground(NVIDIA_SMI, &["-lmc", &mhz.to_string()]) + foreground(NVIDIA_SMI, &["-lmc", &mhz.to_string()]); } /// Lock GPU core clocks to a specific frequency (MHz). /// Provides consistent performance by preventing dynamic frequency scaling. - pub fn nvidia_smi_lgc(&self) -> Result<()> { + pub fn nvidia_smi_lgc(&self) { let Some(mhz) = self.nvidia_smi_lgc else { - return Ok(()); + return; }; - foreground(NVIDIA_SMI, &["-lgc", &mhz.to_string()]) + foreground(NVIDIA_SMI, &["-lgc", &mhz.to_string()]); } /// Set GPU power limit in watts. /// Caps power consumption for thermal/power budget compliance. - pub fn nvidia_smi_pl(&self) -> Result<()> { + pub fn nvidia_smi_pl(&self) { let Some(watts) = self.nvidia_smi_pl else { - return Ok(()); + return; }; - foreground(NVIDIA_SMI, &["-pl", &watts.to_string()]) + foreground(NVIDIA_SMI, &["-pl", &watts.to_string()]); } /// Set GPU Ready State after successful attestation. /// In Confidential Computing mode, GPUs default to NotReady and refuse /// workloads. After attestation verifies the GPU's integrity, we set /// the state to Ready so it can execute compute jobs. - pub fn nvidia_smi_srs(&self) -> Result<()> { + pub fn nvidia_smi_srs(&self) { let Some(ref state) = self.nvidia_smi_srs else { - return Ok(()); + return; }; - foreground(NVIDIA_SMI, &["conf-compute", "-srs", state]) + foreground(NVIDIA_SMI, &["conf-compute", "-srs", state]); } } #[cfg(test)] mod tests { use super::*; + use std::panic; - // When fields are None, functions return Ok immediately (no nvidia-smi call) + // When fields are None, functions return immediately (no nvidia-smi call) #[test] fn test_lmc_none() { let nvrc = NVRC::default(); - assert!(nvrc.nvidia_smi_lmc().is_ok()); + nvrc.nvidia_smi_lmc(); } #[test] fn test_lgc_none() { let nvrc = NVRC::default(); - assert!(nvrc.nvidia_smi_lgc().is_ok()); + nvrc.nvidia_smi_lgc(); } #[test] fn test_pl_none() { let nvrc = NVRC::default(); - assert!(nvrc.nvidia_smi_pl().is_ok()); + nvrc.nvidia_smi_pl(); } #[test] fn test_srs_none() { let nvrc = NVRC::default(); - assert!(nvrc.nvidia_smi_srs().is_ok()); + nvrc.nvidia_smi_srs(); } - // When fields are Some, nvidia-smi is called (fails without NVIDIA hardware) + // When fields are Some, nvidia-smi is called (panics without NVIDIA hardware) #[test] fn test_lmc_some_fails_without_nvidia_smi() { let mut nvrc = NVRC::default(); nvrc.nvidia_smi_lmc = Some(1000); - let err = nvrc.nvidia_smi_lmc().unwrap_err(); - // Should fail mentioning nvidia-smi binary - assert!( - err.to_string().contains("nvidia-smi"), - "error should mention nvidia-smi: {}", - err - ); + let result = panic::catch_unwind(panic::AssertUnwindSafe(|| { + nvrc.nvidia_smi_lmc(); + })); + assert!(result.is_err()); } #[test] fn test_lgc_some_fails_without_nvidia_smi() { let mut nvrc = NVRC::default(); nvrc.nvidia_smi_lgc = Some(1500); - let err = nvrc.nvidia_smi_lgc().unwrap_err(); - assert!( - err.to_string().contains("nvidia-smi"), - "error should mention nvidia-smi: {}", - err - ); + let result = panic::catch_unwind(panic::AssertUnwindSafe(|| { + nvrc.nvidia_smi_lgc(); + })); + assert!(result.is_err()); } #[test] fn test_pl_some_fails_without_nvidia_smi() { let mut nvrc = NVRC::default(); nvrc.nvidia_smi_pl = Some(300); - let err = nvrc.nvidia_smi_pl().unwrap_err(); - assert!( - err.to_string().contains("nvidia-smi"), - "error should mention nvidia-smi: {}", - err - ); + let result = panic::catch_unwind(panic::AssertUnwindSafe(|| { + nvrc.nvidia_smi_pl(); + })); + assert!(result.is_err()); } #[test] fn test_srs_some_fails_without_nvidia_smi() { let mut nvrc = NVRC::default(); nvrc.nvidia_smi_srs = Some("1".into()); - let err = nvrc.nvidia_smi_srs().unwrap_err(); - assert!( - err.to_string().contains("nvidia-smi"), - "error should mention nvidia-smi: {}", - err - ); + let result = panic::catch_unwind(panic::AssertUnwindSafe(|| { + nvrc.nvidia_smi_srs(); + })); + assert!(result.is_err()); } } diff --git a/src/syslog.rs b/src/syslog.rs index 8e0d336..d04bf5e 100644 --- a/src/syslog.rs +++ b/src/syslog.rs @@ -57,8 +57,9 @@ fn poll_socket(sock: &UnixDatagram) -> std::io::Result> { /// Lazily initializes /dev/log on first call. /// Drains one message per call—rate-limited to prevent DoS by syslog flooding. /// Caller loops at ~2 msg/sec (500ms sleep between calls). -pub fn poll() -> std::io::Result<()> { - poll_at(Path::new(DEV_LOG)) +pub fn poll() { + use crate::macros::ResultExt; + poll_at(Path::new(DEV_LOG)).or_panic("syslog poll"); } /// Internal: poll a specific socket path (for unit tests). @@ -267,8 +268,9 @@ mod tests { #[test] fn test_poll_dev_log() { - // poll() tries to bind /dev/log - may fail if already bound or no permission + use std::panic; + // poll() tries to bind /dev/log - may panic if already bound or no permission // Just exercise the code path, don't assert success - let _ = poll(); + let _ = panic::catch_unwind(poll); } } diff --git a/src/toolkit.rs b/src/toolkit.rs index 3f5414d..ea7de5a 100644 --- a/src/toolkit.rs +++ b/src/toolkit.rs @@ -7,38 +7,40 @@ //! can discover and mount GPU devices without needing the legacy hook. use crate::execute::foreground; -use anyhow::Result; const NVIDIA_CTK: &str = "/bin/nvidia-ctk"; /// Run nvidia-ctk with given arguments. -fn ctk(args: &[&str]) -> Result<()> { - foreground(NVIDIA_CTK, args) +fn ctk(args: &[&str]) { + foreground(NVIDIA_CTK, args); } /// Generate CDI spec for GPU device discovery. /// CDI allows container runtimes (containerd, CRI-O) to inject GPU devices /// without nvidia-docker. The spec is written to /var/run/cdi/nvidia.yaml /// where runtimes expect to find it. -pub fn nvidia_ctk_cdi() -> Result<()> { - ctk(&["-d", "cdi", "generate", "--output=/var/run/cdi/nvidia.yaml"]) +pub fn nvidia_ctk_cdi() { + ctk(&["-d", "cdi", "generate", "--output=/var/run/cdi/nvidia.yaml"]); } #[cfg(test)] mod tests { use super::*; + use std::panic; #[test] fn test_ctk_fails_without_binary() { - let err = ctk(&["--version"]).unwrap_err(); - // Should fail because nvidia-ctk binary doesn't exist - assert!(err.to_string().contains("nvidia-ctk")); + let result = panic::catch_unwind(|| { + ctk(&["--version"]); + }); + assert!(result.is_err()); } #[test] fn test_nvidia_ctk_cdi_fails_without_binary() { - let err = nvidia_ctk_cdi().unwrap_err(); - // Should fail because nvidia-ctk binary doesn't exist - assert!(err.to_string().contains("nvidia-ctk")); + let result = panic::catch_unwind(|| { + nvidia_ctk_cdi(); + }); + assert!(result.is_err()); } }