Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ version = "0.1.1"
edition = "2021"

[dependencies]
anyhow = "1.0.100"
nix = { version = "0.30.1", features = ["fs", "mount", "user", "process", "reboot", "signal", "mman", "poll", "ioctl"] }
cfg-if = "1.0.4"
log = "0.4.29"
Expand Down
76 changes: 35 additions & 41 deletions src/daemon.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) NVIDIA CORPORATION

use anyhow::{Context, Result};

use crate::execute::background;
use crate::macros::ResultExt;
use crate::nvrc::NVRC;
use std::fs;

Expand Down Expand Up @@ -41,62 +40,58 @@ impl NVRC {
/// nvidia-persistenced keeps GPU state warm between container invocations,
/// reducing cold-start latency. UVM persistence mode enables unified memory
/// optimizations. Enabled by default since most workloads benefit from it.
pub fn nvidia_persistenced(&mut self) -> Result<()> {
pub fn nvidia_persistenced(&mut self) {
self.spawn_persistenced("/var/run/nvidia-persistenced", "/bin/nvidia-persistenced")
}

fn spawn_persistenced(&mut self, run_dir: &str, bin: &str) -> Result<()> {
fs::create_dir_all(run_dir).with_context(|| format!("create_dir_all {}", run_dir))?;
fn spawn_persistenced(&mut self, run_dir: &str, bin: &str) {
fs::create_dir_all(run_dir).or_panic(format_args!("create_dir_all {run_dir}"));
let uvm_enabled = self.uvm_persistence_mode.unwrap_or(true);
let args = persistenced_args(uvm_enabled);
let child = background(bin, &args)?;
let child = background(bin, &args);
self.track_daemon("nvidia-persistenced", child);
Ok(())
}

/// nv-hostengine is the DCGM backend daemon. Only started when DCGM monitoring
/// is explicitly requested - not needed for basic GPU workloads.
pub fn nv_hostengine(&mut self) -> Result<()> {
pub fn nv_hostengine(&mut self) {
self.spawn_hostengine("/bin/nv-hostengine")
}

fn spawn_hostengine(&mut self, bin: &str) -> Result<()> {
fn spawn_hostengine(&mut self, bin: &str) {
if !self.dcgm_enabled.unwrap_or(false) {
return Ok(());
return;
}
let child = background(bin, hostengine_args())?;
let child = background(bin, hostengine_args());
self.track_daemon("nv-hostengine", child);
Ok(())
}

/// dcgm-exporter exposes GPU metrics for Prometheus. Only started when DCGM
/// is enabled - adds overhead so disabled by default.
pub fn dcgm_exporter(&mut self) -> Result<()> {
pub fn dcgm_exporter(&mut self) {
self.spawn_dcgm_exporter("/bin/dcgm-exporter")
}

fn spawn_dcgm_exporter(&mut self, bin: &str) -> Result<()> {
fn spawn_dcgm_exporter(&mut self, bin: &str) {
if !self.dcgm_enabled.unwrap_or(false) {
return Ok(());
return;
}
let child = background(bin, dcgm_exporter_args())?;
let child = background(bin, dcgm_exporter_args());
self.track_daemon("dcgm-exporter", child);
Ok(())
}

/// NVSwitch fabric manager is only needed for multi-GPU NVLink topologies.
/// Disabled by default since most VMs have single GPUs.
pub fn nv_fabricmanager(&mut self) -> Result<()> {
pub fn nv_fabricmanager(&mut self) {
self.spawn_fabricmanager("/bin/nv-fabricmanager")
}

fn spawn_fabricmanager(&mut self, bin: &str) -> Result<()> {
fn spawn_fabricmanager(&mut self, bin: &str) {
if !self.fabricmanager_enabled.unwrap_or(false) {
return Ok(());
return;
}
let child = background(bin, fabricmanager_args())?;
let child = background(bin, fabricmanager_args());
self.track_daemon("nv-fabricmanager", child);
Ok(())
}
}

Expand Down Expand Up @@ -152,20 +147,20 @@ mod tests {
fn test_nv_hostengine_skipped_by_default() {
// DCGM disabled by default - should be a no-op, no daemon spawned
let mut nvrc = NVRC::default();
assert!(nvrc.nv_hostengine().is_ok());
assert!(nvrc.check_daemons().is_ok());
nvrc.nv_hostengine();
nvrc.check_daemons();
}

#[test]
fn test_dcgm_exporter_skipped_by_default() {
let mut nvrc = NVRC::default();
assert!(nvrc.dcgm_exporter().is_ok());
nvrc.dcgm_exporter();
}

#[test]
fn test_nv_fabricmanager_skipped_by_default() {
let mut nvrc = NVRC::default();
assert!(nvrc.nv_fabricmanager().is_ok());
nvrc.nv_fabricmanager();
}

#[test]
Expand All @@ -174,14 +169,13 @@ mod tests {
let run_dir = tmpdir.path().join("nvidia-persistenced");

let mut nvrc = NVRC::default();
let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true");
assert!(result.is_ok());
nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true");

// Directory should be created
assert!(run_dir.exists());

// Daemon should be tracked and exit cleanly
assert!(nvrc.check_daemons().is_ok());
nvrc.check_daemons();
}

#[test]
Expand All @@ -191,48 +185,48 @@ mod tests {

let mut nvrc = NVRC::default();
nvrc.uvm_persistence_mode = Some(false); // Tests the else branch for args
let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true");
assert!(result.is_ok());
nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/bin/true");
}

#[test]
fn test_spawn_hostengine_success() {
let mut nvrc = NVRC::default();
nvrc.dcgm_enabled = Some(true);
let result = nvrc.spawn_hostengine("/bin/true");
assert!(result.is_ok());
assert!(nvrc.check_daemons().is_ok());
nvrc.spawn_hostengine("/bin/true");
nvrc.check_daemons();
}

#[test]
fn test_spawn_dcgm_exporter_success() {
let mut nvrc = NVRC::default();
nvrc.dcgm_enabled = Some(true);
let result = nvrc.spawn_dcgm_exporter("/bin/true");
assert!(result.is_ok());
nvrc.spawn_dcgm_exporter("/bin/true");
}

#[test]
fn test_spawn_fabricmanager_success() {
let mut nvrc = NVRC::default();
nvrc.fabricmanager_enabled = Some(true);
let result = nvrc.spawn_fabricmanager("/bin/true");
assert!(result.is_ok());
nvrc.spawn_fabricmanager("/bin/true");
}

#[test]
fn test_spawn_persistenced_binary_not_found() {
use std::panic;

let tmpdir = TempDir::new().unwrap();
let run_dir = tmpdir.path().join("nvidia-persistenced");

let mut nvrc = NVRC::default();
let result = nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/nonexistent/binary");
let result = panic::catch_unwind(|| {
let mut nvrc = NVRC::default();
nvrc.spawn_persistenced(run_dir.to_str().unwrap(), "/nonexistent/binary");
});
assert!(result.is_err());
}

#[test]
fn test_check_daemons_empty() {
let mut nvrc = NVRC::default();
assert!(nvrc.check_daemons().is_ok());
nvrc.check_daemons();
}
}
60 changes: 28 additions & 32 deletions src/execute.rs
Original file line number Diff line number Diff line change
@@ -1,109 +1,105 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) NVIDIA CORPORATION

use anyhow::{anyhow, Context, Result};
use std::process::{Child, Command, Stdio};

use crate::kmsg::kmsg;
use crate::macros::ResultExt;

/// Run a command and block until completion. Output goes to kmsg so it appears
/// in dmesg/kernel log - the only reliable log destination in minimal VMs.
/// Used for setup commands that must succeed before continuing (nvidia-smi, modprobe).
pub fn foreground(command: &str, args: &[&str]) -> Result<()> {
pub fn foreground(command: &str, args: &[&str]) {
debug!("{} {}", command, args.join(" "));

let kmsg_file = kmsg().context("Failed to open kmsg device")?;
let kmsg_file = kmsg();
let status = Command::new(command)
.args(args)
.stdout(Stdio::from(kmsg_file.try_clone().unwrap()))
.stderr(Stdio::from(kmsg_file))
.status()
.context(format!("failed to execute {command}"))?;
.or_panic(format_args!("execute {command}"));

if !status.success() {
return Err(anyhow!("{} failed with status: {}", command, status));
panic!("{command} failed with status: {status}");
}
Ok(())
}

/// Spawn a daemon without waiting. Returns Child so caller can track it later.
/// Used for long-running services (nvidia-persistenced, fabricmanager) that run
/// alongside kata-agent. Output to kmsg for visibility in kernel log.
pub fn background(command: &str, args: &[&str]) -> Result<Child> {
pub fn background(command: &str, args: &[&str]) -> Child {
debug!("{} {}", command, args.join(" "));
let kmsg_file = kmsg().context("Failed to open kmsg device")?;
let kmsg_file = kmsg();
Command::new(command)
.args(args)
.stdout(Stdio::from(kmsg_file.try_clone().unwrap()))
.stderr(Stdio::from(kmsg_file))
.spawn()
.with_context(|| format!("Failed to start {}", command))
.or_panic(format_args!("start {command}"))
}

#[cfg(test)]
mod tests {
use super::*;
use std::panic;

// ==================== foreground tests ====================

#[test]
fn test_foreground_success() {
let result = foreground("/bin/true", &[]);
assert!(result.is_ok());
foreground("/bin/true", &[]);
}

#[test]
fn test_foreground_failure_exit_code() {
// Command runs but exits non-zero
let result = foreground("/bin/false", &[]);
// Command runs but exits non-zero - should panic
let result = panic::catch_unwind(|| {
foreground("/bin/false", &[]);
});
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("failed"));
}

#[test]
fn test_foreground_not_found() {
// Command doesn't exist - triggers .context() error path
let result = foreground("/nonexistent/command", &[]);
// Command doesn't exist - should panic
let result = panic::catch_unwind(|| {
foreground("/nonexistent/command", &[]);
});
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("execute"));
}

#[test]
fn test_foreground_with_args() {
let result = foreground("/bin/sh", &["-c", "exit 0"]);
assert!(result.is_ok());
foreground("/bin/sh", &["-c", "exit 0"]);

let result = foreground("/bin/sh", &["-c", "exit 42"]);
let result = panic::catch_unwind(|| {
foreground("/bin/sh", &["-c", "exit 42"]);
});
assert!(result.is_err());
}

// ==================== background tests ====================

#[test]
fn test_background_spawns() {
let result = background("/bin/sleep", &["0.01"]);
assert!(result.is_ok());
let mut child = result.unwrap();
let mut child = background("/bin/sleep", &["0.01"]);
let status = child.wait().unwrap();
assert!(status.success());
}

#[test]
fn test_background_not_found() {
// Command doesn't exist - triggers .with_context() error path
let result = background("/nonexistent/command", &[]);
// Command doesn't exist - should panic
let result = panic::catch_unwind(|| {
background("/nonexistent/command", &[]);
});
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("start"), "error should mention start: {}", err);
}

#[test]
fn test_background_check_later() {
let result = background("/bin/sh", &["-c", "exit 7"]);
assert!(result.is_ok());
let mut child = result.unwrap();
let mut child = background("/bin/sh", &["-c", "exit 7"]);
let status = child.wait().unwrap();
assert!(!status.success());
assert_eq!(status.code(), Some(7));
Expand Down
Loading
Loading