Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions openhcl/openhcl_boot/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,13 +289,25 @@ fn build_kernel_command_line(
)?;
}

// Only when explicitly supported by Host.
// Generate the NVMe keep alive command line which should look something
// like: OPENHCL_NVME_KEEP_ALIVE=disabled,host,privatepool
// TODO: Move from command line to device tree when stabilized.
if partition_info.nvme_keepalive
&& vtl2_pool_supported
&& !partition_info.boot_options.disable_nvme_keep_alive
{
write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=1 ")?;
write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=")?;

if partition_info.boot_options.disable_nvme_keep_alive {
write!(cmdline, "disabled,")?;
}

if partition_info.nvme_keepalive {
write!(cmdline, "host,")?;
} else {
write!(cmdline, "nohost,")?;
}

if vtl2_pool_supported {
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
Comment on lines +298 to +310
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When disable_nvme_keep_alive is false, the command line will not include the "disabled," prefix. However, when it's true, the pattern becomes "disabled,host,privatepool" or "disabled,nohost,noprivatepool", etc.

The FromStr implementation at line 97 handles strings starting with "disabled," but doesn't have explicit matches for patterns like "disabled,host,privatepool". While the catch-all at line 97 will map these to Disabled, this means the host and privatepool information is ignored even though it's being generated.

Consider either:

  1. Not generating the host/privatepool parts when disabled is true (since they're ignored)
  2. Or adding explicit parsing for these patterns if the information is meant to be preserved for logging/debugging
Suggested change
write!(cmdline, "disabled,")?;
}
if partition_info.nvme_keepalive {
write!(cmdline, "host,")?;
} else {
write!(cmdline, "nohost,")?;
}
if vtl2_pool_supported {
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
write!(cmdline, "disabled ")?;
} else {
if partition_info.nvme_keepalive {
write!(cmdline, "host,")?;
} else {
write!(cmdline, "nohost,")?;
}
if vtl2_pool_supported {
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
}

Copilot uses AI. Check for mistakes.
}
Comment on lines +308 to 311
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The space after "privatepool" and "noprivatepool" will become part of the environment variable value, causing parsing to fail. The FromStr implementation for KeepAliveConfig (line 93 in options.rs) does not trim whitespace before parsing.

Move the space outside the environment variable value:

if vtl2_pool_supported {
    write!(cmdline, "privatepool")?;
} else {
    write!(cmdline, "noprivatepool")?;
}
write!(cmdline, " ")?;
Suggested change
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
}
write!(cmdline, "privatepool")?;
} else {
write!(cmdline, "noprivatepool")?;
}
write!(cmdline, " ")?;

Copilot uses AI. Check for mistakes.

if let Some(sidecar) = sidecar {
Expand Down
73 changes: 48 additions & 25 deletions openhcl/underhill_core/src/dispatch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ pub(crate) struct LoadedVm {

pub _periodic_telemetry_task: Task<()>,

pub nvme_keep_alive: bool,
pub nvme_keep_alive: KeepAliveConfig,
pub mana_keep_alive: KeepAliveConfig,
pub test_configuration: Option<TestScenarioConfig>,
pub dma_manager: OpenhclDmaManager,
Expand Down Expand Up @@ -306,7 +306,10 @@ impl LoadedVm {
WorkerRpc::Restart(rpc) => {
let state = async {
let running = self.stop().await;
match self.save(None, false, KeepAliveConfig::Disabled).await {
match self
.save(None, KeepAliveConfig::Disabled, KeepAliveConfig::Disabled)
.await
{
Ok(servicing_state) => Some((rpc, servicing_state)),
Err(err) => {
if running {
Expand Down Expand Up @@ -372,7 +375,9 @@ impl LoadedVm {
UhVmRpc::Save(rpc) => {
rpc.handle_failable(async |()| {
let running = self.stop().await;
let r = self.save(None, false, KeepAliveConfig::Disabled).await;
let r = self
.save(None, KeepAliveConfig::Disabled, KeepAliveConfig::Disabled)
.await;
if running {
self.start(None).await;
}
Expand Down Expand Up @@ -577,7 +582,10 @@ impl LoadedVm {

// NOTE: This is set via the corresponding env arg, as this feature is
// experimental.
let nvme_keepalive = self.nvme_keep_alive && capabilities_flags.enable_nvme_keepalive();
if !capabilities_flags.enable_nvme_keepalive() {
self.nvme_keep_alive = KeepAliveConfig::Disabled
};

if !capabilities_flags.enable_mana_keepalive() {
self.mana_keep_alive = KeepAliveConfig::Disabled
};
Expand All @@ -595,15 +603,23 @@ impl LoadedVm {
anyhow::bail!("cannot service underhill while paused");
}

let mut state = self.save(Some(deadline), nvme_keepalive, self.mana_keep_alive.clone()).await?;
let mut state = self
.save(
Some(deadline),
self.nvme_keep_alive.clone(),
self.mana_keep_alive.clone(),
)
.await?;
state.init_state.correlation_id = Some(correlation_id);

// Unload any network devices.
let shutdown_mana = async {
if let Some(network_settings) = self.network_settings.as_mut() {
network_settings
.unload_for_servicing()
.instrument(tracing::info_span!("shutdown_mana", CVM_ALLOWED, %correlation_id))
.instrument(
tracing::info_span!("shutdown_mana", CVM_ALLOWED, %correlation_id),
)
.await;
}
};
Expand All @@ -612,8 +628,13 @@ impl LoadedVm {
let shutdown_nvme = async {
if let Some(nvme_manager) = self.nvme_manager.take() {
nvme_manager
.shutdown(nvme_keepalive)
.instrument(tracing::info_span!("shutdown_nvme_vfio", CVM_ALLOWED, %correlation_id, %nvme_keepalive))
.shutdown(self.nvme_keep_alive.is_enabled())
.instrument(tracing::info_span!(
"shutdown_nvme_vfio",
CVM_ALLOWED,
correlation_id = %correlation_id,
nvme_keep_alive_enabled = self.nvme_keep_alive.is_enabled()
))
.await;
}
};
Expand All @@ -622,18 +643,19 @@ impl LoadedVm {
// restart.
let shutdown_pci = async {
pci_shutdown::shutdown_pci_devices()
.instrument(tracing::info_span!("shutdown_pci_devices", CVM_ALLOWED, %correlation_id))
.instrument(
tracing::info_span!("shutdown_pci_devices", CVM_ALLOWED, %correlation_id),
)
.await
};

// Save the persisted state used by the next openhcl_boot.
let cpus_with_mapped_interrupts = match state
.init_state
.nvme_state
.as_ref() {
Some(nvme_state) => crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state),
None => vec![],
};
let cpus_with_mapped_interrupts = match state.init_state.nvme_state.as_ref() {
Some(nvme_state) => {
crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state)
}
None => vec![],
};

crate::loader::vtl2_config::write_persisted_info(
self.runtime_params.parsed_openhcl_boot(),
Expand Down Expand Up @@ -772,7 +794,7 @@ impl LoadedVm {
async fn save(
&mut self,
_deadline: Option<std::time::Instant>,
nvme_keepalive_flag: bool,
nvme_keepalive_mode: KeepAliveConfig,
mana_keepalive_mode: KeepAliveConfig,
) -> anyhow::Result<ServicingState> {
assert!(!self.state_units.is_running());
Expand All @@ -785,23 +807,24 @@ impl LoadedVm {
//
// This has to happen before saving the network state, otherwise its allocations
// are marked as Free and are unable to be restored.
let dma_manager_state = if nvme_keepalive_flag || mana_keepalive_mode.is_enabled() {
use vmcore::save_restore::SaveRestore;
Some(self.dma_manager.save().context("dma_manager save failed")?)
} else {
None
};
let dma_manager_state =
if nvme_keepalive_mode.is_enabled() || mana_keepalive_mode.is_enabled() {
use vmcore::save_restore::SaveRestore;
Some(self.dma_manager.save().context("dma_manager save failed")?)
} else {
None
};

// Only save NVMe state when there are NVMe controllers and keep alive
// was enabled.
let nvme_state = if let Some(n) = &self.nvme_manager {
// DEVNOTE: A subtlety here is that the act of saving the NVMe state also causes the driver
// to enter a state where subsequent teardown operations will noop. There is a STRONG
// correlation between save/restore and keepalive.
n.save(nvme_keepalive_flag)
n.save(nvme_keepalive_mode.is_enabled())
.instrument(tracing::info_span!(
"nvme_manager_save",
nvme_keepalive_flag,
nvme_keepalive_mode = %nvme_keepalive_mode.is_enabled(),
CVM_ALLOWED
))
.await
Expand Down
2 changes: 1 addition & 1 deletion openhcl/underhill_core/src/emuplat/netvsp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
// Licensed under the MIT License.

use crate::dispatch::vtl2_settings_worker::wait_for_pci_path;
use crate::options::KeepAliveConfig;
use crate::vpci::HclVpciBusControl;
use anyhow::Context;
use async_trait::async_trait;
Expand Down Expand Up @@ -40,6 +39,7 @@ use std::task::ready;
use tracing::Instrument;
use uevent::UeventListener;

use crate::options::KeepAliveConfig;
use user_driver::vfio::PciDeviceResetMethod;
use user_driver::vfio::VfioDevice;
use user_driver::vfio::VfioDmaClients;
Expand Down
30 changes: 26 additions & 4 deletions openhcl/underhill_core/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ impl FromStr for KeepAliveConfig {
"host,privatepool" => Ok(KeepAliveConfig::EnabledHostAndPrivatePoolPresent),
"nohost,privatepool" => Ok(KeepAliveConfig::DisabledHostAndPrivatePoolPresent),
"nohost,noprivatepool" => Ok(KeepAliveConfig::Disabled),
x if x.starts_with("disabled,") => Ok(KeepAliveConfig::Disabled),
_ => Err(anyhow::anyhow!("Invalid keepalive config: {}", s)),
}
}
Expand Down Expand Up @@ -207,15 +208,23 @@ pub struct Options {
/// hit exits.
pub no_sidecar_hotplug: bool,

/// (OPENHCL_NVME_KEEP_ALIVE=1) Enable nvme keep alive when servicing.
pub nvme_keep_alive: bool,
/// (OPENHCL_NVME_KEEP_ALIVE=\<KeepAliveConfig\>)
/// Configure NVMe keep alive behavior when servicing.
/// Options are:
/// - "host,privatepool" - Enable keep alive if both host and private pool support it.
/// - "nohost,privatepool" - Used when the host does not support keepalive, but a private pool is present. Keepalive is disabled.
/// - "nohost,noprivatepool" - Keepalive is disabled.
/// - "disabled,\<x\>,\<x\>" - Keepalive is disabled due to manual
/// override. Host and private pool options are ignored.
pub nvme_keep_alive: KeepAliveConfig,

/// (OPENHCL_MANA_KEEP_ALIVE=\<KeepAliveConfig\>)
/// Configure MANA keep alive behavior when servicing.
/// Options are:
/// - "host,privatepool" - Enable keep alive if both host and private pool support it.
/// - "nohost,privatepool" - Used when the host does not support keepalive, but a private pool is present. Keepalive is disabled.
/// - "nohost,noprivatepool" - Keepalive is disabled.
/// - "disabled,\<x\>,\<x\>" - TODO: This needs to be implemented for mana.
pub mana_keep_alive: KeepAliveConfig,

/// (OPENHCL_NVME_ALWAYS_FLR=1)
Expand Down Expand Up @@ -366,15 +375,28 @@ impl Options {
let no_sidecar_hotplug = parse_legacy_env_bool("OPENHCL_NO_SIDECAR_HOTPLUG");
let gdbstub = parse_legacy_env_bool("OPENHCL_GDBSTUB");
let gdbstub_port = parse_legacy_env_number("OPENHCL_GDBSTUB_PORT")?.map(|x| x as u32);
let nvme_keep_alive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE");
let nvme_keep_alive = read_env("OPENHCL_NVME_KEEP_ALIVE")
.map(|x| {
let s = x.to_string_lossy();
match s.parse::<KeepAliveConfig>() {
Ok(v) => v,
Err(e) => {
tracing::warn!(
"failed to parse OPENHCL_NVME_KEEP_ALIVE ('{s}'): {e}. Nvme keepalive will be disabled."
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: Decided to keep this here as opposed to defaulting to disabled in parse. That way we can more accurately log which variable parsing is failing (nvme vs mana)

);
KeepAliveConfig::Disabled
}
}
})
.unwrap_or(KeepAliveConfig::Disabled);
let mana_keep_alive = read_env("OPENHCL_MANA_KEEP_ALIVE")
.map(|x| {
let s = x.to_string_lossy();
match s.parse::<KeepAliveConfig>() {
Ok(v) => v,
Err(e) => {
tracing::warn!(
"failed to parse OPENHCL_MANA_KEEP_ALIVE ('{s}'): {e}. keepalive will be disabled."
"failed to parse OPENHCL_MANA_KEEP_ALIVE ('{s}'): {e}. Mana keepalive will be disabled."
);
KeepAliveConfig::Disabled
}
Expand Down
4 changes: 2 additions & 2 deletions openhcl/underhill_core/src/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ pub struct UnderhillEnvCfg {
/// Hide the isolation mode from the guest.
pub hide_isolation: bool,
/// Enable nvme keep alive.
pub nvme_keep_alive: bool,
pub nvme_keep_alive: KeepAliveConfig,
/// Enable mana keep alive.
pub mana_keep_alive: KeepAliveConfig,
/// Don't skip FLR for NVMe devices.
Expand Down Expand Up @@ -2174,7 +2174,7 @@ async fn new_underhill_vm(
// TODO: reevaluate enablement of nvme save restore when private pool
// save restore to bootshim is available.
let private_pool_available = !runtime_params.private_pool_ranges().is_empty();
let save_restore_supported = env_cfg.nvme_keep_alive && private_pool_available;
let save_restore_supported = env_cfg.nvme_keep_alive.is_enabled() && private_pool_available;

let manager = NvmeManager::new(
&driver_source,
Expand Down
Loading