Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ RUN cargo install cargo-nextest cargo-audit cargo-deny --locked
RUN apt-get update && apt-get install -y \
fuse3 libfuse3-dev autoconf automake libtool perl libclang-dev clang cmake \
musl-tools iproute2 iptables passt dnsmasq qemu-utils e2fsprogs btrfs-progs \
parted fdisk podman skopeo git curl sudo procps zstd busybox-static cpio uidmap \
parted fdisk podman skopeo git curl sudo procps zstd busybox-static cpio uidmap iputils-ping \
flex bison bc libelf-dev libssl-dev libseccomp-dev \
&& rm -rf /var/lib/apt/lists/*

Expand Down
2 changes: 1 addition & 1 deletion Containerfile.nested
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ FROM ubuntu:24.04

RUN apt-get update && apt-get install -y --no-install-recommends \
iproute2 iptables podman python3 kmod procps fuse3 curl nginx \
fuse-overlayfs sudo iperf3 rsync btrfs-progs \
fuse-overlayfs sudo iperf3 rsync btrfs-progs iputils-ping \
&& rm -rf /var/lib/apt/lists/*

# Configure podman to use fuse-overlayfs (required for nested containers)
Expand Down
102 changes: 79 additions & 23 deletions benches/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -501,20 +501,18 @@ impl CloneFixture {
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let stdout = String::from_utf8_lossy(&output.stdout);
// Dump serve log for diagnostics
let serve_log = "/tmp/fcvm-bench-serve-clone-exec.log";
if let Ok(logs) = std::fs::read_to_string(serve_log) {
let tail: Vec<&str> = logs.lines().rev().take(30).collect();
eprintln!("=== Last 30 lines of serve log ===");
for line in tail.into_iter().rev() {
eprintln!("{}", line);
}
}
let serve_log_path = "/tmp/fcvm-bench-serve-clone-exec.log";
let serve_log_content = std::fs::read_to_string(serve_log_path).unwrap_or_default();
panic!(
"clone exec failed after {:.1}s:\nstderr: {}\nstdout: {}",
"clone exec failed after {:.1}s:\n\
stderr: {}\n\
stdout: {}\n\
\n=== full serve log ({}) ===\n{}",
elapsed.as_secs_f64(),
stderr,
stdout
stdout,
serve_log_path,
serve_log_content,
);
}

Expand Down Expand Up @@ -637,16 +635,69 @@ impl CloneFixture {
.map(|o| String::from_utf8_lossy(&o.stdout).to_string())
.unwrap_or_default();

// Last 30 lines of clone log (full, not filtered)
let log_tail: String = clone_log
.lines()
.rev()
.take(30)
.collect::<Vec<_>>()
.into_iter()
.rev()
.collect::<Vec<_>>()
.join("\n");
// Get holder PID for namespace diagnostics
let holder_diag = Command::new(&fcvm)
.args(["ls", "--json", "--pid", &clone_pid.to_string()])
.output()
.ok()
.and_then(|o| {
let stdout = String::from_utf8_lossy(&o.stdout);
serde_json::from_str::<Vec<serde_json::Value>>(&stdout).ok()
})
.and_then(|vms| {
vms.first()
.and_then(|v| v["holder_pid"].as_u64())
.map(|hp| {
let hp_str = hp.to_string();
let mut diag = String::new();

// ARP cache in namespace
if let Ok(o) = Command::new("nsenter")
.args(["-t", &hp_str, "-n", "ip", "neigh", "show"])
.output()
{
diag.push_str(&format!(
"\n=== ARP cache (ns {}) ===\n{}",
hp,
String::from_utf8_lossy(&o.stdout)
));
}

// Namespace sockets
if let Ok(o) = Command::new("nsenter")
.args(["-t", &hp_str, "-n", "ss", "-tnp"])
.output()
{
diag.push_str(&format!(
"\n=== namespace sockets (ns {}) ===\n{}",
hp,
String::from_utf8_lossy(&o.stdout)
));
}

// Bridge links
if let Ok(o) = Command::new("nsenter")
.args(["-t", &hp_str, "-n", "bridge", "link"])
.output()
{
diag.push_str(&format!(
"\n=== bridge links (ns {}) ===\n{}",
hp,
String::from_utf8_lossy(&o.stdout)
));
}

diag
})
})
.unwrap_or_default();

// VM listening sockets
let vm_ss = Command::new(&fcvm)
.args(["exec", "--pid", &clone_pid.to_string(), "--", "ss", "-tnl"])
.output()
.map(|o| String::from_utf8_lossy(&o.stdout).to_string())
.unwrap_or_else(|e| format!("exec ss failed: {}", e));

panic!(
"clone HTTP failed after 10 attempts\n\
Expand All @@ -657,7 +708,9 @@ impl CloneFixture {
\n=== listening sockets on {} ===\n{}\
\n=== pasta processes ===\n{}\
\n=== stale process counts ===\n{}\
\n=== clone log (last 30 lines) ===\n{}",
{}\
\n=== VM listening sockets ===\n{}\
\n=== full clone log ({}) ===\n{}",
loopback_ip,
health_port,
last_response.len(),
Expand All @@ -667,7 +720,10 @@ impl CloneFixture {
ss_check,
pasta_check,
stale_check,
log_tail,
holder_diag,
vm_ss,
clone_log_path,
clone_log,
);
}

Expand Down
16 changes: 9 additions & 7 deletions src/commands/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,8 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
// With bridge mode, guest IP is always 10.0.2.100 on pasta network
// Each clone runs in its own namespace, so no IP conflict
let net = PastaNetwork::new(vm_id.clone(), tap_device.clone(), port_mappings.clone())
.with_loopback_ip(loopback_ip);
.with_loopback_ip(loopback_ip)
.with_restore_mode();
Box::new(net)
}
};
Expand Down Expand Up @@ -1081,13 +1082,14 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
}
}

// Verify pasta's L2 forwarding path has ARP resolved before starting health monitor.
// Verify pasta's L2 forwarding path is ready before starting health monitor.
// After snapshot restore, pasta may not have learned the guest's MAC yet.
// This probes each forwarded port to trigger and verify ARP resolution —
// no guest service needs to be running, just the guest's kernel.
if let Err(e) = network.verify_port_forwarding().await {
warn!(vm_id = %vm_id, error = %e, "port forwarding verification failed");
}
// This pings the guest to trigger ARP resolution, then probes each forwarded
// port to confirm end-to-end forwarding works.
network
.verify_port_forwarding()
.await
.context("port forwarding verification failed after snapshot restore")?;

// Spawn health monitor task with startup snapshot trigger support
let health_monitor_handle = crate::health::spawn_health_monitor_full(
Expand Down
73 changes: 48 additions & 25 deletions src/network/pasta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ pub struct PastaNetwork {
pid_file: Option<PathBuf>,
loopback_ip: Option<String>, // Unique loopback IP for port forwarding (127.x.y.z)
holder_pid: Option<u32>, // Namespace PID (set in post_start)
restore_mode: bool, // Skip port probe in post_start (VM not loaded yet)
}

impl PastaNetwork {
Expand All @@ -90,6 +91,7 @@ impl PastaNetwork {
pid_file: None,
loopback_ip: None,
holder_pid: None,
restore_mode: false,
}
}

Expand All @@ -106,6 +108,19 @@ impl PastaNetwork {
self
}

/// Flag this network as part of a snapshot restore so that post_start()
/// does not run its port forwarding probe.
///
/// In the restore flow, post_start() is called BEFORE the VM snapshot has
/// been loaded into Firecracker, so no guest exists yet. Probing ports at
/// that moment makes pasta attempt L2 forwarding toward a non-existent
/// guest, which can poison pasta's internal connection tracking and cause
/// later connections to return 0 bytes. Forwarding is verified afterwards
/// through verify_port_forwarding(), once the VM has been resumed and
/// fc-agent has sent its gratuitous ARP.
///
/// Builder-style: takes ownership of `self` and hands it back with
/// `restore_mode` set to `true`.
pub fn with_restore_mode(self) -> Self {
    let mut network = self;
    network.restore_mode = true;
    network
}

/// Get the loopback IP assigned to this VM for port forwarding
pub fn loopback_ip(&self) -> Option<&str> {
self.loopback_ip.as_deref()
Expand Down Expand Up @@ -604,7 +619,13 @@ impl NetworkManager for PastaNetwork {
// The PID file only means pasta spawned, not that ports are bound.
// Health checks use nsenter (bridge path), so without this check
// "healthy" doesn't mean port forwarding works.
if !self.port_mappings.is_empty() {
//
// Skip in restore mode: during snapshot restore, post_start() runs BEFORE
// the VM snapshot is loaded. Probing ports now forces pasta to attempt L2
// forwarding to a non-existent guest, poisoning its connection state and
// causing subsequent connections to return 0 bytes. The port check happens
// later via verify_port_forwarding() after the VM is actually running.
if !self.restore_mode && !self.port_mappings.is_empty() {
self.wait_for_port_forwarding().await?;
Comment on lines +628 to 629

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve hard failure when restore-mode probe is skipped

When restore_mode is enabled, this branch skips the only startup-time port-forward readiness check that previously failed snapshot run on broken forwarding. The later check in cmd_snapshot_run (network.verify_port_forwarding()) only logs a warning on error, so a clone can now come up “healthy” while all published ports are unusable; this is a behavior regression specific to rootless snapshot restores with --publish where pasta never reaches a working forwarding state.

Useful? React with 👍 / 👎.

}

Expand Down Expand Up @@ -641,14 +662,14 @@ impl NetworkManager for PastaNetwork {
/// Verify pasta's L2 forwarding path is ready after snapshot restore.
///
/// After snapshot restore, pasta needs the guest's MAC address to forward
/// L2 frames. fc-agent sends a gratuitous ARP (ping to gateway) during
/// restore, which broadcasts the guest's MAC to all bridge ports including
/// pasta0. We verify this by checking the namespace's ARP table — if the
/// namespace kernel learned the guest's MAC, pasta received the same
/// broadcast frame.
/// L2 frames. We actively ping the guest from the namespace to trigger a
/// normal ARP exchange. With arp_accept=0 (Linux default), the guest's
/// gratuitous arping does NOT create neighbor entries — only updates
/// existing ones. The active ping forces the namespace kernel to send an
/// ARP request that the guest replies to, creating a REACHABLE entry.
///
/// This runs after fc-agent's output vsock reconnects, so the gratuitous
/// ARP has already been sent. Typically resolves on the first check.
/// Once ARP is resolved, we probe each forwarded port to confirm pasta's
/// loopback port forwarding is end-to-end functional.
async fn verify_port_forwarding(&self) -> Result<()> {
if self.port_mappings.is_empty() {
return Ok(());
Expand All @@ -662,38 +683,40 @@ impl NetworkManager for PastaNetwork {
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
let nsenter_prefix = self.build_nsenter_prefix(holder_pid);

// Ping the guest from inside the namespace to trigger ARP resolution.
// A successful ping proves ARP resolved AND the guest is reachable.
// Use 200ms timeout for ~16 retries within the 5s deadline.
loop {
let output = Command::new(&nsenter_prefix[0])
.args(&nsenter_prefix[1..])
.args(["ip", "neigh", "show", GUEST_IP, "dev", BRIDGE_DEVICE])
.stdout(Stdio::piped())
.stderr(Stdio::null())
.args(["ping", "-c", "1", "-W", "0.2", GUEST_IP])
.stdout(Stdio::null())
.stderr(Stdio::piped())
.output()
.await
.context("checking ARP table in namespace")?;

let stdout = String::from_utf8_lossy(&output.stdout);
// Entry looks like: "10.0.2.100 lladdr aa:bb:cc:dd:ee:ff REACHABLE"
// If lladdr is present, the guest's MAC is known.
if stdout.contains("lladdr") {
info!(guest_ip = GUEST_IP, arp = %stdout.trim(), "ARP resolved");
// ARP is resolved but pasta's loopback port forwarding may not be
// ready yet. Probe each mapped port on the loopback IP to confirm
// end-to-end forwarding works before declaring ready.
.context("running ping via nsenter in namespace")?;

if output.status.success() {
info!(
guest_ip = GUEST_IP,
"guest reachable via ping, ARP resolved"
);
self.wait_for_port_forwarding().await?;
return Ok(());
}

if std::time::Instant::now() > deadline {
let stderr = String::from_utf8_lossy(&output.stderr);
let stderr = stderr.trim();
anyhow::bail!(
"ARP for guest {} not resolved within 5s on {}",
"ARP for guest {} not resolved within 5s on {}: ping stderr: {}",
GUEST_IP,
BRIDGE_DEVICE
BRIDGE_DEVICE,
if stderr.is_empty() { "(empty)" } else { stderr }
);
}

debug!(guest_ip = GUEST_IP, "ARP not yet resolved, waiting");
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
debug!(guest_ip = GUEST_IP, "ping to guest failed, retrying");
}
}

Expand Down
Loading