Skip to content
Merged
2 changes: 1 addition & 1 deletion .claude/CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -1229,7 +1229,7 @@ fuse-pipe/benches/
- Each VM gets a unique IPv6 derived from host's /64 subnet via hash of vm_id
- Network namespace with bridge (br0) connecting TAP and veth for L2 forwarding
- Proxy NDP on default interface makes VM IPv6 routable from network fabric
- ip6tables MASQUERADE for AWS VPC source/dest checks
- ip6tables MASQUERADE for AWS VPC source/dest checks (skipped when `--ipv6-prefix` is set)
- Port forwarding via built-in TCP proxy (setns + tokio relay) on unique loopback IP (same allocation as rootless)
- IPv4 stays internal to namespace (health checks only); all external traffic uses IPv6
- Egress proxy is NOT used — IPv6 goes natively through the kernel stack
Expand Down
11 changes: 6 additions & 5 deletions DESIGN.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ iptables -t nat -A PREROUTING -d 172.30.x.1 -p tcp --dport 8080 -j DNAT --to-des
Uses veth pairs + IPv6 routing for kernel line-rate networking without userspace proxies.

**Features**:
- Requires root and a host with a global IPv6 /64 subnet
- Requires root and a global IPv6 /64 subnet, either auto-detected on the host or given explicitly via `--ipv6-prefix`
- Native IPv6 routing through the kernel stack (no userspace L4 translation)
- Each VM gets a unique IPv6 derived from the host's /64 prefix
- Port forwarding via built-in TCP proxy (`setns` + tokio relay) on loopback IP (same as rootless)
Expand All @@ -308,11 +308,12 @@ struct RoutedNetwork {
vm_ipv6: Option<String>,
default_iface: Option<String>,
proxy_handles: Vec<JoinHandle<()>>,
ipv6_prefix: Option<String>, // explicit /64 prefix (skips auto-detect + MASQUERADE)
}

async fn setup() -> Result<NetworkConfig> {
preflight_check() // root, IPv6, ip6tables
detect_host_ipv6() // find /64 subnet (or /128 with on-link /64)
self.preflight_check() // root, IPv6, ip6tables (ip6tables skipped if --ipv6-prefix)
detect_host_ipv6() // find /64 subnet (or /128 with on-link /64); skipped if --ipv6-prefix
generate_vm_ipv6(prefix, vm_id) // deterministic IPv6 from hash
create_namespace(ns_name)
create_veth_pair(host_veth, guest_veth)
Expand All @@ -323,7 +324,7 @@ async fn setup() -> Result<NetworkConfig> {
// Namespace: default IPv6 route via host veth link-local
// Host: /128 route to VM IPv6 via host veth
// Proxy NDP on default interface
// ip6tables MASQUERADE for outbound
// ip6tables MASQUERADE for outbound (skipped if --ipv6-prefix is set)
// TCP proxy port forwarding on loopback IP (setns + tokio relay)
}
```
Expand Down Expand Up @@ -1366,7 +1367,7 @@ fcvm snapshot run --pid <SERVE_PID> [OPTIONS]
--exec <CMD> Execute command in container after clone is healthy
```

Network mode, port mappings, TTY, and interactive flags are inherited from the snapshot
Network mode, port mappings, TTY, interactive flags, and `--ipv6-prefix` are inherited from the snapshot
metadata automatically — no need to re-specify them on clone.

**Examples**:
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ fcvm auto-forwards `http_proxy`/`https_proxy` from host to VM via MMDS.
- Firecracker binary in PATH
- For rootless: `passt` package (provides `pasta`)
- For bridged: sudo, iptables, iproute2
- For routed: sudo, ip6tables, iproute2, host with global IPv6 /64
- For routed: sudo, iproute2, and a global IPv6 /64 (auto-detected from the host, or given via `--ipv6-prefix`); ip6tables is required only when `--ipv6-prefix` is not set
- For rootfs build: qemu-utils, e2fsprogs

**Storage:** btrfs at `/mnt/fcvm-btrfs` (auto-created as loopback on non-btrfs hosts)
Expand Down Expand Up @@ -336,6 +336,7 @@ See [`Containerfile`](Containerfile) for the complete dependency list used in CI
--portable-volumes Path-hash inodes for cross-machine snapshot/restore
--rootfs-size <SIZE> Minimum free space on rootfs (default: 10G)
--no-snapshot Disable automatic snapshot creation
--ipv6-prefix <PREFIX> Use explicit /64 prefix for routed mode (skips auto-detect and MASQUERADE)
```

Run `fcvm --help` or `fcvm <command> --help` for full options.
Expand Down
8 changes: 8 additions & 0 deletions src/cli/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,14 @@ pub struct RunArgs {
#[arg(long, value_enum, default_value_t = NetworkMode::Rootless)]
pub network: NetworkMode,

/// Routable IPv6 /64 prefix for routed mode VM addressing.
/// Each VM gets a unique address in this prefix via NDP proxy.
/// When set, MASQUERADE is skipped (the prefix is directly routable).
/// When not set, auto-detected from host interfaces.
/// Example: --ipv6-prefix 2803:6084:7058:46f6
#[arg(long)]
pub ipv6_prefix: Option<String>,

/// HTTP health check URL. If not specified, health is based on container running status.
/// The URL hostname is sent as the Host header; the connection goes to the guest IP.
/// Example: --health-check http://myapp.example.com/status
Expand Down
1 change: 1 addition & 0 deletions src/commands/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1282,6 +1282,7 @@ pub fn build_snapshot_config(
user: vm_state.config.user.clone(),
port_mappings: vm_state.config.port_mappings.clone(),
network_mode: vm_state.config.network_mode,
ipv6_prefix: vm_state.config.ipv6_prefix.clone(),
tty: vm_state.config.tty,
interactive: vm_state.config.interactive,
},
Expand Down
12 changes: 10 additions & 2 deletions src/commands/podman/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,7 @@ pub async fn prepare_vm(mut args: RunArgs) -> Result<Option<VmContext>> {
vm_state.config.portable_volumes = args.portable_volumes;
vm_state.config.port_mappings = port_mappings.clone();
vm_state.config.network_mode = args.network.into();
vm_state.config.ipv6_prefix = args.ipv6_prefix.clone();
vm_state.config.tty = args.tty;
vm_state.config.interactive = args.interactive;
vm_state.config.user = args.user.clone();
Expand Down Expand Up @@ -650,9 +651,13 @@ pub async fn prepare_vm(mut args: RunArgs) -> Result<Option<VmContext>> {
port_mappings.clone(),
)),
NetworkMode::Routed => {
RoutedNetwork::preflight_check().context("routed mode preflight check failed")?;
let mut net =
RoutedNetwork::new(vm_id.clone(), tap_device.clone(), port_mappings.clone());
if let Some(ref prefix) = args.ipv6_prefix {
net = net.with_ipv6_prefix(prefix.clone());
}
net.preflight_check()
.context("routed mode preflight check failed")?;
if !port_mappings.is_empty() {
let loopback_ip = state_manager
.allocate_loopback_ip(&mut vm_state)
Expand Down Expand Up @@ -813,7 +818,9 @@ pub async fn prepare_vm(mut args: RunArgs) -> Result<Option<VmContext>> {
None
};

// Start egress proxy for rootless mode (bypasses TAP/bridge for outbound TCP)
// Start egress proxy for rootless mode only.
// Routed mode uses native IPv6 kernel routing — no proxy needed.
// Services use mutual TLS with client certs, not source IP matching.
let egress_proxy_handle = if matches!(args.network, NetworkMode::Rootless) {
let socket_path = vsock_socket_path.clone();
Some(tokio::spawn(async move {
Expand Down Expand Up @@ -1161,6 +1168,7 @@ mod tests {
rootfs_type: None,
non_blocking_output: false,
label: vec![],
ipv6_prefix: None,
image: "alpine:latest".to_string(),
command_args: vec![],
}
Expand Down
1 change: 1 addition & 0 deletions src/commands/serve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ async fn create_sandbox(
publish: vec![],
balloon: None,
network: crate::cli::NetworkMode::Rootless,
ipv6_prefix: None,
health_check: None,
health_check_timeout: 5,
privileged: false,
Expand Down
44 changes: 36 additions & 8 deletions src/commands/snapshot.rs
Original file line number Diff line number Diff line change
Expand Up @@ -643,7 +643,7 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
// stays stuck reading from the old (dead) connection after VM resume resets vsock.
let output_reconnect = Arc::new(tokio::sync::Notify::new());
// Channel to know when fc-agent's output connection arrives (gates health monitor)
let (output_connected_tx, output_connected_rx) = tokio::sync::oneshot::channel();
let (output_connected_tx, mut output_connected_rx) = tokio::sync::oneshot::channel();
let output_handle = if !tty_mode {
let socket_path = output_socket_path.clone();
let vm_id_clone = vm_id.clone();
Expand Down Expand Up @@ -673,7 +673,7 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
// Network mode inherited from snapshot metadata
let network_mode = snapshot_config.metadata.network_mode;

// Start egress proxy for rootless mode (bypasses TAP/bridge for outbound TCP)
// Start egress proxy for rootless mode only
let _egress_proxy_handle = if matches!(network_mode, FcNetworkMode::Rootless) {
let socket_path = clone_vsock_base.clone();
Some(tokio::spawn(async move {
Expand Down Expand Up @@ -726,9 +726,13 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
Box::new(net)
}
FcNetworkMode::Routed => {
RoutedNetwork::preflight_check().context("routed mode preflight check failed")?;
let mut net =
RoutedNetwork::new(vm_id.clone(), tap_device.clone(), port_mappings.clone());
if let Some(ref prefix) = snapshot_config.metadata.ipv6_prefix {
net = net.with_ipv6_prefix(prefix.clone());
}
net.preflight_check()
.context("routed mode preflight check failed")?;
if !port_mappings.is_empty() {
let loopback_ip = state_manager
.allocate_loopback_ip(&mut vm_state)
Expand Down Expand Up @@ -790,6 +794,7 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
vm_state.config.user = snapshot_config.metadata.user.clone();
vm_state.config.port_mappings = port_mappings;
vm_state.config.network_mode = network_mode;
vm_state.config.ipv6_prefix = snapshot_config.metadata.ipv6_prefix.clone();
vm_state.config.tty = tty_mode;
vm_state.config.interactive = interactive;

Expand Down Expand Up @@ -1098,12 +1103,35 @@ pub async fn cmd_snapshot_run(args: SnapshotRunArgs) -> Result<()> {
// exec_rebind → exec_re_register → rebind_done → output.reconnect() → HERE
// Without this gate, the health monitor could start exec calls before
// the exec server has re-registered its AsyncFd after restore.
// No timeout — after snapshot restore, the VM may be CPU-starved (HHVM, EdenFS,
// falcon all resume simultaneously) and fc-agent's MMDS poll + restore handler
// can take minutes. Proceeding early causes exec failures, so waiting is correct.
// We still poll VM liveness every 5 seconds so we don't hang forever if Firecracker crashes.
if !tty_mode {
match tokio::time::timeout(std::time::Duration::from_secs(30), output_connected_rx).await {
Ok(Ok(())) => info!(vm_id = %vm_id, "fc-agent output connected, exec server ready"),
Ok(Err(_)) => warn!(vm_id = %vm_id, "output connected_tx dropped"),
Err(_) => {
warn!(vm_id = %vm_id, "fc-agent did not connect within 30s, proceeding anyway")
let mut liveness_interval = tokio::time::interval(std::time::Duration::from_secs(5));
liveness_interval.tick().await; // consume immediate first tick
loop {
tokio::select! {
result = &mut output_connected_rx => {
match result {
Ok(()) => info!(vm_id = %vm_id, "fc-agent output connected, exec server ready"),
Err(_) => warn!(vm_id = %vm_id, "output connected_tx dropped"),
}
break;
}
_ = liveness_interval.tick() => {
match vm_manager.try_wait() {
Ok(Some(status)) => {
warn!(vm_id = %vm_id, ?status, "VM exited before fc-agent connected");
break;
}
Ok(None) => {} // still running
Err(e) => {
warn!(vm_id = %vm_id, error = %e, "VM liveness check failed");
break;
}
}
}
}
}
}
Expand Down
Loading