Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 93 additions & 6 deletions scripts/brev-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,64 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
export NEEDRESTART_MODE=a
export DEBIAN_FRONTEND=noninteractive

# ── GPU detection ─────────────────────────────────────────────────
# Brev GPU images ship the nvidia-smi binary even on CPU-only instances,
# so finding it on PATH proves nothing. nvidia-smi must actually *run*
# (driver loaded) for this host to count as a GPU host.
# HAS_GPU is "true" or "false"; it is exported so that every later section
# of this script, and any child script it launches, shares this one check.
if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; then
  HAS_GPU=true
else
  HAS_GPU=false
fi
export HAS_GPU
info "GPU detected: $HAS_GPU"

# ── Acquire exclusive apt access ──────────────────────────────────
# Cloud VMs run multiple apt processes on boot: cloud-init, apt-daily,
# and unattended-upgrades. We must wait for cloud-init AND disable the
# background services before we can safely use apt ourselves.
# Both limits can be overridden from the environment; defaults are unchanged.
APT_WAIT_MAX="${APT_WAIT_MAX:-300}"       # seconds — fail if apt is still busy after this long
APT_QUIET_WINDOW="${APT_QUIET_WINDOW:-5}" # seconds — apt must be idle this long before we proceed
# Lock files apt/dpkg hold while working (dpkg's own lock included).
APT_LOCK_FILES=(
  /var/lib/dpkg/lock
  /var/lib/dpkg/lock-frontend
  /var/lib/apt/lists/lock
  /var/cache/apt/archives/lock
)

# Returns 0 while an apt/dpkg process is running or any lock file is held,
# non-zero once everything is idle.
apt_is_busy() {
  # pgrep patterns are already extended regular expressions; -x anchors the
  # match to the whole process name, so no extra regex flag is needed.
  if pgrep -x "apt-get|apt|dpkg" >/dev/null 2>&1; then
    return 0
  fi
  # sudo: an unprivileged fuser cannot inspect file handles owned by root,
  # which is exactly who holds these locks. stdout is silenced because fuser
  # prints the holder PIDs there.
  sudo fuser "${APT_LOCK_FILES[@]}" >/dev/null 2>&1
}

if command -v cloud-init >/dev/null 2>&1; then
  info "Waiting for cloud-init to finish..."
  if cloud-init status --wait >/dev/null 2>&1; then
    info "cloud-init done"
  else
    warn "cloud-init exited with an error — proceeding, but the VM may be in an inconsistent state"
  fi
fi

# Stop background apt services that run independently of cloud-init.
# The .service units are stopped as well as the timers so a run that has
# already started gets a chance to shut down cleanly.
info "Disabling background apt services..."
sudo systemctl stop apt-daily.timer apt-daily-upgrade.timer \
  apt-daily.service apt-daily-upgrade.service unattended-upgrades 2>/dev/null || true
sudo systemctl disable apt-daily.timer apt-daily-upgrade.timer 2>/dev/null || true

# Terminate any straggler apt/dpkg processes: ask politely first, then force.
# Failures are ignored on purpose — "no such process" is the normal case.
for apt_proc in apt-get apt dpkg; do
  sudo pkill -TERM -x "$apt_proc" 2>/dev/null || true
done
sleep 2
for apt_proc in apt-get apt dpkg; do
  sudo pkill -KILL -x "$apt_proc" 2>/dev/null || true
done

# Release any stale locks left by killed processes, then let dpkg finish
# whatever an interrupted run left half-configured.
sudo rm -f "${APT_LOCK_FILES[@]}" 2>/dev/null || true
sudo dpkg --configure -a 2>/dev/null || true

# Wait until apt/dpkg are truly idle for $APT_QUIET_WINDOW consecutive seconds.
apt_waited=0 # total seconds spent waiting
apt_quiet=0  # consecutive idle seconds observed so far
while [ "$apt_quiet" -lt "$APT_QUIET_WINDOW" ]; do
  if [ "$apt_waited" -ge "$APT_WAIT_MAX" ]; then
    fail "apt/dpkg still busy after ${APT_WAIT_MAX}s — cannot proceed"
  fi
  if apt_is_busy; then
    info "Waiting for apt/dpkg processes to finish... (${apt_waited}s elapsed)"
    apt_quiet=0 # any activity restarts the quiet window
    sleep 2
    apt_waited=$((apt_waited + 2))
  else
    apt_quiet=$((apt_quiet + 1))
    sleep 1
    apt_waited=$((apt_waited + 1))
  fi
done
info "apt is now exclusively ours"

# --- 0. Node.js (needed for services) ---
if ! command -v node >/dev/null 2>&1; then
info "Installing Node.js..."
Expand All @@ -55,9 +113,16 @@ if ! command -v node >/dev/null 2>&1; then
else
fail "No SHA-256 verification tool found (need sha256sum or shasum)"
fi
sudo -E bash "$tmpdir/setup_node.sh" >/dev/null 2>&1
sudo -E bash "$tmpdir/setup_node.sh"
)
sudo apt-get install -y -qq nodejs >/dev/null 2>&1
for attempt in 1 2 3 4 5; do
if sudo apt-get install -y -qq nodejs; then
break
fi
[ "$attempt" -eq 5 ] && fail "Node.js install failed after 5 attempts"
info "Node.js install failed (attempt $attempt/5), retrying in 30s..."
sleep 30
done
info "Node.js $(node --version) installed"
else
info "Node.js already installed: $(node --version)"
Expand All @@ -66,16 +131,22 @@ fi
# --- 1. Docker ---
if ! command -v docker >/dev/null 2>&1; then
info "Installing Docker..."
sudo apt-get update -qq >/dev/null 2>&1
sudo apt-get install -y -qq docker.io >/dev/null 2>&1
for attempt in 1 2 3 4 5; do
if sudo apt-get update -qq && sudo apt-get install -y -qq docker.io; then
break
fi
[ "$attempt" -eq 5 ] && fail "Docker install failed after 5 attempts"
info "Docker install failed (attempt $attempt/5), retrying in 30s..."
sleep 30
done
sudo usermod -aG docker "$(whoami)"
info "Docker installed"
else
info "Docker already installed"
fi

# --- 2. NVIDIA Container Toolkit (if GPU present) ---
if command -v nvidia-smi >/dev/null 2>&1; then
if [ "$HAS_GPU" = true ]; then
if ! dpkg -s nvidia-container-toolkit >/dev/null 2>&1; then
info "Installing NVIDIA Container Toolkit..."
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
Expand All @@ -91,6 +162,22 @@ if command -v nvidia-smi >/dev/null 2>&1; then
else
info "NVIDIA Container Toolkit already installed"
fi
else
# CPU-only instance: ensure Docker uses runc (not nvidia) as default runtime.
# Brev GPU images pre-configure nvidia as the default Docker runtime even on
# CPU instances, causing "nvidia-container-cli: nvml error: driver not loaded"
# when starting containers.
if grep -q '"default-runtime".*nvidia' /etc/docker/daemon.json 2>/dev/null; then
info "Resetting Docker default runtime to runc (no GPU detected)..."
sudo python3 -c "
import json
with open('/etc/docker/daemon.json') as f: cfg = json.load(f)
cfg.pop('default-runtime', None)
with open('/etc/docker/daemon.json', 'w') as f: json.dump(cfg, f, indent=2)
"
sudo systemctl restart docker
info "Docker runtime reset to runc"
fi
fi

# --- 3. openshell CLI (binary release, not pip) ---
Expand Down Expand Up @@ -139,7 +226,7 @@ fi
VLLM_MODEL="nvidia/nemotron-3-nano-30b-a3b"
if [ "${SKIP_VLLM:-}" = "1" ]; then
info "Skipping vLLM install (SKIP_VLLM=1)"
elif command -v nvidia-smi >/dev/null 2>&1; then
elif [ "$HAS_GPU" = true ]; then
if ! python3 -c "import vllm" 2>/dev/null; then
info "Installing vLLM..."
if ! command -v pip3 >/dev/null 2>&1; then
Expand Down
3 changes: 2 additions & 1 deletion scripts/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ info "Starting OpenShell gateway..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | grep . && docker volume ls -q --filter "name=openshell-cluster-nemoclaw" | xargs docker volume rm || true
GATEWAY_ARGS=(--name nemoclaw)
command -v nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
# Only enable GPU if nvidia-smi actually works (driver loaded), not just present on PATH
nvidia-smi >/dev/null 2>&1 && GATEWAY_ARGS+=(--gpu)
if ! openshell gateway start "${GATEWAY_ARGS[@]}" 2>&1 | grep -E "Gateway|✓|Error|error"; then
warn "Gateway start failed. Cleaning up stale state..."
openshell gateway destroy -g nemoclaw >/dev/null 2>&1 || true
Expand Down
47 changes: 27 additions & 20 deletions test/e2e/brev-e2e.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
*
* Optional env vars:
* TEST_SUITE — which test to run: full (default), credential-sanitization, telegram-injection, all
* BREV_MIN_VCPU — Minimum vCPUs for CPU instance (default: 4)
* BREV_MIN_RAM — Minimum RAM in GB for CPU instance (default: 16)
* BREV_INSTANCE_TYPE — Brev/GCP instance type (default: n2-standard-4)
*/

import { describe, it, expect, beforeAll, afterAll } from "vitest";
Expand All @@ -28,8 +27,8 @@ import { homedir } from "node:os";
import path from "node:path";

// CPU instance specs: min vCPUs and RAM for the instance search
const BREV_MIN_VCPU = parseInt(process.env.BREV_MIN_VCPU || "4", 10);
const BREV_MIN_RAM = parseInt(process.env.BREV_MIN_RAM || "16", 10);
// Use a known CPU-only GCP instance type to avoid GPU images with broken nvidia runtime
const BREV_INSTANCE_TYPE = process.env.BREV_INSTANCE_TYPE || "n2-standard-4";
const INSTANCE_NAME = process.env.INSTANCE_NAME;
const TEST_SUITE = process.env.TEST_SUITE || "full";
const REPO_DIR = path.resolve(import.meta.dirname, "../..");
Expand Down Expand Up @@ -58,19 +57,15 @@ function ssh(cmd, { timeout = 120_000, stream = false } = {}) {
return stream ? "" : result.trim();
}

/**
* Escape a value for safe inclusion in a single-quoted shell string.
* Replaces single quotes with the shell-safe sequence: '\''
*/
function shellEscape(value) {
return String(value).replace(/'/g, "'\\''");
/**
 * Quote a value as a single-quoted POSIX shell word.
 *
 * Each embedded single quote is rewritten as '\'' (close the quote, emit an
 * escaped quote, reopen the quote). null and undefined become the empty
 * word '' instead of the literal text "null"/"undefined", so an unset
 * environment variable is never exported as a bogus value.
 *
 * @param {unknown} value - Value to quote; non-strings are stringified.
 * @returns {string} The value wrapped in single quotes, safe to splice into a shell command.
 */
function shellQuote(value) {
  const text = value == null ? "" : String(value);
  return `'${text.replace(/'/g, "'\\''")}'`;
}

/** Run a command on the remote VM with env vars set for NemoClaw. */
function sshEnv(cmd, { timeout = 600_000, stream = false } = {}) {
const envPrefix = [
`export NVIDIA_API_KEY='${shellEscape(process.env.NVIDIA_API_KEY)}'`,
`export GITHUB_TOKEN='${shellEscape(process.env.GITHUB_TOKEN)}'`,
`export NVIDIA_API_KEY=${shellQuote(process.env.NVIDIA_API_KEY)}`,
`export GITHUB_TOKEN=${shellQuote(process.env.GITHUB_TOKEN)}`,
`export NEMOCLAW_NON_INTERACTIVE=1`,
`export NEMOCLAW_SANDBOX_NAME=e2e-test`,
].join(" && ");
Expand Down Expand Up @@ -131,14 +126,21 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
brev("login", "--token", process.env.BREV_API_TOKEN);

// Create bare CPU instance via brev search cpu | brev create
console.log(`[${elapsed()}] Creating CPU instance via brev search cpu | brev create...`);
console.log(`[${elapsed()}] min-vcpu: ${BREV_MIN_VCPU}, min-ram: ${BREV_MIN_RAM}GB`);
execSync(
`brev search cpu --min-vcpu ${BREV_MIN_VCPU} --min-ram ${BREV_MIN_RAM} --sort price | ` +
`brev create ${INSTANCE_NAME} --detached`,
{ encoding: "utf-8", timeout: 180_000, stdio: ["pipe", "inherit", "inherit"] },
);
// Delete any leftover instance from a previous failed run
try {
brev("delete", INSTANCE_NAME);
console.log(`[${elapsed()}] Deleted leftover instance "${INSTANCE_NAME}"`);
} catch {
// Expected — no leftover instance
}

// Create CPU instance with a known GCP instance type
console.log(`[${elapsed()}] Creating CPU instance (type: ${BREV_INSTANCE_TYPE})...`);
execFileSync("brev", ["create", "--type", BREV_INSTANCE_TYPE, INSTANCE_NAME, "--detached"], {
encoding: "utf-8",
timeout: 180_000,
stdio: ["pipe", "inherit", "inherit"],
});
instanceCreated = true;
console.log(`[${elapsed()}] brev create returned (instance provisioning in background)`);

Expand All @@ -161,6 +163,11 @@ describe.runIf(hasRequiredVars)("Brev E2E", () => {
);
console.log(`[${elapsed()}] Code synced`);

// Wait for cloud-init to finish — Brev instances run apt provisioning on boot
console.log(`[${elapsed()}] Waiting for cloud-init to finish...`);
ssh(`cloud-init status --wait 2>/dev/null || true`, { timeout: 600_000, stream: true });
console.log(`[${elapsed()}] cloud-init done`);

// Bootstrap VM — stream output to CI log so we can see progress
console.log(`[${elapsed()}] Running brev-setup.sh (bootstrap)...`);
sshEnv(`cd ${remoteDir} && SKIP_VLLM=1 bash scripts/brev-setup.sh`, {
Expand Down
Loading