From b8cdbba114115f4e2e105012413aebdc22641f61 Mon Sep 17 00:00:00 2001 From: businesscurry123 Date: Sat, 16 May 2026 01:37:52 +0900 Subject: [PATCH] Add safe NVIDIA driver installer --- Makefile | 1 + packages/cx-gpu-nvidia/debian/control | 15 +- packages/cx-gpu-nvidia/debian/rules | 13 + .../usr/sbin/cx-nvidia-safe-install | 698 ++++++++++++++++++ .../doc/cx-gpu-nvidia/nvidia-safe-install.md | 79 ++ tests/cx-nvidia-safe-install-tests.sh | 266 +++++++ 6 files changed, 1070 insertions(+), 2 deletions(-) create mode 100755 packages/cx-gpu-nvidia/usr/sbin/cx-nvidia-safe-install create mode 100644 packages/cx-gpu-nvidia/usr/share/doc/cx-gpu-nvidia/nvidia-safe-install.md create mode 100755 tests/cx-nvidia-safe-install-tests.sh diff --git a/Makefile b/Makefile index b5862e3..4121d28 100644 --- a/Makefile +++ b/Makefile @@ -157,6 +157,7 @@ sbom: # Run tests test: @echo -e "$(GREEN)Running build verification tests...$(NC)" + bash tests/cx-nvidia-safe-install-tests.sh ./tests/verify-iso.sh $(OUTPUT_DIR)/$(ISO_NAME)-offline.iso || true ./tests/verify-packages.sh || true ./tests/verify-preseed.sh || true diff --git a/packages/cx-gpu-nvidia/debian/control b/packages/cx-gpu-nvidia/debian/control index 594f3f0..2082bc2 100644 --- a/packages/cx-gpu-nvidia/debian/control +++ b/packages/cx-gpu-nvidia/debian/control @@ -12,12 +12,21 @@ Rules-Requires-Root: no Package: cx-gpu-nvidia Architecture: amd64 Depends: ${misc:Depends}, - cx-core + cx-core, + apt, + bash, + dpkg, + kmod, + pciutils Recommends: + dkms, + mokutil, nvidia-driver, nvidia-kernel-dkms | nvidia-kernel-open-dkms, nvidia-smi, nvidia-settings, + mesa-utils, + ubuntu-drivers-common, nvidia-cuda-toolkit, nvidia-container-toolkit, libnvidia-ml1, @@ -33,6 +42,8 @@ Description: CX Linux NVIDIA GPU enablement . 
Features: - NVIDIA driver (proprietary or open kernel modules) + - Safe driver installer with package snapshot and rollback + - Post-install validation for nvidia-smi, DKMS, kernel modules, and OpenGL - CUDA compute support - Container GPU passthrough (nvidia-container-toolkit) - GPU monitoring (nvidia-smi) @@ -40,7 +51,7 @@ Description: CX Linux NVIDIA GPU enablement For Secure Boot systems: - Run 'cx gpu mok-setup' to configure MOK keys - Reboot and enroll keys in UEFI - - Then install this package + - Then run 'cx-nvidia-safe-install install' . Supported GPU architectures: - Ada Lovelace (H100, L40, RTX 40xx) diff --git a/packages/cx-gpu-nvidia/debian/rules b/packages/cx-gpu-nvidia/debian/rules index cbe925d..baf68c6 100755 --- a/packages/cx-gpu-nvidia/debian/rules +++ b/packages/cx-gpu-nvidia/debian/rules @@ -1,3 +1,16 @@ #!/usr/bin/make -f +export DH_VERBOSE=1 + %: dh $@ + +override_dh_auto_install: + install -d $(CURDIR)/debian/cx-gpu-nvidia/usr/sbin + install -m 755 usr/sbin/cx-nvidia-safe-install \ + $(CURDIR)/debian/cx-gpu-nvidia/usr/sbin/cx-nvidia-safe-install + install -d $(CURDIR)/debian/cx-gpu-nvidia/usr/share/doc/cx-gpu-nvidia + install -m 644 usr/share/doc/cx-gpu-nvidia/nvidia-safe-install.md \ + $(CURDIR)/debian/cx-gpu-nvidia/usr/share/doc/cx-gpu-nvidia/nvidia-safe-install.md + +override_dh_auto_clean: + @echo "No clean step required" diff --git a/packages/cx-gpu-nvidia/usr/sbin/cx-nvidia-safe-install b/packages/cx-gpu-nvidia/usr/sbin/cx-nvidia-safe-install new file mode 100755 index 0000000..aad5175 --- /dev/null +++ b/packages/cx-gpu-nvidia/usr/sbin/cx-nvidia-safe-install @@ -0,0 +1,698 @@ +#!/usr/bin/env bash +# Safe NVIDIA driver installer for CX Linux. 
+# Copyright 2026 AI Venture Holdings LLC +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +PROGRAM="$(basename "$0")" +VERSION="0.1.0" + +STATE_DIR="${CX_NVIDIA_STATE_DIR:-/var/lib/cx/nvidia-installer}" +LOG_FILE="${CX_NVIDIA_LOG_FILE:-/var/log/cx/nvidia-safe-installer.log}" +DRY_RUN=0 +ASSUME_YES=0 +NO_UPDATE=0 +FORCE=0 +STRICT_TAINT=0 +SKIP_OPENGL=0 +TARGET_DRIVER="" +SNAPSHOT_ID="" +TAINT_FILE="${CX_NVIDIA_TAINT_FILE:-/proc/sys/kernel/tainted}" +APT_DPKG_OPTIONS=( + -o "Dpkg::Options::=--force-confdef" + -o "Dpkg::Options::=--force-confold" +) + +usage() { + cat <&2 + if mkdir -p "$(dirname "$LOG_FILE")" 2>/dev/null; then + printf '%s\n' "$line" >> "$LOG_FILE" 2>/dev/null || true + fi +} + +info() { log_line INFO "$@"; } +warn() { log_line WARN "$@"; } +error() { log_line ERROR "$@"; } + +die() { + error "$@" + exit 1 +} + +cmd_exists() { + command -v "$1" >/dev/null 2>&1 +} + +quote_cmd() { + printf '%q ' "$@" +} + +run_cmd() { + local rendered + rendered="$(quote_cmd "$@")" + if [ "$DRY_RUN" -eq 1 ]; then + info "DRY-RUN: $rendered" + return 0 + fi + info "RUN: $rendered" + "$@" +} + +require_root_for_changes() { + if [ "$DRY_RUN" -eq 1 ] || [ "${CX_NVIDIA_SKIP_ROOT:-0}" = "1" ]; then + return 0 + fi + if [ "$(id -u)" -ne 0 ]; then + die "install and rollback must run as root. Re-run with sudo, or use --dry-run." 
+ fi +} + +ensure_state_dirs() { + mkdir -p "$STATE_DIR/snapshots" "$STATE_DIR/tmp" +} + +parse_args() { + while [ "$#" -gt 0 ]; do + case "$1" in + --driver) + [ "$#" -ge 2 ] || die "--driver requires a package name" + TARGET_DRIVER="$2" + shift 2 + ;; + --snapshot) + [ "$#" -ge 2 ] || die "--snapshot requires an ID or path" + SNAPSHOT_ID="$2" + shift 2 + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + --yes|-y) + ASSUME_YES=1 + shift + ;; + --no-update) + NO_UPDATE=1 + shift + ;; + --force) + FORCE=1 + shift + ;; + --strict-taint) + STRICT_TAINT=1 + shift + ;; + --skip-opengl) + SKIP_OPENGL=1 + shift + ;; + --help|-h) + usage + exit 0 + ;; + *) + die "unknown option: $1" + ;; + esac + done +} + +detect_gpu() { + if cmd_exists nvidia-smi && nvidia-smi -L >/dev/null 2>&1; then + nvidia-smi -L + return 0 + fi + + if cmd_exists lspci; then + local pci + pci="$(lspci -nn 2>/dev/null | grep -i 'nvidia' || true)" + if [ -n "$pci" ]; then + printf '%s\n' "$pci" + return 0 + fi + fi + + return 1 +} + +current_driver_version() { + if cmd_exists nvidia-smi; then + local smi_version + smi_version="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n 1 || true)" + if [ -n "$smi_version" ]; then + printf '%s\n' "$smi_version" + return 0 + fi + fi + + if cmd_exists dpkg-query; then + dpkg-query -W -f='${binary:Package} ${Version}\n' 'nvidia-driver*' 2>/dev/null \ + | sort -V \ + | tail -n 1 \ + || true + fi +} + +recommend_driver() { + if cmd_exists ubuntu-drivers; then + local recommended + recommended="$(ubuntu-drivers devices 2>/dev/null \ + | awk -F': ' '/recommended/ { split($2, p, " "); print p[1]; exit }' \ + || true)" + if [ -n "$recommended" ]; then + printf '%s\n' "$recommended" + return 0 + fi + fi + + if cmd_exists apt-cache; then + local newest + newest="$(apt-cache search '^nvidia-driver-[0-9]+' 2>/dev/null \ + | awk '{print $1}' \ + | sort -V \ + | tail -n 1 \ + || true)" + if [ -n "$newest" ]; then + printf '%s\n' "$newest" + return 0 + 
fi + fi + + printf '%s\n' "nvidia-driver" +} + +check_nvidia_gpu_or_force() { + local gpu + if gpu="$(detect_gpu)"; then + info "Detected NVIDIA hardware:" + printf '%s\n' "$gpu" >&2 + return 0 + fi + + if [ "$FORCE" -eq 1 ]; then + warn "No NVIDIA GPU was detected, but --force was supplied." + return 0 + fi + + die "No NVIDIA GPU detected. Use --force only for image-building or controlled test systems." +} + +check_kernel_compatibility() { + local kernel="$1" + local problems=0 + + info "Checking running kernel compatibility for $kernel" + + if [ ! -d "/lib/modules/$kernel" ]; then + warn "Missing /lib/modules/$kernel; DKMS modules may not build for the running kernel." + problems=$((problems + 1)) + fi + + if [ ! -e "/lib/modules/$kernel/build" ]; then + if cmd_exists dpkg-query && dpkg-query -W "linux-headers-$kernel" >/dev/null 2>&1; then + info "Found linux-headers-$kernel package." + else + warn "Kernel headers for $kernel were not found." + problems=$((problems + 1)) + fi + else + info "Kernel header build link exists for $kernel." + fi + + if cmd_exists dkms; then + info "DKMS is available." + else + warn "dkms command not found; NVIDIA module rebuild state cannot be validated." + problems=$((problems + 1)) + fi + + if [ "$problems" -gt 0 ] && [ "$FORCE" -ne 1 ]; then + die "Kernel compatibility checks found $problems problem(s). Install matching headers/DKMS or re-run with --force." + fi +} + +check_secure_boot() { + if ! cmd_exists mokutil; then + warn "mokutil is not installed; Secure Boot state cannot be checked." + return 0 + fi + + local sb_state + sb_state="$(mokutil --sb-state 2>/dev/null || true)" + if printf '%s\n' "$sb_state" | grep -qi 'enabled'; then + if [ "$FORCE" -eq 1 ]; then + warn "Secure Boot appears enabled. Continuing because --force was supplied." + return 0 + fi + die "Secure Boot appears enabled. Enroll MOK keys or pass --force after confirming signed module handling." 
+ fi + + if [ -n "$sb_state" ]; then + info "$sb_state" + else + warn "mokutil did not report Secure Boot state." + fi +} + +preflight_apt() { + local target="$1" + + cmd_exists apt-cache || die "apt-cache is required for package compatibility checks." + cmd_exists apt-get || die "apt-get is required to install or roll back NVIDIA packages." + + if ! apt-cache policy "$target" 2>/dev/null \ + | awk '/Candidate:/ && $2 != "(none)" { found=1 } END { exit(found ? 0 : 1) }'; then + die "No install candidate found for $target." + fi + + info "APT candidate exists for $target." + info "RUN: apt-get -s install $target" + apt-get -s install "$target" +} + +capture_file() { + local dest="$1" + shift + { + "$@" 2>&1 || true + } > "$dest" +} + +snapshot_dpkg_nvidia() { + if cmd_exists dpkg-query; then + dpkg-query -W -f='${binary:Package}\t${Version}\t${db:Status-Abbrev}\n' \ + 'nvidia*' 'libnvidia*' 'cuda*' 'xserver-xorg-video-nvidia*' 'firmware-nvidia*' \ + 2>/dev/null || true + fi +} + +snapshot_dpkg_selections() { + if cmd_exists dpkg; then + dpkg --get-selections 2>/dev/null || true + fi +} + +snapshot_apt_mark_manual() { + if cmd_exists apt-mark; then + apt-mark showmanual 2>/dev/null || true + fi +} + +snapshot_apt_mark_auto() { + if cmd_exists apt-mark; then + apt-mark showauto 2>/dev/null || true + fi +} + +snapshot_dkms() { + if cmd_exists dkms; then + dkms status 2>/dev/null || true + fi +} + +snapshot_modules() { + if cmd_exists lsmod; then + lsmod 2>/dev/null | grep -E '^(nvidia|nouveau)' || true + fi +} + +create_snapshot() { + local target="$1" + local snapshot_id + local snapshot_path + local kernel + local current_driver + + ensure_state_dirs + snapshot_id="$(date -u +%Y%m%dT%H%M%SZ)-${RANDOM}${RANDOM}" + snapshot_path="$STATE_DIR/snapshots/$snapshot_id" + kernel="$(uname -r)" + current_driver="$(current_driver_version || true)" + + mkdir -p "$snapshot_path/etc-modprobe.d" + + cat > "$snapshot_path/meta.env" </dev/null || true + fi + + printf '%s\n' 
"$snapshot_id" > "$STATE_DIR/latest"
+    info "Created NVIDIA driver snapshot: $snapshot_path"
+    printf '%s\n' "$snapshot_path"
+}
+
+# Resolve a snapshot reference ($1) to a directory path and print it.
+# An empty reference means "the snapshot named in $STATE_DIR/latest".
+# A reference that is itself a directory is used as-is; otherwise it is
+# looked up under $STATE_DIR/snapshots. Dies when nothing matches.
+snapshot_path_from_id() {
+    local id="$1"
+    if [ -z "$id" ]; then
+        [ -f "$STATE_DIR/latest" ] || die "No latest snapshot is recorded under $STATE_DIR."
+        id="$(cat "$STATE_DIR/latest")"
+        # An empty pointer file would otherwise resolve to the snapshots/
+        # directory itself in the lookup below.
+        [ -n "$id" ] || die "Latest snapshot pointer is empty: $STATE_DIR/latest"
+    fi
+
+    if [ -d "$id" ]; then
+        printf '%s\n' "$id"
+        return 0
+    fi
+
+    if [ -d "$STATE_DIR/snapshots/$id" ]; then
+        printf '%s\n' "$STATE_DIR/snapshots/$id"
+        return 0
+    fi
+
+    die "Snapshot not found: $id"
+}
+
+# Print "package<TAB>version" for every currently installed (status ii*)
+# package reported by snapshot_dpkg_nvidia.
+list_installed_nvidia_packages() {
+    snapshot_dpkg_nvidia | awk -F '\t' '$3 ~ /^ii/ { print $1 "\t" $2 }'
+}
+
+# Return 0 when package $2 is recorded as installed (status ii*) in the
+# dpkg-nvidia.txt of snapshot directory $1, non-zero otherwise.
+snapshot_contains_installed_pkg() {
+    local snapshot_path="$1"
+    local pkg="$2"
+    awk -F '\t' -v pkg="$pkg" '$1 == pkg && $3 ~ /^ii/ { found=1 } END { exit(found ? 0 : 1) }' \
+        "$snapshot_path/dpkg-nvidia.txt"
+}
+
+# Re-apply "manual" apt marks recorded in the snapshot ($1) for NVIDIA/CUDA
+# related packages. A missing mark file or missing apt-mark is not an error.
+restore_manual_marks() {
+    local snapshot_path="$1"
+    [ -f "$snapshot_path/apt-mark-manual.txt" ] || return 0
+    cmd_exists apt-mark || return 0
+
+    local pkg
+    while IFS= read -r pkg; do
+        case "$pkg" in
+            nvidia*|libnvidia*|cuda*|xserver-xorg-video-nvidia*|firmware-nvidia*)
+                run_cmd apt-mark manual "$pkg"
+                ;;
+        esac
+    done < "$snapshot_path/apt-mark-manual.txt"
+}
+
+# Roll the NVIDIA/CUDA package set back to the state recorded in snapshot
+# directory $1:
+#   1. purge packages installed now that the snapshot did not list;
+#   2. reinstall every package the snapshot lists as installed, pinned to
+#      its recorded version (downgrades allowed);
+#   3. restore manual apt marks and refresh the initramfs when possible.
+# Requires root unless dry-run; dies when the snapshot lacks dpkg-nvidia.txt.
+rollback_snapshot() {
+    local snapshot_path="$1"
+    local remove_pkgs=()
+    local restore_specs=()
+    local pkg
+    local version
+    local status
+
+    require_root_for_changes
+    [ -f "$snapshot_path/dpkg-nvidia.txt" ] || die "Snapshot is missing dpkg-nvidia.txt: $snapshot_path"
+
+    # Packages present now but absent from the snapshot were added by the
+    # failed install and must go.
+    while IFS=$'\t' read -r pkg version; do
+        [ -n "$pkg" ] || continue
+        if ! snapshot_contains_installed_pkg "$snapshot_path" "$pkg"; then
+            remove_pkgs+=("$pkg")
+        fi
+    done < <(list_installed_nvidia_packages)
+
+    # Packages the snapshot recorded as installed are restored as pkg=version.
+    while IFS=$'\t' read -r pkg version status; do
+        [ -n "${pkg:-}" ] || continue
+        if [[ "$status" == ii* ]] && [ -n "${version:-}" ]; then
+            restore_specs+=("${pkg}=${version}")
+        fi
+    done < "$snapshot_path/dpkg-nvidia.txt"
+
+    if [ "${#remove_pkgs[@]}" -gt 0 ]; then
+        info "Rolling back newly installed NVIDIA/CUDA packages: ${remove_pkgs[*]}"
+        run_cmd env DEBIAN_FRONTEND=noninteractive apt-get purge -y "${APT_DPKG_OPTIONS[@]}" "${remove_pkgs[@]}"
+    else
+        info "No newly installed NVIDIA/CUDA packages need purging."
+    fi
+
+    if [ "${#restore_specs[@]}" -gt 0 ]; then
+        info "Restoring NVIDIA/CUDA package versions from snapshot."
+        run_cmd env DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-downgrades "${APT_DPKG_OPTIONS[@]}" "${restore_specs[@]}"
+    else
+        info "Snapshot had no previously installed NVIDIA/CUDA packages to restore."
+    fi
+
+    restore_manual_marks "$snapshot_path"
+
+    if cmd_exists update-initramfs; then
+        run_cmd update-initramfs -u
+    fi
+
+    info "Rollback completed from snapshot: $snapshot_path"
+}
+
+# Post-install validation. Returns 0 when no hard check failed.
+# Hard failures: nvidia-smi not responding, modprobe unable to resolve the
+# nvidia module, DKMS reporting a failed NVIDIA state, unexpected taint bits
+# under --strict-taint, and a failed OpenGL renderer query.
+# Everything else is reported as a warning only.
+validate_driver() {
+    local failed=0
+
+    info "Running NVIDIA post-install validation."
+
+    if cmd_exists nvidia-smi && nvidia-smi >/dev/null 2>&1; then
+        info "nvidia-smi responded successfully."
+    else
+        error "nvidia-smi did not respond."
+        failed=1
+    fi
+
+    if cmd_exists modprobe; then
+        if modprobe -n nvidia >/dev/null 2>&1; then
+            info "nvidia kernel module can be resolved by modprobe."
+        else
+            error "modprobe cannot resolve the nvidia kernel module."
+            failed=1
+        fi
+    else
+        warn "modprobe is unavailable; skipping module resolution check."
+    fi
+
+    if cmd_exists lsmod; then
+        # Not 'grep -q': it exits on the first match, and under 'pipefail'
+        # the resulting SIGPIPE in lsmod would make a successful match
+        # look like a failure.
+        if lsmod 2>/dev/null | grep '^nvidia' >/dev/null; then
+            info "nvidia kernel module is loaded."
+        else
+            warn "nvidia module is not currently loaded; a reboot or display-manager restart may still be required."
+        fi
+    fi
+
+    if cmd_exists dkms; then
+        local dkms_output
+        dkms_output="$(dkms status 2>/dev/null || true)"
+        if printf '%s\n' "$dkms_output" | grep -Ei 'nvidia' | grep -Eiq '(bad|failed|error|broken)'; then
+            error "DKMS reports a failed NVIDIA module state."
+            printf '%s\n' "$dkms_output" >&2
+            failed=1
+        elif printf '%s\n' "$dkms_output" | grep -qi 'nvidia'; then
+            info "DKMS reports NVIDIA module state."
+        else
+            warn "DKMS did not report an NVIDIA module entry."
+        fi
+    else
+        warn "dkms is unavailable; skipping DKMS state validation."
+    fi
+
+    if [ -r "$TAINT_FILE" ]; then
+        local tainted
+        tainted="$(cat "$TAINT_FILE")"
+        if [[ ! "$tainted" =~ ^[0-9]+$ ]]; then
+            warn "Kernel taint value is not numeric: $tainted"
+        elif [ "$tainted" != "0" ]; then
+            local expected_nvidia_taint_mask
+            local unexpected_taint
+            # Bit 0 (1) = proprietary module, bit 12 (4096) = out-of-tree module.
+            # NOTE(review): an unsigned module also sets bit 13 (8192); confirm
+            # whether --strict-taint should tolerate it for unsigned DKMS builds.
+            expected_nvidia_taint_mask=$((1 | 4096))
+            unexpected_taint=$((tainted & ~expected_nvidia_taint_mask))
+            if [ "$STRICT_TAINT" -eq 1 ]; then
+                if [ "$unexpected_taint" -ne 0 ]; then
+                    error "Unexpected kernel taint bits set (raw=$tainted, unexpected=$unexpected_taint)."
+                    failed=1
+                else
+                    info "Kernel taint value is $tainted (only expected NVIDIA proprietary/out-of-tree bits set)."
+                fi
+            elif [ "$unexpected_taint" -ne 0 ]; then
+                warn "Unexpected kernel taint bits set (raw=$tainted, unexpected=$unexpected_taint); not failing without --strict-taint."
+            else
+                info "Kernel taint value is $tainted (only expected NVIDIA proprietary/out-of-tree bits set)."
+            fi
+        else
+            info "Kernel taint value is 0."
+        fi
+    fi
+
+    if [ "$SKIP_OPENGL" -eq 1 ]; then
+        info "Skipping OpenGL validation because --skip-opengl was supplied."
+    elif cmd_exists glxinfo && [ -n "${DISPLAY:-}" ]; then
+        # Same 'pipefail' consideration as the lsmod check; a false negative
+        # here would trigger a rollback, so let grep read all of the output.
+        if glxinfo -B 2>/dev/null | grep -i 'OpenGL renderer' >/dev/null; then
+            info "OpenGL renderer query succeeded."
+        else
+            error "OpenGL renderer query failed."
+            failed=1
+        fi
+    else
+        warn "OpenGL validation skipped; glxinfo or DISPLAY is unavailable."
+ fi + + [ "$failed" -eq 0 ] +} + +install_driver() { + local kernel + local target + local snapshot_path + + require_root_for_changes + ensure_state_dirs + + check_nvidia_gpu_or_force + + kernel="$(uname -r)" + target="${TARGET_DRIVER:-$(recommend_driver)}" + [ -n "$target" ] || die "Could not determine an NVIDIA driver package." + + info "Target NVIDIA driver package: $target" + info "Current driver state: $(current_driver_version || printf 'unknown')" + + check_kernel_compatibility "$kernel" + check_secure_boot + + snapshot_path="$(create_snapshot "$target" | tail -n 1)" + preflight_apt "$target" + + if [ "$NO_UPDATE" -ne 1 ]; then + run_cmd apt-get update + fi + + if [ "$DRY_RUN" -eq 1 ]; then + info "Dry run complete. Snapshot and APT simulation succeeded; no packages were changed." + return 0 + fi + + if ! run_cmd env DEBIAN_FRONTEND=noninteractive apt-get install -y "${APT_DPKG_OPTIONS[@]}" "$target"; then + warn "APT install failed; attempting rollback." + rollback_snapshot "$snapshot_path" || true + return 1 + fi + + if ! validate_driver; then + warn "Validation failed; attempting automatic rollback." + rollback_snapshot "$snapshot_path" || true + return 1 + fi + + cat > "$STATE_DIR/last-success.env" <` and +include: + +- GPU and `nvidia-smi` output. +- NVIDIA, CUDA, and libnvidia package versions. +- Full `dpkg --get-selections` output. +- Manual and automatic apt-mark state. +- DKMS and loaded module state. +- NVIDIA or nouveau modprobe configuration files. + +Rollback purges NVIDIA/CUDA packages that were not present in the snapshot, +reinstalls previously recorded NVIDIA/CUDA package versions with +`--allow-downgrades`, restores NVIDIA-related manual apt marks, and refreshes +initramfs when `update-initramfs` is available. + +## Validation + +After install, the validation suite checks: + +- `nvidia-smi` responds. +- `modprobe -n nvidia` can resolve the kernel module. +- DKMS does not report failed or broken NVIDIA module state. 
+- Kernel taint is reported. With `--strict-taint`, expected NVIDIA proprietary + and out-of-tree module bits are allowed, while other taint bits fail + validation. +- OpenGL renderer output when `glxinfo` and `DISPLAY` are available. + +Headless servers often do not have an OpenGL display stack. Use +`--skip-opengl` when validating those systems. + +## Test And Image Build Notes + +For ISO/image build systems or CI that do not expose GPU hardware, use +`--dry-run --force` to confirm package availability and snapshot creation without +changing installed packages. The test suite uses `CX_NVIDIA_STATE_DIR` and +`CX_NVIDIA_LOG_FILE` to redirect all state into a temporary directory. diff --git a/tests/cx-nvidia-safe-install-tests.sh b/tests/cx-nvidia-safe-install-tests.sh new file mode 100755 index 0000000..b5c1fe7 --- /dev/null +++ b/tests/cx-nvidia-safe-install-tests.sh @@ -0,0 +1,266 @@ +#!/usr/bin/env bash +# Mocked unit tests for cx-nvidia-safe-install. +# Copyright 2026 AI Venture Holdings LLC +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +SCRIPT="$REPO_ROOT/packages/cx-gpu-nvidia/usr/sbin/cx-nvidia-safe-install" +TEST_ROOT="$(mktemp -d)" +FAKEBIN="$TEST_ROOT/bin" +STATE_DIR="$TEST_ROOT/state" +LOG_FILE="$TEST_ROOT/install.log" +FAKE_APT_LOG="$TEST_ROOT/apt.log" +FAKE_INSTALLED_NEW="$TEST_ROOT/installed-new" +FAKE_TAINT_FILE="$TEST_ROOT/tainted" +PASS=0 +FAIL=0 + +cleanup() { + rm -rf "$TEST_ROOT" +} +trap cleanup EXIT + +mkdir -p "$FAKEBIN" "$STATE_DIR" + +write_fake() { + local name="$1" + cat > "$FAKEBIN/$name" + chmod +x "$FAKEBIN/$name" +} + +write_fake nvidia-smi <<'SH' +#!/usr/bin/env bash +if [ "${1:-}" = "-L" ]; then + echo "GPU 0: NVIDIA RTX 4090 (UUID: GPU-test)" + exit 0 +fi +if [[ "${1:-}" == --query-gpu=* ]]; then + echo "535.154.05" + exit 0 +fi +if [ "${1:-}" = "-q" ]; then + echo "Driver Version : 535.154.05" + exit 0 +fi +if [ "${FAKE_NVIDIA_SMI_FAIL_EMPTY:-0}" = "1" ]; then + echo "nvidia-smi validation failure" >&2 + exit 1 +fi +echo "NVIDIA-SMI mock ok" +SH + +write_fake lspci <<'SH' +#!/usr/bin/env bash +echo "01:00.0 VGA compatible controller [0300]: NVIDIA Corporation AD102 [10de:2684]" +SH + +write_fake uname <<'SH' +#!/usr/bin/env bash +if [ "${1:-}" = "-r" ]; then + echo "6.5.0-generic" +else + /usr/bin/uname "$@" +fi +SH + +write_fake ubuntu-drivers <<'SH' +#!/usr/bin/env bash +cat <> "$FAKE_APT_LOG" +case " $* " in + *" install "*) + touch "$FAKE_INSTALLED_NEW" + ;; + *" purge "*|*" remove "*) + rm -f "$FAKE_INSTALLED_NEW" + ;; +esac +exit 0 +SH + +write_fake dpkg-query <<'SH' +#!/usr/bin/env bash +if [[ " $* " == *" linux-headers-"* ]]; then + echo "linux-headers-6.5.0-generic 6.5.0.1 ii " + exit 0 +fi +echo "nvidia-driver-535 535.154.05-0ubuntu1 ii " +echo "libnvidia-compute-535 535.154.05-0ubuntu1 ii " +if [ -f "$FAKE_INSTALLED_NEW" ]; then + echo "nvidia-driver-545 545.29.06-0ubuntu1 ii " +fi +SH + +write_fake dpkg <<'SH' +#!/usr/bin/env bash +if [ "${1:-}" = "--get-selections" ]; then + echo "nvidia-driver-535 install" + echo "libnvidia-compute-535 install" 
+fi
+SH
+
+write_fake apt-mark <<'SH'
+#!/usr/bin/env bash
+case "${1:-}" in
+  showmanual)
+    echo "nvidia-driver-535"
+    ;;
+  showauto)
+    echo "libnvidia-compute-535"
+    ;;
+  manual)
+    echo "apt-mark manual ${2:-}" >> "$FAKE_APT_LOG"
+    ;;
+esac
+SH
+
+write_fake mokutil <<'SH'
+#!/usr/bin/env bash
+echo "SecureBoot disabled"
+SH
+
+write_fake dkms <<'SH'
+#!/usr/bin/env bash
+echo "nvidia/535.154.05, 6.5.0-generic, x86_64: installed"
+SH
+
+write_fake modprobe <<'SH'
+#!/usr/bin/env bash
+if [ "${1:-}" = "-n" ] && [ "${2:-}" = "nvidia" ]; then
+  exit 0
+fi
+exit 1
+SH
+
+write_fake lsmod <<'SH'
+#!/usr/bin/env bash
+echo "nvidia 123456 0"
+SH
+
+write_fake update-initramfs <<'SH'
+#!/usr/bin/env bash
+echo "update-initramfs $*" >> "$FAKE_APT_LOG"
+SH
+
+# Put the mocks first on PATH and point the installer at the sandbox.
+export PATH="$FAKEBIN:$PATH"
+export CX_NVIDIA_STATE_DIR="$STATE_DIR"
+export CX_NVIDIA_LOG_FILE="$LOG_FILE"
+export CX_NVIDIA_SKIP_ROOT=1
+export CX_NVIDIA_TAINT_FILE="$FAKE_TAINT_FILE"
+export FAKE_APT_LOG
+export FAKE_INSTALLED_NEW
+
+# Return the sandbox to a known state: empty state dir, empty APT log,
+# taint value 0, no "newly installed" marker, no forced nvidia-smi failure.
+reset_state() {
+    rm -rf "$STATE_DIR"
+    mkdir -p "$STATE_DIR"
+    : > "$FAKE_APT_LOG"
+    echo "0" > "$FAKE_TAINT_FILE"
+    rm -f "$FAKE_INSTALLED_NEW"
+    unset FAKE_NVIDIA_SMI_FAIL_EMPTY || true
+}
+
+# Fail (return 1) with a message when $1 is not a regular file.
+assert_file() {
+    [ -f "$1" ] || {
+        echo "missing expected file: $1" >&2
+        return 1
+    }
+}
+
+# Fail (return 1) with a message when grep pattern $1 is not found in file $2.
+assert_grep() {
+    local pattern="$1"
+    local file="$2"
+    # '--' keeps a pattern that starts with '-' from being read as an option.
+    grep -q -- "$pattern" "$file" || {
+        echo "pattern '$pattern' not found in $file" >&2
+        return 1
+    }
+}
+
+# Run one test ($2...) under the label $1 and update PASS/FAIL.
+#
+# The test runs in a subshell with errexit enabled and outside of any
+# 'if'/'&&'/'||' context. Bash ignores 'set -e' for everything executed as
+# part of a condition, so calling the test as 'if "$@"' would let a failing
+# assertion in the middle of a test go unnoticed: only the exit status of
+# the test's last command would count.
+run_case() {
+    local name="$1"
+    local rc=0
+    shift
+
+    set +e
+    ( set -e; "$@" )
+    rc=$?
+    set -e
+
+    if [ "$rc" -eq 0 ]; then
+        echo "[PASS] $name"
+        PASS=$((PASS + 1))
+    else
+        echo "[FAIL] $name" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+# A dry-run install must still record a snapshot and simulate the APT install.
+test_dry_run_install_creates_snapshot() {
+    reset_state
+    bash "$SCRIPT" install --dry-run --driver nvidia-driver-545 --force --no-update > "$TEST_ROOT/dry-run.out" 2>&1
+    assert_file "$STATE_DIR/latest"
+    local latest
+    latest="$(cat "$STATE_DIR/latest")"
+    assert_file "$STATE_DIR/snapshots/$latest/meta.env"
+    assert_grep "target_driver=nvidia-driver-545" "$STATE_DIR/snapshots/$latest/meta.env"
+    assert_grep "apt-get -s install nvidia-driver-545" "$FAKE_APT_LOG"
+}
+
+# 'status' must mention the latest snapshot and the mocked driver version.
+test_status_reports_latest_snapshot() {
+    reset_state
+    bash "$SCRIPT" install --dry-run --driver nvidia-driver-545 --force --no-update > /dev/null 2>&1
+    bash "$SCRIPT" status > "$TEST_ROOT/status.out" 2>&1
+    assert_grep "Latest snapshot:" "$TEST_ROOT/status.out"
+    assert_grep "Current driver: 535.154.05" "$TEST_ROOT/status.out"
+}
+
+# When validation fails after the install, the new package must be purged.
+test_validation_failure_triggers_rollback() {
+    reset_state
+    export FAKE_NVIDIA_SMI_FAIL_EMPTY=1
+    if bash "$SCRIPT" install --driver nvidia-driver-545 --force --no-update > "$TEST_ROOT/fail.out" 2>&1; then
+        echo "install unexpectedly succeeded" >&2
+        return 1
+    fi
+    assert_grep "apt-get install -y .*nvidia-driver-545" "$FAKE_APT_LOG"
+    assert_grep "apt-get purge -y .*nvidia-driver-545" "$FAKE_APT_LOG"
+    assert_grep "Validation failed" "$TEST_ROOT/fail.out"
+}
+
+# 'validate' succeeds against the fully mocked NVIDIA stack.
+test_validate_success() {
+    reset_state
+    bash "$SCRIPT" validate --skip-opengl > "$TEST_ROOT/validate.out" 2>&1
+    assert_grep "nvidia-smi responded successfully" "$TEST_ROOT/validate.out"
+}
+
+# Taint 4097 (bits 0 and 12) is accepted under --strict-taint.
+test_strict_taint_allows_expected_nvidia_bits() {
+    reset_state
+    echo "4097" > "$FAKE_TAINT_FILE"
+    bash "$SCRIPT" validate --strict-taint --skip-opengl > "$TEST_ROOT/strict-taint.out" 2>&1
+    assert_grep "only expected NVIDIA" "$TEST_ROOT/strict-taint.out"
+}
+
+run_case "dry-run install creates rollback snapshot" test_dry_run_install_creates_snapshot
+run_case "status reports latest snapshot" test_status_reports_latest_snapshot
+run_case "validation failure triggers rollback" test_validation_failure_triggers_rollback
+run_case "validate succeeds with mocked NVIDIA stack" test_validate_success
+run_case "strict taint allows expected NVIDIA bits" test_strict_taint_allows_expected_nvidia_bits
+
+echo "Passed: $PASS"
+echo "Failed: $FAIL"
+
+[ "$FAIL" -eq 0 ]