Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,15 @@ jobs:
rm -rf /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache

# lint-and-test: static analysis plus unit tests for the shell scripts.
#   1. check out the repository
#   2. run shellcheck on install.sh and entrypoint.sh; `-S warning` sets the minimum severity
#      that is reported to "warning"
#   3. install bats with apt on the runner
#   4. run the bats suite in test/install.bats
lint-and-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- name: Shellcheck prebake scripts
run: shellcheck -S warning install.sh entrypoint.sh
- name: Install bats
run: |
sudo apt-get update
sudo apt-get install -y bats
- name: Run install.sh unit tests
run: bats test/install.bats
110 changes: 69 additions & 41 deletions install.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env bash

# NOTE: `set -euxo pipefail` and the `source`s of the gpu config + package-manager helpers are
# applied inside main() rather than at top level, so this script can be sourced by the unit tests
# (test/install.bats) to exercise the individual functions on a GPU-less host without running the
# install or requiring /opt/gpu to exist. main() is invoked only when the script is executed
# directly (see the guard at the bottom).

# Trace prefix used under `set -x`. The single quotes defer the $(date ...) substitution until bash
# expands PS4 for each traced command, so every trace line carries a fresh UTC timestamp.
PS4='+ $(date -u -I"seconds" | cut -c1-19) '

Expand Down Expand Up @@ -35,11 +37,13 @@ ARCH=$(uname -m)
# normal node boot, where uname -r is already correct).
target_build_kernel() {
  # Prints the kernel release to build the module for: the highest-versioned directory under the
  # modules root that contains a build/ tree, or `uname -r` when no such directory exists.
  # Globals: AKSGPU_MODULES_ROOT (read, optional) - overrides the modules root (default
  #          /lib/modules) so the unit tests can point it at a fixture dir.
  # Outputs: one kernel release string on stdout.
  local d k
  local modules_root="${AKSGPU_MODULES_ROOT:-/lib/modules}"
  # newest installed kernel that has a headers/build tree (the VHD's target kernel)
  k=$(for d in "${modules_root}"/*/build; do
    # an unmatched glob stays literal; the -d test filters it (and any non-directory) out
    [ -d "$d" ] || continue
    d=${d%/build}
    # strip the (possibly overridden) root so only the kernel release remains
    echo "${d#"${modules_root}"/}"
  done | sort -V | tail -n1)
  if [ -n "$k" ]; then echo "$k"; else uname -r; fi
}
Expand All @@ -62,9 +66,9 @@ cleanup_overlay() {
fi
set -e
}
# The EXIT trap that runs cleanup_overlay (and resets PS4) is installed at the start of main(),
# not here, so that sourcing this script for unit tests neither registers a trap nor tears down
# mounts when the test process exits.

resolve_runfile() {
if [[ "${DRIVER_KIND}" == "cuda" ]]; then
Expand Down Expand Up @@ -225,41 +229,65 @@ fast_path_ok() {
modinfo -k "${KERNEL_NAME}" nvidia >/dev/null 2>&1 || return 1
}

# purge_gpu_cache removes the gpu cache that entrypoint.sh staged under /opt/gpu once install.sh
# has consumed it. Factored out of the two former inline `rm -r /opt/gpu` calls so the unit tests
# can stub it when exercising main()'s dispatch.
# No top-level statements remain in this span: the device probes and the build-only branch run
# from main(), so sourcing the script executes nothing here.
purge_gpu_cache() {
  rm -r /opt/gpu
}

install_nvidia_container_toolkit
# main is the install entrypoint. It is run only when the script is executed directly (the guard
# at the bottom), so sourcing the script for unit tests loads the functions above without running
# any install steps. The set-flags and the gpu config/helper sources live here (not at top level)
# for the same reason.
#
# Globals read: AKSGPU_CONFIG_PATH / AKSGPU_PMH_PATH (optional overrides of the sourced files),
#               AKSGPU_MODULES_ROOT (optional), AKSGPU_BUILD_ONLY, AKSGPU_SKIP_KERNEL_BUILD,
#               DRIVER_KIND, DRIVER_VERSION, KERNEL_NAME.
# Globals written: KERNEL_NAME (build-only mode), open_devices, open_gridd.
# Exits 0 straight from the build-only branch; otherwise returns after purge_gpu_cache.
main() {
  set -euxo pipefail
  # shellcheck source=/dev/null
  source "${AKSGPU_CONFIG_PATH:-/opt/gpu/config.sh}"
  # shellcheck source=/dev/null
  source "${AKSGPU_PMH_PATH:-/opt/gpu/package_manager_helpers.sh}"
  # Single EXIT trap: overlay cleanup plus PS4 reset.
  trap 'cleanup_overlay; PS4="+ "' EXIT

  # Diagnostics only: lsof is allowed to fail (no devices, lsof missing), so strict mode is
  # relaxed around the two probes and restored afterwards.
  set +euo pipefail
  open_devices="$(lsof /dev/nvidia* 2>/dev/null)"
  echo "Open devices: $open_devices"

  open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)"
  echo "Open gridd: $open_gridd"
  set -euo pipefail

  if [ "${AKSGPU_BUILD_ONLY}" = "1" ]; then
    # VHD build time: compile + cache + marker only, no device access. Target the kernel the VHD
    # will boot (not the builder's running kernel) so the prebuilt module + marker match at boot.
    KERNEL_NAME="$(target_build_kernel)"
    echo "aks-gpu: build-only mode (prebuilding kernel module for kernel ${KERNEL_NAME}; builder running $(uname -r))"
    echo "aks-gpu: kernels with installed headers (build trees):"; ls -ld "${AKSGPU_MODULES_ROOT:-/lib/modules}"/*/build 2>/dev/null || echo " (none found)"
    build_and_mark
    purge_gpu_cache
    exit 0
  fi

  install_nvidia_container_toolkit

  if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ] && baked_marker_matches && fast_path_ok; then
    # Prebuilt module is present and valid for this kernel+driver: skip the ~100s recompile.
    echo "aks-gpu: using kernel module prebuilt in the VHD for kernel ${KERNEL_NAME} (recompile skipped)"
  else
    if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ]; then
      echo "aks-gpu: prebuilt module missing/invalid for ${DRIVER_KIND} ${DRIVER_VERSION} on ${KERNEL_NAME}; building from source"
    fi
    # No bespoke stale-driver teardown is needed: `nvidia-installer -s` automatically uninstalls
    # any previously runfile-installed driver -- including a mismatched prebaked one -- and its
    # DKMS registration before installing the new one. build_and_mark then refreshes the marker
    # to match what we just built, so subsequent boots take the fast path.
    build_and_mark
  fi

  device_init

  purge_gpu_cache
}

# Run the installer only when this file is the program being executed. When it is sourced,
# BASH_SOURCE[0] names this file while $0 names the sourcing shell/script, so main is skipped.
if [[ "${0}" == "${BASH_SOURCE[0]}" ]]; then
  main
fi
206 changes: 206 additions & 0 deletions test/install.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#!/usr/bin/env bats
#
# Unit tests for install.sh: the prebake mode dispatch (build-only / install-skip-build /
# install), the dkms marker write+parse+validate, the opportunistic fast-path fallback, and
# the target-kernel selection. These run on a GPU-less host: install.sh is sourced (its main()
# is guarded by BASH_SOURCE so nothing executes on source) and the device/host-dependent steps
# are stubbed. Covers the runtime branching that previously had no automated guard (only the
# cross-repo AgentBaker e2e exercised it).

# setup runs before each test (bats hook). It builds a scratch directory, points install.sh's
# overridable paths at fixtures inside it, sources install.sh, and then pins the identity
# variables the marker tests rely on. TEST_TMP stays global because teardown and the tests use it.
setup() {
  TEST_TMP="$(mktemp -d)"
  mkdir -p "${TEST_TMP}/bin"

  # Fixture locations: module root for target_build_kernel, plus a fake gpu config and
  # package-manager helper so install.sh can be used without /opt/gpu.
  export AKSGPU_MODULES_ROOT="${TEST_TMP}/modules"
  export AKSGPU_CONFIG_PATH="${TEST_TMP}/config.sh"
  export AKSGPU_PMH_PATH="${TEST_TMP}/pmh.sh"
  mkdir -p "${AKSGPU_MODULES_ROOT}"

  # The config lines are written literally (single quotes) so the ${VAR:-default} expansions
  # happen when the file is sourced, not here.
  # shellcheck disable=SC2016
  printf '%s\n' \
    'DRIVER_VERSION="${DRIVER_VERSION:-580.0.0}"' \
    'DRIVER_KIND="${DRIVER_KIND:-cuda}"' \
    'GPU_DEST="${GPU_DEST:-/usr/bin}"' \
    'NVIDIA_CONTAINER_TOOLKIT_VER="1.19.1"' \
    'NVIDIA_PACKAGES="pkg"' \
    > "${AKSGPU_CONFIG_PATH}"
  echo ':' > "${AKSGPU_PMH_PATH}"

  # Load the functions under test. main() is guarded, so sourcing runs no install steps.
  # shellcheck source=/dev/null
  source "${BATS_TEST_DIRNAME}/../install.sh"

  # Assigned after the source so these values win over anything install.sh set; individual
  # tests override them as needed.
  KERNEL_NAME="5.15.0-1114-azure"
  DRIVER_VERSION="580.0.0"
  DRIVER_KIND="cuda"
  ARCH="x86_64"
  DKMS_MARKER_FILE="${TEST_TMP}/dkms-marker"
}

# teardown runs after each test (bats hook) and deletes the scratch directory made by setup.
teardown() {
  local scratch_dir="${TEST_TMP}"
  rm -rf "${scratch_dir}"
}

# --- helpers ---------------------------------------------------------------

# _stub_bin <name> <exit_code>
# Writes an executable two-line bash script to ${TEST_TMP}/bin/<name> that does nothing except
# exit with <exit_code>. Tests prepend ${TEST_TMP}/bin to PATH to make the stub take effect.
_stub_bin() {
  local stub_path="${TEST_TMP}/bin/${1}"
  printf '#!/usr/bin/env bash\nexit %s\n' "${2}" > "${stub_path}"
  chmod +x "${stub_path}"
}

# _has_sort_v: succeeds when the `sort` on PATH accepts -V (version sort). Tests that depend on
# version ordering use it to skip themselves on hosts whose sort lacks the option.
_has_sort_v() {
  sort -V <<<$'1\n2' >/dev/null 2>&1
}

# _stub_dispatch: redefines the device/host-dependent steps that main() calls so its dispatch can
# run on a GPU-less host. build_and_mark and device_init print a tag and drop a sentinel file
# under ${TEST_TMP} so tests can tell whether they ran; the other three are silent no-ops.
_stub_dispatch() {
  cleanup_overlay() { true; }
  install_nvidia_container_toolkit() { true; }
  purge_gpu_cache() { true; }

  build_and_mark() {
    printf '%s\n' "BUILD_AND_MARK"
    touch "${TEST_TMP}/build_and_mark.ran"
  }

  device_init() {
    printf '%s\n' "DEVICE_INIT"
    touch "${TEST_TMP}/device_init.ran"
  }
}

# --- marker: write -------------------------------------------------------

@test "write_dkms_marker records kernel/version/kind/arch" {
# Use a kernel and driver version that differ from the values setup() assigns, so the kernel and
# driver_version assertions cannot be satisfied by stale defaults.
KERNEL_NAME="6.8.0-1059-azure"; DRIVER_VERSION="580.126.09"; DRIVER_KIND="cuda"; ARCH="x86_64"
write_dkms_marker
# Read the marker back and require each key=value pair to appear somewhere in it.
run cat "${DKMS_MARKER_FILE}"
[ "$status" -eq 0 ]
[[ "$output" == *"kernel=6.8.0-1059-azure"* ]]
[[ "$output" == *"driver_version=580.126.09"* ]]
[[ "$output" == *"driver_kind=cuda"* ]]
[[ "$output" == *"arch=x86_64"* ]]
}

@test "write_dkms_marker publishes atomically (no .tmp left behind)" {
  write_dkms_marker
  # The published marker must exist; without this check a writer that produced nothing at all
  # would also satisfy the "no temp file" assertion below.
  [ -f "${DKMS_MARKER_FILE}" ]
  # Look for leftovers next to the marker using the same path variable the writer uses. The
  # ".tmp.*" suffix is the temp-file naming this test has always assumed -- confirm against
  # write_dkms_marker if that naming changes. An unmatched glob stays literal (or expands to
  # nothing under nullglob); either way the first element names no existing file.
  local leftovers=("${DKMS_MARKER_FILE}".tmp.*)
  [ ! -e "${leftovers[0]}" ]
}

# --- marker: parse + validate -------------------------------------------

@test "baked_marker_matches succeeds on exact kernel+version+kind match" {
# Write and validate with the same identity variables (those assigned in setup()).
write_dkms_marker
run baked_marker_matches
[ "$status" -eq 0 ]
}

@test "baked_marker_matches fails on kernel mismatch (kernel drift since bake)" {
# Record the marker for setup()'s kernel, then change KERNEL_NAME before validating.
write_dkms_marker
KERNEL_NAME="6.11.0-1000-azure"
run baked_marker_matches
[ "$status" -ne 0 ]
}

@test "baked_marker_matches fails on driver_version mismatch" {
# Record the marker for setup()'s driver version, then change DRIVER_VERSION before validating.
write_dkms_marker
DRIVER_VERSION="999.99.99"
run baked_marker_matches
[ "$status" -ne 0 ]
}

@test "baked_marker_matches fails on driver_kind mismatch (cuda marker on grid node)" {
# The marker is written with DRIVER_KIND=cuda (from setup()); validation then runs as "grid".
write_dkms_marker
DRIVER_KIND="grid"
run baked_marker_matches
[ "$status" -ne 0 ]
}

@test "baked_marker_matches fails when the marker file is absent" {
# No marker is written in this test; the rm -f only makes the precondition explicit.
rm -f "${DKMS_MARKER_FILE}"
run baked_marker_matches
[ "$status" -ne 0 ]
}

# --- fast-path fallback --------------------------------------------------

@test "fast_path_ok succeeds when ldconfig+dkms+modinfo all pass" {
# All three stubs exit 0; putting the stub dir first on PATH makes them shadow any real tools.
_stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 0
PATH="${TEST_TMP}/bin:$PATH"
run fast_path_ok
[ "$status" -eq 0 ]
}

@test "fast_path_ok fails (-> full build) when modinfo reports the module is unusable" {
# Only the modinfo stub exits non-zero, so a failure here is attributable to that check.
_stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 1
PATH="${TEST_TMP}/bin:$PATH"
run fast_path_ok
[ "$status" -ne 0 ]
}

@test "fast_path_ok fails (-> full build) when dkms status fails" {
# Only the dkms stub exits non-zero, so a failure here is attributable to that check.
_stub_bin ldconfig 0; _stub_bin dkms 1; _stub_bin modinfo 0
PATH="${TEST_TMP}/bin:$PATH"
run fast_path_ok
[ "$status" -ne 0 ]
}

# --- target kernel selection --------------------------------------------

@test "target_build_kernel picks the newest kernel that has a build tree" {
# Fixture: two kernels with a build/ directory and a third, higher-versioned kernel without
# one. The expected answer is the higher of the two that have build/.
_has_sort_v || skip "requires GNU sort -V (runs on the Linux CI)"
mkdir -p "${AKSGPU_MODULES_ROOT}/5.15.0-1114-azure/build"
mkdir -p "${AKSGPU_MODULES_ROOT}/6.8.0-1059-azure/build"
mkdir -p "${AKSGPU_MODULES_ROOT}/6.8.0-1200-azure" # no build tree -> must be ignored
run target_build_kernel
[ "$status" -eq 0 ]
[ "$output" = "6.8.0-1059-azure" ]
}

@test "target_build_kernel falls back to uname -r when no build trees exist" {
# setup() creates AKSGPU_MODULES_ROOT as an empty directory and this test adds nothing to it.
run target_build_kernel
[ "$status" -eq 0 ]
[ "$output" = "$(uname -r)" ]
}

# --- mode dispatch -------------------------------------------------------

@test "dispatch build-only: builds+marks then skips device init" {
# One kernel with a build/ tree gives target_build_kernel something to resolve; the stubs from
# _stub_dispatch record which steps main() reached via sentinel files.
_has_sort_v || skip "build-only resolves target kernel via GNU sort -V (runs on the Linux CI)"
_stub_dispatch
mkdir -p "${AKSGPU_MODULES_ROOT}/6.8.0-1059-azure/build"
AKSGPU_BUILD_ONLY=1; AKSGPU_SKIP_KERNEL_BUILD=0
run main
[ "$status" -eq 0 ]
[[ "$output" == *"build-only mode"* ]]
# The build step must have run and device init must not have.
[ -f "${TEST_TMP}/build_and_mark.ran" ]
[ ! -f "${TEST_TMP}/device_init.ran" ]
}

@test "dispatch install-skip-build with matching marker: skips recompile, runs device init" {
# Passing stubs for ldconfig/dkms/modinfo plus a marker written with the current identity give
# main() a skip-build request with nothing to invalidate it.
_stub_dispatch
_stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 0
PATH="${TEST_TMP}/bin:$PATH"
KERNEL_NAME="5.15.0-1114-azure"; DRIVER_VERSION="580.0.0"; DRIVER_KIND="cuda"; ARCH="x86_64"
write_dkms_marker
AKSGPU_BUILD_ONLY=0; AKSGPU_SKIP_KERNEL_BUILD=1
run main
[ "$status" -eq 0 ]
[[ "$output" == *"recompile skipped"* ]]
# No rebuild sentinel, but device init must still have run.
[ ! -f "${TEST_TMP}/build_and_mark.ran" ]
[ -f "${TEST_TMP}/device_init.ran" ]
}

@test "dispatch install-skip-build with mismatched marker: falls back to a full build" {
# Same arrangement as the matching-marker case, except DRIVER_VERSION is changed after the
# marker is written. The fixture config only defaults DRIVER_VERSION when it is unset, so the
# changed value is still in effect after main() sources it.
_stub_dispatch
_stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 0
PATH="${TEST_TMP}/bin:$PATH"
KERNEL_NAME="5.15.0-1114-azure"; DRIVER_VERSION="580.0.0"; DRIVER_KIND="cuda"; ARCH="x86_64"
write_dkms_marker
DRIVER_VERSION="999.0.0" # node now needs a different version than the baked marker
AKSGPU_BUILD_ONLY=0; AKSGPU_SKIP_KERNEL_BUILD=1
run main
[ "$status" -eq 0 ]
[[ "$output" == *"building from source"* ]]
# Both the rebuild and the device init must have run.
[ -f "${TEST_TMP}/build_and_mark.ran" ]
[ -f "${TEST_TMP}/device_init.ran" ]
}
Loading