From 37a8ebe709d3831ed674f22e9bec738adde0a480 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Mon, 29 Jun 2026 11:13:30 -0700 Subject: [PATCH] test(install): add shellcheck + bats harness for prebake logic The prebake mode dispatch (build-only / install-skip-build / install), the dkms marker write/parse/validate, and the opportunistic fast-path fallback previously had no automated guard -- CI only ran `docker buildx build` (xuexu6666 review on #162). - make install.sh sourceable for unit tests: move set-flags + config/helper sources + EXIT trap into main(), guard main() behind BASH_SOURCE, extract purge_gpu_cache, parameterize config/modules paths. Behavior-preserving (execution still runs main()). - test/install.bats: 15 tests (marker write/parse/validate, fast-path fallback, target-kernel selection, all three dispatch modes). - ci.yaml: new lint-and-test job (shellcheck install.sh+entrypoint.sh, bats test/). Local: shellcheck clean, bats 15/15 pass. Signed-off-by: Ganeshkumar Ashokavardhanan --- .github/workflows/ci.yaml | 12 +++ install.sh | 110 ++++++++++++-------- test/install.bats | 206 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 41 deletions(-) create mode 100644 test/install.bats diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 172f20a..8b04713 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -175,3 +175,15 @@ jobs: rm -rf /tmp/.buildx-cache mv /tmp/.buildx-cache-new /tmp/.buildx-cache + lint-and-test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Shellcheck prebake scripts + run: shellcheck -S warning install.sh entrypoint.sh + - name: Install bats + run: | + sudo apt-get update + sudo apt-get install -y bats + - name: Run install.sh unit tests + run: bats test/install.bats diff --git a/install.sh b/install.sh index d9917a6..83125a0 100644 --- a/install.sh +++ b/install.sh @@ -1,8 +1,10 @@ #!/usr/bin/env bash -set -euxo pipefail 
-source /opt/gpu/config.sh -source /opt/gpu/package_manager_helpers.sh +# NOTE: `set -euxo pipefail` and the `source`s of the gpu config + package-manager helpers are +# applied inside main() rather than at top level, so this script can be sourced by the unit tests +# (test/install.bats) to exercise the individual functions on a GPU-less host without running the +# install or requiring /opt/gpu to exist. main() is invoked only when the script is executed +# directly (see the guard at the bottom). PS4='+ $(date -u -I"seconds" | cut -c1-19) ' @@ -35,11 +37,13 @@ ARCH=$(uname -m) # normal node boot, where uname -r is already correct). target_build_kernel() { local d k - # newest installed kernel that has a headers/build tree (the VHD's target kernel) - k=$(for d in /lib/modules/*/build; do + # newest installed kernel that has a headers/build tree (the VHD's target kernel). The modules + # root is overridable (AKSGPU_MODULES_ROOT) so the unit tests can point it at a fixture dir. + local modules_root="${AKSGPU_MODULES_ROOT:-/lib/modules}" + k=$(for d in "${modules_root}"/*/build; do [ -d "$d" ] || continue d=${d%/build} - echo "${d#/lib/modules/}" + echo "${d#"${modules_root}"/}" done | sort -V | tail -n1) if [ -n "$k" ]; then echo "$k"; else uname -r; fi } @@ -62,9 +66,9 @@ cleanup_overlay() { fi set -e } -# Reset PS4 on exit alongside the overlay cleanup (a single EXIT trap, since a second -# `trap ... EXIT` would replace the first rather than chain). -trap 'cleanup_overlay; PS4="+ "' EXIT +# The EXIT trap that runs cleanup_overlay (and resets PS4) is installed at the start of main(), +# not here, so that sourcing this script for unit tests neither registers a trap nor tears down +# mounts when the test process exits. 
resolve_runfile() { if [[ "${DRIVER_KIND}" == "cuda" ]]; then @@ -225,41 +229,65 @@ fast_path_ok() { modinfo -k "${KERNEL_NAME}" nvidia >/dev/null 2>&1 || return 1 } -set +euo pipefail -open_devices="$(lsof /dev/nvidia* 2>/dev/null)" -echo "Open devices: $open_devices" - -open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)" -echo "Open gridd: $open_gridd" -set -euo pipefail - -if [ "${AKSGPU_BUILD_ONLY}" = "1" ]; then - # VHD build time: compile + cache + marker only, no device access. Target the kernel the VHD - # will boot (not the builder's running kernel) so the prebuilt module + marker match at boot. - KERNEL_NAME="$(target_build_kernel)" - echo "aks-gpu: build-only mode (prebuilding kernel module for kernel ${KERNEL_NAME}; builder running $(uname -r))" - echo "aks-gpu: kernels with installed headers (build trees):"; ls -ld /lib/modules/*/build 2>/dev/null || echo " (none found)" - build_and_mark +# purge_gpu_cache removes the gpu cache that entrypoint.sh staged under /opt/gpu once install.sh +# has consumed it. Factored out of the two former inline `rm -r /opt/gpu` calls so the unit tests +# can stub it when exercising main()'s dispatch. +purge_gpu_cache() { rm -r /opt/gpu - exit 0 -fi +} -install_nvidia_container_toolkit +# main is the install entrypoint. It is run only when the script is executed directly (the guard +# at the bottom), so sourcing the script for unit tests loads the functions above without running +# any install steps. The set-flags and the gpu config/helper sources live here (not at top level) +# for the same reason. 
+main() { + set -euxo pipefail + # shellcheck source=/dev/null + source "${AKSGPU_CONFIG_PATH:-/opt/gpu/config.sh}" + # shellcheck source=/dev/null + source "${AKSGPU_PMH_PATH:-/opt/gpu/package_manager_helpers.sh}" + trap 'cleanup_overlay; PS4="+ "' EXIT + + set +euo pipefail + open_devices="$(lsof /dev/nvidia* 2>/dev/null)" + echo "Open devices: $open_devices" + + open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)" + echo "Open gridd: $open_gridd" + set -euo pipefail + + if [ "${AKSGPU_BUILD_ONLY}" = "1" ]; then + # VHD build time: compile + cache + marker only, no device access. Target the kernel the VHD + # will boot (not the builder's running kernel) so the prebuilt module + marker match at boot. + KERNEL_NAME="$(target_build_kernel)" + echo "aks-gpu: build-only mode (prebuilding kernel module for kernel ${KERNEL_NAME}; builder running $(uname -r))" + echo "aks-gpu: kernels with installed headers (build trees):"; ls -ld "${AKSGPU_MODULES_ROOT:-/lib/modules}"/*/build 2>/dev/null || echo " (none found)" + build_and_mark + purge_gpu_cache + exit 0 + fi -if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ] && baked_marker_matches && fast_path_ok; then - # Prebuilt module is present and valid for this kernel+driver: skip the ~100s recompile. - echo "aks-gpu: using kernel module prebuilt in the VHD for kernel ${KERNEL_NAME} (recompile skipped)" -else - if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ]; then - echo "aks-gpu: prebuilt module missing/invalid for ${DRIVER_KIND} ${DRIVER_VERSION} on ${KERNEL_NAME}; building from source" + install_nvidia_container_toolkit + + if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ] && baked_marker_matches && fast_path_ok; then + # Prebuilt module is present and valid for this kernel+driver: skip the ~100s recompile. 
+ echo "aks-gpu: using kernel module prebuilt in the VHD for kernel ${KERNEL_NAME} (recompile skipped)" + else + if [ "${AKSGPU_SKIP_KERNEL_BUILD}" = "1" ]; then + echo "aks-gpu: prebuilt module missing/invalid for ${DRIVER_KIND} ${DRIVER_VERSION} on ${KERNEL_NAME}; building from source" + fi + # No bespoke stale-driver teardown is needed: `nvidia-installer -s` automatically uninstalls + # any previously runfile-installed driver -- including a mismatched prebaked one -- and its + # DKMS registration before installing the new one. build_and_mark then refreshes the marker + # to match what we just built, so subsequent boots take the fast path. + build_and_mark fi - # No bespoke stale-driver teardown is needed: `nvidia-installer -s` automatically uninstalls - # any previously runfile-installed driver -- including a mismatched prebaked one -- and its - # DKMS registration before installing the new one. build_and_mark then refreshes the marker - # to match what we just built, so subsequent boots take the fast path. - build_and_mark -fi -device_init + device_init -rm -r /opt/gpu + purge_gpu_cache +} + +if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then + main +fi diff --git a/test/install.bats b/test/install.bats new file mode 100644 index 0000000..26b84cc --- /dev/null +++ b/test/install.bats @@ -0,0 +1,206 @@ +#!/usr/bin/env bats +# +# Unit tests for install.sh: the prebake mode dispatch (build-only / install-skip-build / +# install), the dkms marker write+parse+validate, the opportunistic fast-path fallback, and +# the target-kernel selection. These run on a GPU-less host: install.sh is sourced (its main() +# is guarded by BASH_SOURCE so nothing executes on source) and the device/host-dependent steps +# are stubbed. Covers the runtime branching that previously had no automated guard (only the +# cross-repo AgentBaker e2e exercised it). + +setup() { + TEST_TMP="$(mktemp -d)" + mkdir -p "${TEST_TMP}/bin" + + # Fixture module root for target_build_kernel. 
+ export AKSGPU_MODULES_ROOT="${TEST_TMP}/modules" + mkdir -p "${AKSGPU_MODULES_ROOT}" + + # Fake gpu config + package-manager helpers so install.sh can be sourced without /opt/gpu. + export AKSGPU_CONFIG_PATH="${TEST_TMP}/config.sh" + export AKSGPU_PMH_PATH="${TEST_TMP}/pmh.sh" + cat > "${AKSGPU_CONFIG_PATH}" <<'EOF' +DRIVER_VERSION="${DRIVER_VERSION:-580.0.0}" +DRIVER_KIND="${DRIVER_KIND:-cuda}" +GPU_DEST="${GPU_DEST:-/usr/bin}" +NVIDIA_CONTAINER_TOOLKIT_VER="1.19.1" +NVIDIA_PACKAGES="pkg" +EOF + printf ':\n' > "${AKSGPU_PMH_PATH}" + + # Load the functions under test. main() is guarded, so sourcing runs no install steps. + # shellcheck source=/dev/null + source "${BATS_TEST_DIRNAME}/../install.sh" + + # Stable identity for the marker tests (individual tests override as needed). + KERNEL_NAME="5.15.0-1114-azure" + DRIVER_VERSION="580.0.0" + DRIVER_KIND="cuda" + ARCH="x86_64" + DKMS_MARKER_FILE="${TEST_TMP}/dkms-marker" +} + +teardown() { + rm -rf "${TEST_TMP}" +} + +# --- helpers --------------------------------------------------------------- + +# _stub_bin <name> <exit_code>: put an executable stub with the given exit code on the test PATH. +_stub_bin() { + cat > "${TEST_TMP}/bin/${1}" <<EOF +#!/usr/bin/env bash +exit ${2} +EOF + chmod +x "${TEST_TMP}/bin/${1}" +} + +# _has_sort_v: succeeds when the host's sort supports version sort (-V), which target_build_kernel needs. +_has_sort_v() { sort -V </dev/null >/dev/null 2>&1; } + +# _stub_dispatch: replace the device/host-dependent steps so main()'s dispatch can run GPU-less. +# Each stub records that it ran via a sentinel file. 
+_stub_dispatch() { + cleanup_overlay() { :; } + install_nvidia_container_toolkit() { :; } + build_and_mark() { echo "BUILD_AND_MARK"; touch "${TEST_TMP}/build_and_mark.ran"; } + device_init() { echo "DEVICE_INIT"; touch "${TEST_TMP}/device_init.ran"; } + purge_gpu_cache() { :; } +} + +# --- marker: write ------------------------------------------------------- + +@test "write_dkms_marker records kernel/version/kind/arch" { + KERNEL_NAME="6.8.0-1059-azure"; DRIVER_VERSION="580.126.09"; DRIVER_KIND="cuda"; ARCH="x86_64" + write_dkms_marker + run cat "${DKMS_MARKER_FILE}" + [ "$status" -eq 0 ] + [[ "$output" == *"kernel=6.8.0-1059-azure"* ]] + [[ "$output" == *"driver_version=580.126.09"* ]] + [[ "$output" == *"driver_kind=cuda"* ]] + [[ "$output" == *"arch=x86_64"* ]] +} + +@test "write_dkms_marker publishes atomically (no .tmp left behind)" { + write_dkms_marker + run bash -c "ls ${TEST_TMP}/dkms-marker.tmp.* 2>/dev/null" + [ "$status" -ne 0 ] +} + +# --- marker: parse + validate ------------------------------------------- + +@test "baked_marker_matches succeeds on exact kernel+version+kind match" { + write_dkms_marker + run baked_marker_matches + [ "$status" -eq 0 ] +} + +@test "baked_marker_matches fails on kernel mismatch (kernel drift since bake)" { + write_dkms_marker + KERNEL_NAME="6.11.0-1000-azure" + run baked_marker_matches + [ "$status" -ne 0 ] +} + +@test "baked_marker_matches fails on driver_version mismatch" { + write_dkms_marker + DRIVER_VERSION="999.99.99" + run baked_marker_matches + [ "$status" -ne 0 ] +} + +@test "baked_marker_matches fails on driver_kind mismatch (cuda marker on grid node)" { + write_dkms_marker + DRIVER_KIND="grid" + run baked_marker_matches + [ "$status" -ne 0 ] +} + +@test "baked_marker_matches fails when the marker file is absent" { + rm -f "${DKMS_MARKER_FILE}" + run baked_marker_matches + [ "$status" -ne 0 ] +} + +# --- fast-path fallback -------------------------------------------------- + +@test "fast_path_ok succeeds when 
ldconfig+dkms+modinfo all pass" { + _stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 0 + PATH="${TEST_TMP}/bin:$PATH" + run fast_path_ok + [ "$status" -eq 0 ] +} + +@test "fast_path_ok fails (-> full build) when modinfo reports the module is unusable" { + _stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 1 + PATH="${TEST_TMP}/bin:$PATH" + run fast_path_ok + [ "$status" -ne 0 ] +} + +@test "fast_path_ok fails (-> full build) when dkms status fails" { + _stub_bin ldconfig 0; _stub_bin dkms 1; _stub_bin modinfo 0 + PATH="${TEST_TMP}/bin:$PATH" + run fast_path_ok + [ "$status" -ne 0 ] +} + +# --- target kernel selection -------------------------------------------- + +@test "target_build_kernel picks the newest kernel that has a build tree" { + _has_sort_v || skip "requires GNU sort -V (runs on the Linux CI)" + mkdir -p "${AKSGPU_MODULES_ROOT}/5.15.0-1114-azure/build" + mkdir -p "${AKSGPU_MODULES_ROOT}/6.8.0-1059-azure/build" + mkdir -p "${AKSGPU_MODULES_ROOT}/6.8.0-1200-azure" # no build tree -> must be ignored + run target_build_kernel + [ "$status" -eq 0 ] + [ "$output" = "6.8.0-1059-azure" ] +} + +@test "target_build_kernel falls back to uname -r when no build trees exist" { + run target_build_kernel + [ "$status" -eq 0 ] + [ "$output" = "$(uname -r)" ] +} + +# --- mode dispatch ------------------------------------------------------- + +@test "dispatch build-only: builds+marks then skips device init" { + _has_sort_v || skip "build-only resolves target kernel via GNU sort -V (runs on the Linux CI)" + _stub_dispatch + mkdir -p "${AKSGPU_MODULES_ROOT}/6.8.0-1059-azure/build" + AKSGPU_BUILD_ONLY=1; AKSGPU_SKIP_KERNEL_BUILD=0 + run main + [ "$status" -eq 0 ] + [[ "$output" == *"build-only mode"* ]] + [ -f "${TEST_TMP}/build_and_mark.ran" ] + [ ! 
-f "${TEST_TMP}/device_init.ran" ] +} + +@test "dispatch install-skip-build with matching marker: skips recompile, runs device init" { + _stub_dispatch + _stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 0 + PATH="${TEST_TMP}/bin:$PATH" + KERNEL_NAME="5.15.0-1114-azure"; DRIVER_VERSION="580.0.0"; DRIVER_KIND="cuda"; ARCH="x86_64" + write_dkms_marker + AKSGPU_BUILD_ONLY=0; AKSGPU_SKIP_KERNEL_BUILD=1 + run main + [ "$status" -eq 0 ] + [[ "$output" == *"recompile skipped"* ]] + [ ! -f "${TEST_TMP}/build_and_mark.ran" ] + [ -f "${TEST_TMP}/device_init.ran" ] +} + +@test "dispatch install-skip-build with mismatched marker: falls back to a full build" { + _stub_dispatch + _stub_bin ldconfig 0; _stub_bin dkms 0; _stub_bin modinfo 0 + PATH="${TEST_TMP}/bin:$PATH" + KERNEL_NAME="5.15.0-1114-azure"; DRIVER_VERSION="580.0.0"; DRIVER_KIND="cuda"; ARCH="x86_64" + write_dkms_marker + DRIVER_VERSION="999.0.0" # node now needs a different version than the baked marker + AKSGPU_BUILD_ONLY=0; AKSGPU_SKIP_KERNEL_BUILD=1 + run main + [ "$status" -eq 0 ] + [[ "$output" == *"building from source"* ]] + [ -f "${TEST_TMP}/build_and_mark.ran" ] + [ -f "${TEST_TMP}/device_init.ran" ] +}