From a99f333237d160d34db332bd601e8387ef5f1b71 Mon Sep 17 00:00:00 2001 From: Terje Kvernes Date: Wed, 8 Oct 2025 11:26:45 +0200 Subject: [PATCH] Handle nvidia-smi mismatch errors for archdetect. --- init/eessi_archdetect.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/init/eessi_archdetect.sh b/init/eessi_archdetect.sh index 4fd979ce..ec22ee22 100755 --- a/init/eessi_archdetect.sh +++ b/init/eessi_archdetect.sh @@ -181,6 +181,12 @@ accelpath() { nvidia_smi_out=$(mktemp -p /tmp nvidia_smi_out.XXXXX) nvidia-smi --query-gpu=gpu_name,count,driver_version,compute_cap --format=csv,noheader 2>&1 > $nvidia_smi_out if [[ $? -eq 0 ]]; then + if grep -q "Failed to initialize NVML: Driver/library version mismatch" $nvidia_smi_out; then + log "ERROR" "accelpath: nvidia-smi command failed with 'Failed to initialize NVML: Driver/library version mismatch'" + rm -f $nvidia_smi_out + exit 4 + fi + nvidia_smi_info=$(head -1 $nvidia_smi_out) cuda_cc=$(echo $nvidia_smi_info | sed 's/, /,/g' | cut -f4 -d, | sed 's/\.//g') log "DEBUG" "accelpath: CUDA compute capability '${cuda_cc}' derived from nvidia-smi output '${nvidia_smi_info}'"