#! /bin/bash
# Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
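#
# driver-manager: manages the lifecycle of the containerized NVIDIA driver on a
# Kubernetes node. Before a driver restart or upgrade it evicts GPU Operator
# components (and optionally all GPU pods), optionally cordons and drains the
# node, unloads the NVIDIA kernel modules, and unmounts the driver rootfs.
#
# Illustrative invocation (the node name is hypothetical; the script normally
# runs inside the driver container, where kubectl and nvdrain are available):
#   NODE_NAME=worker-0 ./driver-manager uninstall_driver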
NODE_NAME=${NODE_NAME:?"Missing node name"}
DRAIN_USE_FORCE=${DRAIN_USE_FORCE:-"false"}
DRAIN_POD_SELECTOR_LABEL=${DRAIN_POD_SELECTOR_LABEL:-""}
DRAIN_TIMEOUT_SECONDS=${DRAIN_TIMEOUT_SECONDS:-"0s"}
DRAIN_DELETE_EMPTYDIR_DATA=${DRAIN_DELETE_EMPTYDIR_DATA:-"false"}
ENABLE_AUTO_DRAIN=${ENABLE_AUTO_DRAIN:-"true"}
ENABLE_GPU_POD_EVICTION=${ENABLE_GPU_POD_EVICTION:-"true"}
NVDRAIN_DEBUG=${NVDRAIN_DEBUG:-"false"}
DRIVER_ROOT=/run/nvidia/driver
DRIVER_PID_FILE=/run/nvidia/nvidia-driver.pid
OPERATOR_NAMESPACE=${OPERATOR_NAMESPACE:-"gpu-operator-resources"}
PAUSED_STR="paused-for-driver-upgrade"
PLUGIN_DEPLOYED=""
GFD_DEPLOYED=""
DCGM_DEPLOYED=""
DCGM_EXPORTER_DEPLOYED=""
NVSM_DEPLOYED=""
TOOLKIT_DEPLOYED=""
VALIDATOR_DEPLOYED=""
MIG_MANAGER_DEPLOYED=""
SANDBOX_VALIDATOR_DEPLOYED=""
SANDBOX_PLUGIN_DEPLOYED=""
VGPU_DEVICE_MANAGER_DEPLOYED=""
NODE_LABEL_FOR_GPU_POD_EVICTION=$(echo ${NODE_LABEL_FOR_GPU_POD_EVICTION} | sed -e "s/=.*$//g")
CUSTOM_OPERAND_NODE_LABEL_VALUE=""
AUTO_UPGRADE_POLICY_ENABLED=""
GPU_DIRECT_RDMA_ENABLED=${GPU_DIRECT_RDMA_ENABLED:-"false"}
USE_HOST_MOFED=${USE_HOST_MOFED:-"false"}
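# Drain the node with 'kubectl drain', honoring the configured force,
# pod-selector, emptydir-data and timeout settings.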
_drain_k8s_node() {
echo "Draining node ${NODE_NAME}..."
kubectl drain ${NODE_NAME} --ignore-daemonsets=true --force=${DRAIN_USE_FORCE} --pod-selector=${DRAIN_POD_SELECTOR_LABEL} --delete-emptydir-data=${DRAIN_DELETE_EMPTYDIR_DATA} --timeout=${DRAIN_TIMEOUT_SECONDS}
if [ "$?" != "0" ]; then
return 1
fi
return 0
}
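# Drain only GPU pods from the node using the nvdrain utility.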
_nvdrain_k8s_node() {
echo "Draining node ${NODE_NAME} of any GPU pods..."
nvdrain --debug=${NVDRAIN_DEBUG} --node-name=${NODE_NAME} --kubeconfig=${KUBECONFIG} --force=${DRAIN_USE_FORCE} --delete-emptydir-data=${DRAIN_DELETE_EMPTYDIR_DATA} --timeout=${DRAIN_TIMEOUT_SECONDS}
if [ "$?" != "0" ]; then
return 1
fi
return 0
}
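# Automatic node drain is allowed only when the auto-upgrade policy is not
# managing the node and ENABLE_AUTO_DRAIN is 'true'.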
_is_auto_drain_enabled() {
if _is_driver_auto_upgrade_policy_enabled; then
echo "Auto drain of the node ${NODE_NAME} is disabled by the upgrade policy"
return 1
fi
# check env
if [ "${ENABLE_AUTO_DRAIN}" = "true" ]; then
return 0
fi
echo "Auto drain of the node ${NODE_NAME} is disabled"
return 1
}
_is_gpu_pod_eviction_enabled() {
if _is_driver_auto_upgrade_policy_enabled; then
echo "Auto eviction of GPU pods on node ${NODE_NAME} is disabled by the upgrade policy"
return 1
fi
# check env
if [ "${ENABLE_GPU_POD_EVICTION}" = "true" ]; then
return 0
fi
echo "Auto eviction of GPU pods on node ${NODE_NAME} is disabled"
return 1
}
_is_driver_auto_upgrade_policy_enabled() {
if [ "${AUTO_UPGRADE_POLICY_ENABLED}" = "true" ]; then
return 0
fi
echo "Auto upgrade policy of the GPU driver on the node ${NODE_NAME} is disabled"
return 1
}
# Only return 'paused-*' if the value passed in is != 'false'. It should only
# be 'false' if some external entity has forced it to this value, at which point
# we want to honor its existing value and not change it.
_maybe_set_paused() {
local current_value="${1}"
if [ "${current_value}" = "" ]; then
# disabled by user with empty value, retain it
echo ""
elif [ "${current_value}" = "false" ]; then
# disabled by user
echo "false"
elif [ "${current_value}" = "true" ]; then
# disable
echo "${PAUSED_STR}"
elif [[ "${current_value}" == *"${PAUSED_STR}"* ]]; then
# already added paused status for driver upgrade
echo "${current_value}"
else
# append paused status for driver upgrade
echo "${current_value}_${PAUSED_STR}"
fi
}
# Only return 'true' if the value passed in is != 'false'. It should only
# be 'false' if some external entity has forced it to this value, at which point
# we want to honor its existing value and not change it.
_maybe_set_true() {
local current_value="${1}"
if [ "${current_value}" = "false" ]; then
# disabled by user
echo "false"
elif [ "${current_value}" = "${PAUSED_STR}" ]; then
# enable the component
echo "true"
else
# revert back to original label
echo "${current_value}" | sed -r "s/${PAUSED_STR}//g" | tr -d "_"
fi
}
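# Read the 'nvidia.com/gpu-driver-upgrade-enabled' node annotation to determine
# whether the driver auto-upgrade policy manages upgrades on this node.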
_fetch_auto_upgrade_annotation() {
AUTO_UPGRADE_POLICY_ENABLED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.annotations.nvidia\.com/gpu-driver-upgrade-enabled}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu-driver-upgrade-enabled' annotation"
exit 1
fi
echo "Current value of AUTO_UPGRADE_POLICY_ENABLED=${AUTO_UPGRADE_POLICY_ENABLED}'"
}
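# Snapshot the current values of the 'nvidia.com/gpu.deploy.*' node labels (and
# of the optional custom operand label) so they can be paused and later restored
# across the driver restart.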
_fetch_current_labels() {
echo "Getting current value of the 'nvidia.com/gpu.deploy.operator-validator' node label"
VALIDATOR_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.operator-validator}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.operator-validator' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.operator-validator=${VALIDATOR_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.container-toolkit' node label"
TOOLKIT_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.container-toolkit}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.container-toolkit' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.container-toolkit=${TOOLKIT_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.device-plugin' node label"
PLUGIN_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.device-plugin}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.device-plugin' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.device-plugin=${PLUGIN_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.gpu-feature-discovery' node label"
GFD_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.gpu-feature-discovery}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.gpu-feature-discovery' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.gpu-feature-discovery=${GFD_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.dcgm-exporter' node label"
DCGM_EXPORTER_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.dcgm-exporter}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.dcgm-exporter' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.dcgm-exporter=${DCGM_EXPORTER_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.dcgm' node label"
DCGM_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.dcgm}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.dcgm' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.dcgm=${DCGM_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.mig-manager' node label"
MIG_MANAGER_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.mig-manager}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.mig-manager' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.mig-manager=${MIG_MANAGER_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.nvsm' node label"
NVSM_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.nvsm}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.nvsm' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.nvsm=${NVSM_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.sandbox-validator' node label"
SANDBOX_VALIDATOR_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.sandbox-validator}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.sandbox-validator' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.sandbox-validator=${SANDBOX_VALIDATOR_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.sandbox-device-plugin' node label"
SANDBOX_PLUGIN_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.sandbox-device-plugin}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.sandbox-device-plugin' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.sandbox-device-plugin=${SANDBOX_PLUGIN_DEPLOYED}'"
echo "Getting current value of the 'nvidia.com/gpu.deploy.vgpu-device-manager' node label"
VGPU_DEVICE_MANAGER_DEPLOYED=$(kubectl get nodes ${NODE_NAME} -o=jsonpath='{$.metadata.labels.nvidia\.com/gpu\.deploy\.vgpu-device-manager}')
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the 'nvidia.com/gpu.deploy.vgpu-device-manager' label"
exit 1
fi
echo "Current value of 'nvidia.com/gpu.deploy.vgpu-device-manager=${VGPU_DEVICE_MANAGER_DEPLOYED}'"
if [ "${NODE_LABEL_FOR_GPU_POD_EVICTION}" != "" ]; then
echo "Getting current value of the '${NODE_LABEL_FOR_GPU_POD_EVICTION}' node label used by custom operands"
LABEL_KEY=$(echo $NODE_LABEL_FOR_GPU_POD_EVICTION | sed -e "s/\./\\\./g")
JSON_LABEL_PATH="{$.metadata.labels.$LABEL_KEY}"
CUSTOM_OPERAND_NODE_LABEL_VALUE=$(kubectl get nodes ${NODE_NAME} -o=jsonpath=${JSON_LABEL_PATH})
if [ "${?}" != "0" ]; then
echo "Unable to get the value of the '${NODE_LABEL_FOR_GPU_POD_EVICTION}' label"
exit 1
fi
echo "Current value of '${NODE_LABEL_FOR_GPU_POD_EVICTION}=${CUSTOM_OPERAND_NODE_LABEL_VALUE}'"
fi
}
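# Evict only the components that must always be restarted across a driver
# restart (operator-validator, sandbox-validator, sandbox-device-plugin,
# vgpu-device-manager) by pausing their nodeSelector labels, then wait for
# their pods to terminate.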
_evict_required_gpu_operator_components() {
echo "Shutting GPU Operator components that must be restarted on driver restarts by disabling their component-specific nodeSelector labels"
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.operator-validator=$(_maybe_set_paused ${VALIDATOR_DEPLOYED}) \
nvidia.com/gpu.deploy.sandbox-validator=$(_maybe_set_paused ${SANDBOX_VALIDATOR_DEPLOYED}) \
nvidia.com/gpu.deploy.sandbox-device-plugin=$(_maybe_set_paused ${SANDBOX_PLUGIN_DEPLOYED}) \
nvidia.com/gpu.deploy.vgpu-device-manager=$(_maybe_set_paused ${VGPU_DEVICE_MANAGER_DEPLOYED})
if [ "$?" != "0" ]; then
return 1
fi
echo "Waiting for the operator-validator to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-operator-validator
if [ "${SANDBOX_VALIDATOR_DEPLOYED}" != "" ]; then
echo "Waiting for sandbox-validator to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-sandbox-validator
fi
if [ "${SANDBOX_PLUGIN_DEPLOYED}" != "" ]; then
echo "Waiting for sandbox-device-plugin to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-sandbox-device-plugin-daemonset
fi
if [ "${VGPU_DEVICE_MANAGER_DEPLOYED}" != "" ]; then
echo "Waiting for vgpu-device-manager to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-vgpu-device-manager
fi
return 0
}
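# Evict every GPU Operator component (and any custom operand selected by
# NODE_LABEL_FOR_GPU_POD_EVICTION) by pausing its nodeSelector label, then wait
# for the corresponding pods to terminate.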
_evict_all_gpu_operator_components() {
echo "Shutting down all GPU clients on the current node by disabling their component-specific nodeSelector labels"
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.operator-validator=$(_maybe_set_paused ${VALIDATOR_DEPLOYED}) \
nvidia.com/gpu.deploy.container-toolkit=$(_maybe_set_paused ${TOOLKIT_DEPLOYED}) \
nvidia.com/gpu.deploy.device-plugin=$(_maybe_set_paused ${PLUGIN_DEPLOYED}) \
nvidia.com/gpu.deploy.gpu-feature-discovery=$(_maybe_set_paused ${GFD_DEPLOYED}) \
nvidia.com/gpu.deploy.dcgm-exporter=$(_maybe_set_paused ${DCGM_EXPORTER_DEPLOYED}) \
nvidia.com/gpu.deploy.dcgm=$(_maybe_set_paused ${DCGM_DEPLOYED}) \
nvidia.com/gpu.deploy.nvsm=$(_maybe_set_paused ${NVSM_DEPLOYED}) \
nvidia.com/gpu.deploy.sandbox-device-plugin=$(_maybe_set_paused ${SANDBOX_PLUGIN_DEPLOYED}) \
nvidia.com/gpu.deploy.sandbox-validator=$(_maybe_set_paused ${SANDBOX_VALIDATOR_DEPLOYED}) \
nvidia.com/gpu.deploy.vgpu-device-manager=$(_maybe_set_paused ${VGPU_DEVICE_MANAGER_DEPLOYED})
if [ "$?" != "0" ]; then
return 1
fi
# check if a custom operand node selector label is provided for eviction
if [ "${CUSTOM_OPERAND_NODE_LABEL_VALUE}" != "" ]; then
echo "Shutting down GPU clients using node selector label ${NODE_LABEL_FOR_GPU_POD_EVICTION}=${CUSTOM_OPERAND_NODE_LABEL_VALUE}"
kubectl label --overwrite node ${NODE_NAME} \
${NODE_LABEL_FOR_GPU_POD_EVICTION}=$(_maybe_set_paused ${CUSTOM_OPERAND_NODE_LABEL_VALUE})
if [ "$?" != "0" ]; then
return 1
fi
fi
if [ "${MIG_MANAGER_DEPLOYED}" != "" ]; then
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.mig-manager=$(_maybe_set_paused ${MIG_MANAGER_DEPLOYED})
if [ "$?" != "0" ]; then
return 1
fi
fi
echo "Waiting for the operator-validator to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-operator-validator
echo "Waiting for the container-toolkit to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-container-toolkit-daemonset
echo "Waiting for the device-plugin to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-device-plugin-daemonset
echo "Waiting for gpu-feature-discovery to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=gpu-feature-discovery
echo "Waiting for dcgm-exporter to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-dcgm-exporter
echo "Waiting for dcgm to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-dcgm
if [ "${MIG_MANAGER_DEPLOYED}" != "" ]; then
echo "Waiting for mig-manager to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-mig-manager
fi
if [ "${SANDBOX_VALIDATOR_DEPLOYED}" != "" ]; then
echo "Waiting for sandbox-validator to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-sandbox-validator
fi
if [ "${SANDBOX_PLUGIN_DEPLOYED}" != "" ]; then
echo "Waiting for sandbox-device-plugin to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-sandbox-device-plugin-daemonset
fi
if [ "${VGPU_DEVICE_MANAGER_DEPLOYED}" != "" ]; then
echo "Waiting for vgpu-device-manager to shutdown"
kubectl wait --for=delete pod \
--timeout=5m \
--field-selector "spec.nodeName=${NODE_NAME}" \
-n ${OPERATOR_NAMESPACE} \
-l app=nvidia-vgpu-device-manager
fi
return 0
}
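# Cordon/uncordon helpers to toggle schedulability of the node around the
# driver restart.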
_cordon_k8s_node() {
echo "Cordoning node ${NODE_NAME}..."
kubectl cordon ${NODE_NAME}
if [ "$?" != "0" ]; then
return 1
fi
return 0
}
_uncordon_k8s_node() {
echo "Uncordoning node ${NODE_NAME}..."
kubectl uncordon ${NODE_NAME}
if [ "$?" != "0" ]; then
return 1
fi
return 0
}
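# Re-enable the component nodeSelector labels (and any custom operand label) so
# the GPU Operator components are rescheduled on the node.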
_reschedule_gpu_operator_components() {
echo "Rescheduling all GPU clients on the current node by enabling their component-specific nodeSelector labels"
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.operator-validator=$(_maybe_set_true ${VALIDATOR_DEPLOYED}) \
nvidia.com/gpu.deploy.container-toolkit=$(_maybe_set_true ${TOOLKIT_DEPLOYED}) \
nvidia.com/gpu.deploy.device-plugin=$(_maybe_set_true ${PLUGIN_DEPLOYED}) \
nvidia.com/gpu.deploy.gpu-feature-discovery=$(_maybe_set_true ${GFD_DEPLOYED}) \
nvidia.com/gpu.deploy.dcgm-exporter=$(_maybe_set_true ${DCGM_EXPORTER_DEPLOYED}) \
nvidia.com/gpu.deploy.dcgm=$(_maybe_set_true ${DCGM_DEPLOYED}) \
nvidia.com/gpu.deploy.nvsm=$(_maybe_set_true ${NVSM_DEPLOYED}) \
nvidia.com/gpu.deploy.sandbox-validator=$(_maybe_set_true ${SANDBOX_VALIDATOR_DEPLOYED}) \
nvidia.com/gpu.deploy.sandbox-device-plugin=$(_maybe_set_true ${SANDBOX_PLUGIN_DEPLOYED}) \
nvidia.com/gpu.deploy.vgpu-device-manager=$(_maybe_set_true ${VGPU_DEVICE_MANAGER_DEPLOYED})
if [ "$?" != "0" ]; then
return 1
fi
# check for custom operand node selector label
if [ "${CUSTOM_OPERAND_NODE_LABEL_VALUE}" != "" ]; then
echo "Restarting GPU clients using node selector label ${NODE_LABEL_FOR_GPU_POD_EVICTION}=${CUSTOM_OPERAND_NODE_LABEL_VALUE}"
kubectl label --overwrite node ${NODE_NAME} \
${NODE_LABEL_FOR_GPU_POD_EVICTION}=$(_maybe_set_true ${CUSTOM_OPERAND_NODE_LABEL_VALUE})
if [ "$?" != "0" ]; then
return 1
fi
fi
if [ "${MIG_MANAGER_DEPLOYED}" != "" ]; then
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.mig-manager=$(_maybe_set_true ${MIG_MANAGER_DEPLOYED})
if [ "$?" != "0" ]; then
return 1
fi
fi
return 0
}
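# Kernel module state helpers: check whether the nvidia or nouveau modules are
# loaded (via their refcnt files in /sys/module), and unload nouveau with rmmod
# if present.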
_driver_loaded() {
if [ -f /sys/module/nvidia/refcnt ]; then
return 0
fi
return 1
}
_nouveau_loaded() {
if [ -f /sys/module/nouveau/refcnt ]; then
return 0
fi
return 1
}
_unload_nouveau() {
if [ -f /sys/module/nouveau/refcnt ]; then
echo "Unloading nouveau driver..."
rmmod nouveau
if [ "$?" != "0" ]; then
echo "Failed to unload nouveau driver"
return 1
fi
fi
return 0
}
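# Exit helpers: uncordon the node (when eviction or drain is enabled) and
# reschedule operator components before exiting with failure or success.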
_exit_failed() {
# the commands below are a no-op if the node is already in the desired state
if _is_gpu_pod_eviction_enabled || _is_auto_drain_enabled; then
_uncordon_k8s_node
fi
_reschedule_gpu_operator_components
exit 1
}
_exit_success() {
# the commands below are a no-op if the node is already in the desired state
if _is_gpu_pod_eviction_enabled || _is_auto_drain_enabled; then
_uncordon_k8s_node
fi
_reschedule_gpu_operator_components
exit 0
}
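# Detect a driver pre-installed on the host by running nvidia-smi in a chroot
# of the host filesystem and checking that it reports a driver version.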
_host_driver() {
# check if driver is pre-installed on the host
if [ -f /host/usr/bin/nvidia-smi ] || [ -L /host/usr/bin/nvidia-smi ]; then
DRIVER_VERSION=$(chroot /host nvidia-smi --query-gpu=driver_version --format=csv,noheader)
if [ $? == 0 ] && [ ! -z "${DRIVER_VERSION}" ]; then
return 0
fi
fi
return 1
}
# Check if mellanox devices are present
_mellanox_devices_present() {
devices_found=0
for dev in /sys/bus/pci/devices/*; do
read vendor < $dev/vendor
if [ "$vendor" = "0x15b3" ]; then
echo "Mellanox device found at $(basename $dev)"
return 0
fi
done
echo "No Mellanox devices were found..."
return 1
}
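# GPUDirectRDMA handling is only required when GPU_DIRECT_RDMA_ENABLED is
# 'true' and a Mellanox device is present on the node.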
_gpu_direct_rdma_enabled() {
if [ "${GPU_DIRECT_RDMA_ENABLED}" = "true" ]; then
# check if mellanox cards are present
if _mellanox_devices_present; then
return 0
fi
fi
return 1
}
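# Block until MOFED is ready: when using the host MOFED, wait for the mlx5_core
# module; otherwise wait for the readiness flag created by the MOFED driver
# container.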
_wait_for_mofed_driver() {
# check for mlx5_core module to be loaded
cmd="lsmod | grep mlx5_core"
if [ "${USE_HOST_MOFED}" != "true" ]; then
# when the MOFED container is running, use the readiness flag set by the driver container instead
cmd="stat /run/mellanox/drivers/.driver-ready"
fi
until bash -c "${cmd}"
do
echo "Waiting for MOFED to be installed..."
sleep 5
done
}
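# Main uninstall flow: if a host driver is detected, disable the containerized
# driver and exit; otherwise evict operator components, optionally evict GPU
# pods and drain the node, unload the NVIDIA kernel modules, unmount the driver
# rootfs, unbind vfio-pci, and restore the node to its original state.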
uninstall_driver() {
# don't attempt to uninstall if the driver is pre-installed on the node
if _host_driver; then
echo "NVIDIA GPU driver is already pre-installed on the node, disabling the containerized driver on the node"
kubectl label --overwrite \
node ${NODE_NAME} \
nvidia.com/gpu.deploy.driver=pre-installed
# add wait here as pod termination can take 30s due to default grace-period
sleep 60
exit 1
fi
# fetch current status of all component labels
_fetch_current_labels
# fetch auto upgrade policy annotation
_fetch_auto_upgrade_annotation
# always evict all gpu-operator components across a driver restart
_evict_all_gpu_operator_components || _exit_failed
# delete any GPU pods running on the node
if _is_gpu_pod_eviction_enabled; then
_cordon_k8s_node
_nvdrain_k8s_node
if [ "$?" != "0" ]; then
# nvdrain will only return non-zero code when GPU pods
# are running on the node and cannot be deleted. Attempt
# to drain the node in this case.
echo "Failed to drain node of GPU pods"
if ! _is_auto_drain_enabled; then
echo "Cannot proceed until all GPU pods are drained from the node. Exiting."
_exit_failed
fi
echo "Attempting node drain"
trap '_exit_failed' ERR
_drain_k8s_node
_cleanup_driver
fi
fi
# check if driver is already loaded
if _driver_loaded; then
_cleanup_driver
if [ "$?" != "0" ]; then
if _is_auto_drain_enabled; then
echo "Unable to cleanup driver modules, attempting again with node drain..."
trap '_exit_failed' ERR
_drain_k8s_node
_cleanup_driver
else
echo "Failed to uninstall nvidia driver components"
_exit_failed
fi
fi
echo "Successfully uninstalled nvidia driver components"
fi
# If the vfio-pci driver is in use, ensure we unbind it from all devices.
# If the vfio-pci driver is not in use and we have reached this point, no
# devices will be bound to any driver, so the unbind operation below
# will be a no-op.
vfio-manage unbind --all
if [ "$?" != "0" ]; then
echo "Unable to unbind vfio-pci driver from all devices"
_exit_failed
fi
# when GPUDirectRDMA is enabled, wait until MOFED driver has finished installing
if _gpu_direct_rdma_enabled; then
echo "GPUDirectRDMA is enabled, validating MOFED driver installation"
_wait_for_mofed_driver
fi
if _is_gpu_pod_eviction_enabled || _is_auto_drain_enabled; then
# uncordon the node in case the pod restarted abruptly after we cordoned the node
_uncordon_k8s_node
fi
# always reschedule operator components in case the pod restarted abruptly after evicting pods
_reschedule_gpu_operator_components
# check for the nouveau driver and unload it before the nvidia driver installation
if _nouveau_loaded; then
_unload_nouveau
if [ "$?" != "0" ]; then
exit 1
fi
echo "Successfully unloaded nouveau driver"
fi
exit 0
}
preflight_check() {
# TODO: add checks for driver package availability for current kernel
# TODO: add checks for driver dependencies
# TODO: add checks for entitlements(OCP)
exit 0
}
# Unload the kernel modules if they are currently loaded.
_unload_driver() {
local rmmod_args=()
local nvidia_deps=0
local nvidia_refs=0
local nvidia_uvm_refs=0
local nvidia_modeset_refs=0
local nvidia_peermem_refs=0
local nvidia_vgpu_vfio_refs=0
local nvidia_fs_refs=0
local gdrdrv_refs=0
echo "Unloading NVIDIA driver kernel modules..."
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
rmmod_args+=("nvidia-modeset")
((++nvidia_deps))
fi
if [ -f /sys/module/nvidia_uvm/refcnt ]; then
nvidia_uvm_refs=$(< /sys/module/nvidia_uvm/refcnt)
rmmod_args+=("nvidia-uvm")
((++nvidia_deps))
fi
if [ -f /sys/module/nvidia_peermem/refcnt ]; then
nvidia_peermem_refs=$(< /sys/module/nvidia_peermem/refcnt)
rmmod_args+=("nvidia-peermem")
((++nvidia_deps))
fi
if [ -f /sys/module/nvidia_fs/refcnt ]; then
nvidia_fs_refs=$(< /sys/module/nvidia_fs/refcnt)
rmmod_args+=("nvidia-fs")
((++nvidia_deps))
fi
if [ -f /sys/module/nvidia_vgpu_vfio/refcnt ]; then
nvidia_vgpu_vfio_refs=$(< /sys/module/nvidia_vgpu_vfio/refcnt)
rmmod_args+=("nvidia_vgpu_vfio")
((++nvidia_deps))
fi
if [ -f /sys/module/gdrdrv/refcnt ]; then
gdrdrv_refs=$(< /sys/module/gdrdrv/refcnt)
rmmod_args+=("gdrdrv")
((++nvidia_deps))
fi
if [ -f /sys/module/nvidia/refcnt ]; then
nvidia_refs=$(< /sys/module/nvidia/refcnt)
rmmod_args+=("nvidia")
fi
if [ ${nvidia_refs} -gt ${nvidia_deps} ] && [ ${nvidia_refs} -eq 2 ] && [[ " ${rmmod_args[*]} " =~ " nvidia_vgpu_vfio " ]]; then
# When cleaning up a previous vGPU Manager install, it is possible $nvidia_refs > $nvidia_deps
# since nvidia_vgpu_vfio holds 2 handles on the driver. Catch this case and proceed to unload driver modules.
# $ lsmod | grep nvidia
# nvidia_vgpu_vfio 53248 0
# nvidia 35315712 2
# mdev 24576 2 vfio_mdev,nvidia_vgpu_vfio
:
elif [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ] || [ ${nvidia_peermem_refs} -gt 0 ] || [ ${nvidia_vgpu_vfio_refs} -gt 0 ] || [ ${nvidia_fs_refs} -gt 0 ] || [ ${gdrdrv_refs} -gt 0 ]; then
# run lsmod to debug module usage
lsmod | grep nvidia
echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
return 1
fi
if [ ${#rmmod_args[@]} -gt 0 ]; then
rmmod "${rmmod_args[@]}"
if [ "$?" != "0" ]; then
return 1
fi
fi
return 0
}
# Unmount the driver rootfs from the run directory.
_unmount_rootfs() {
echo "Unmounting NVIDIA driver rootfs..."
if findmnt -r -o TARGET | grep "${DRIVER_ROOT}" > /dev/null; then
umount -l -R ${DRIVER_ROOT}
return 0
fi
return 1
}
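# Tear the containerized driver down: unload the kernel modules, unmount the
# driver rootfs, and remove the driver pid file.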
_cleanup_driver() {
_unload_driver
if [ "$?" != "0" ]; then
return 1
fi
_unmount_rootfs
if [ "$?" != "0" ]; then
return 1
fi
if [ -f ${DRIVER_PID_FILE} ]; then
rm -f ${DRIVER_PID_FILE}
fi
return 0
}
usage() {
cat >&2 <<EOF
Usage: $0 COMMAND [ARG...]
Commands:
uninstall_driver
preflight_check
EOF
exit 1
}
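# Entrypoint: require a command argument, validate it against the supported
# commands, and dispatch to it.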
if [ $# -eq 0 ]; then
usage
fi
command=$1; shift
case "${command}" in
uninstall_driver) ;;
preflight_check) ;;
*) usage ;;
esac
if [ $? -ne 0 ]; then
usage
fi
$command