#!/usr/bin/env bash
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================

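# Builds the XLA ROCm (AMD GPU) backend and runs the GPU test suite with Bazel.
# An optional first positional argument overrides the ROCm install directory
# (otherwise ROCM_PATH, or /opt/rocm-6.1.0, is used).

# Fail fast on the first error and echo every command as it runs.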
set -e
set -x

N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
# If rocm-smi exists locally (it should), use it to find
# out how many GPUs we have to test with. With `set -e` in effect, a failing
# rocm-smi has to be handled in the `if` condition itself, otherwise the
# script would abort before the single-GPU fallback could run.
if rocm-smi -i; then
  TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep -c 'GPU')
else
  TF_GPU_COUNT=1
fi
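# Run one test at a time per GPU; the number of concurrent test jobs handed to
# Bazel is GPUs x tests-per-GPU.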
TF_TESTS_PER_GPU=1
N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})

echo ""
echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s)."
echo ""

# First positional argument (if any) specifies the ROCM_INSTALL_DIR
if [[ -n $1 ]]; then
  ROCM_INSTALL_DIR=$1
else
  if [[ -z "${ROCM_PATH}" ]]; then
    ROCM_INSTALL_DIR=/opt/rocm-6.1.0
  else
    ROCM_INSTALL_DIR=$ROCM_PATH
  fi
fi

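# Point the build at python3 and the chosen ROCm install.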
export PYTHON_BIN_PATH=$(which python3)
PYTHON_VERSION=$(python3 -c "import sys;print(f'{sys.version_info.major}.{sys.version_info.minor}')")
export TF_PYTHON_VERSION=$PYTHON_VERSION
export TF_NEED_ROCM=1
export ROCM_PATH=$ROCM_INSTALL_DIR
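# Only build and run targets tagged for AMD GPUs; exclude NVIDIA-only, no_rocm,
# and OSS-excluded tests.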
TAGS_FILTER="gpu,requires-gpu-amd,-requires-gpu-nvidia,-no_oss,-oss_excluded,-oss_serial,-no_gpu,-no_rocm"
UNSUPPORTED_GPU_TAGS="$(echo -requires-gpu-sm{60,70,80,86,89,90}{,-only})"
TAGS_FILTER="${TAGS_FILTER},${UNSUPPORTED_GPU_TAGS// /,}"
if [ -f /usertools/rocm.bazelrc ]; then
  # Use the bazelrc files in /usertools if available
  if [ ! -d /tf ]; then
    # The bazelrc files in /usertools expect /tf to exist
    mkdir /tf
  fi

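  # Containerized CI path: /usertools/rocm.bazelrc supplies the cache and ROCm
  # configs used below. The parallel_gpu_execute wrapper reads TF_GPU_COUNT and
  # TF_TESTS_PER_GPU to spread the concurrent test jobs across the GPUs.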
  bazel \
    --bazelrc=/usertools/rocm.bazelrc \
    test \
    --config=sigbuild_local_cache \
    --config=rocm \
    --config=xla_cpp \
    --build_tag_filters=${TAGS_FILTER} \
    --test_tag_filters=${TAGS_FILTER} \
    --keep_going \
    --test_output=errors \
    --local_test_jobs=${N_TEST_JOBS} \
    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
    --repo_env=HERMETIC_PYTHON_VERSION=3.11 \
    --action_env=XLA_FLAGS="--xla_gpu_force_compilation_parallelism=16 --xla_gpu_enable_llvm_module_compilation_parallelism=true" \
    --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute \
    -- //xla/...
else

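  # No /usertools/rocm.bazelrc available: answer configure.py's prompts with
  # defaults and rely on the in-tree bazelrc for the ROCm configuration.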
  yes "" | $PYTHON_BIN_PATH configure.py
  bazel \
    test \
    --keep_going \
    --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,gpu,requires-gpu,-no_gpu,-no_rocm \
    --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,gpu,requires-gpu,-no_gpu,-no_rocm \
    --config=rocm \
    --test_output=errors \
    --local_test_jobs=${N_TEST_JOBS} \
    --test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
    --test_env=TF_GPU_COUNT=$TF_GPU_COUNT \
    --repo_env=HERMETIC_PYTHON_VERSION=3.11 \
    --action_env=XLA_FLAGS="--xla_gpu_force_compilation_parallelism=16 --xla_gpu_enable_llvm_module_compilation_parallelism=true" \
    --run_under=//tools/ci_build/gpu_build:parallel_gpu_execute \
    -- //xla/...
fi