Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions build_tools/rocm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,33 @@ filegroup(
)

genrule(
name = "san_wrapper_script",
srcs = [":sanitizer_ignore_lists"],
outs = ["san_wrapper.sh"],
name = "exclusive_wrapper_script",
outs = ["exclusive_wrapper.sh"],
cmd = """
echo '#!/bin/bash' > $@
echo 'exec "$$@"' >> $@
echo 'exec {lock_fd}>/var/lock/gpulock || exit 1' >> $@
echo 'flock "$$lock_fd"' >> $@
echo '"$$@"' >> $@
echo 'return_code=$$?' >> $@
echo 'flock -u "$$lock_fd"' >> $@
echo 'exit $$return_code' >> $@
chmod +x $@
""",
)

# Executable wrapper around the script produced by :exclusive_wrapper_script.
# That script takes an flock on /var/lock/gpulock before running the wrapped
# command and releases the lock once the command has finished.
sh_binary(
name = "exclusive_local_wrapper",
srcs = [":exclusive_wrapper_script"],
visibility = ["//visibility:public"],
)

# This wrapper ensures that the test target
# takes into account any changes in the ignore list files.
sh_binary(
name = "sanitizer_wrapper",
srcs = [":san_wrapper_script"],
srcs = [":exclusive_wrapper_script"],
data = [":sanitizer_ignore_lists"],
visibility = ["//visibility:public"],
)
1 change: 1 addition & 0 deletions build_tools/rocm/lsan_ignore_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ leak:libstdc++.so
leak:libamdhip64.so
leak:libhiprtc.so
leak:librccl.so
leak:hwloc_bitmap_alloc
16 changes: 0 additions & 16 deletions build_tools/rocm/platform/BUILD

This file was deleted.

31 changes: 0 additions & 31 deletions build_tools/rocm/platform/linux_x64/BUILD

This file was deleted.

15 changes: 9 additions & 6 deletions build_tools/rocm/rocm_xla.bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,22 @@ build:rocm_dev --remote_cache="https://wardite.cluster.engflow.com"

build:rocm_rbe --bes_backend="grpcs://wardite.cluster.engflow.com"
build:rocm_rbe --bes_results_url="https://wardite.cluster.engflow.com/invocation/"
build:rocm_rbe --host_platform="//build_tools/rocm/platform/linux_x64:linux_x64_gpu"
build:rocm_rbe --extra_execution_platforms="//build_tools/rocm/platform/linux_x64:linux_x64_gpu"
build:rocm_rbe --platforms="//build_tools/rocm/platform/linux_x64:linux_x64_gpu"
build:rocm_rbe --host_platform="@local_config_rocm//rocm:linux_x64"
build:rocm_rbe --extra_execution_platforms="@local_config_rocm//rocm:linux_x64"
build:rocm_rbe --platforms="@local_config_rocm//rocm:linux_x64"
build:rocm_rbe --bes_timeout=600s
build:rocm_rbe --tls_client_certificate="/tf/certificates/ci-cert.crt"
build:rocm_rbe --tls_client_key="/tf/certificates/ci-cert.key"
build:rocm_rbe --remote_executor="grpcs://wardite.cluster.engflow.com"
build:rocm_rbe --remote_cache="grpcs://wardite.cluster.engflow.com"
build:rocm_rbe --spawn_strategy=local
build:rocm_rbe --spawn_strategy=remote,local
build:rocm_rbe --jobs=200
build:rocm_rbe --remote_timeout=3600
build:rocm_rbe --remote_download_minimal
build:rocm_rbe --remote_upload_local_results
build:rocm_rbe --grpc_keepalive_time=30s

test:rocm_rbe --strategy=TestRunner=local
test:rocm_rbe --strategy=TestRunner=remote,local

build:asan --strip=never
build:asan --copt -fsanitize=address
Expand Down Expand Up @@ -62,7 +63,9 @@ build:xla_sgpu -- \
-//xla/pjrt/distributed:topology_util_test \
-//xla/pjrt/distributed:client_server_test \
-//xla/service/gpu/tests:dynamic_shared_memory_test_amdgpu_any \
-//xla/service/gpu/tests:gpu_cub_sort_test_amdgpu_any
-//xla/service/gpu/tests:gpu_cub_sort_test_amdgpu_any \
-//xla/tests:iota_test_amdgpu_any \
-//xla/tests:reduce_window_test_amdgpu_any # TODO: return when it is not flaky!

test:xla_mgpu -- \
//xla/tests:collective_ops_e2e_test \
Expand Down
11 changes: 8 additions & 3 deletions build_tools/rocm/run_jax_ut.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,28 @@
set -e

JAX_DIR=$1
XLA_DIR=$2
XLA_DIR="/tf/xla" # TODO: later use argument passed from CI job

pushd $JAX_DIR

python build/build.py build \
python3 build/build.py build \
--wheels=jax-rocm-plugin \
--configure_only \
--local_xla_path=${XLA_DIR} \
--python_version=3.12

# TODO: run the tests when they are green
bazel build \
bazel --bazelrc=${XLA_DIR}/build_tools/rocm/rocm_xla.bazelrc test \
--config=rocm \
--config=rocm_rbe \
--disk_cache=/tf/disk_cache/jaxlib-v0.7.1 \
--build_tag_filters=cpu,gpu,-tpu,-config-cuda-only \
--test_tag_filters=cpu,gpu,-tpu,-config-cuda-only \
--action_env=TF_ROCM_AMDGPU_TARGETS=gfx908,gfx90a,gfx942 \
--test_timeout=920,2400,7200,9600 \
--//jax:build_jaxlib=true \
--run_under=@xla//build_tools/rocm:exclusive_local_wrapper \
--action_env=REMOTE_GPU_TESTING=1 \
"//tests/..."

popd
23 changes: 16 additions & 7 deletions build_tools/rocm/run_xla.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ N_BUILD_JOBS=$(grep -c ^processor /proc/cpuinfo)
rocm-smi -i
STATUS=$?
if [ $STATUS -ne 0 ]; then TF_GPU_COUNT=1; else
TF_GPU_COUNT=$(rocm-smi -i|grep 'Device ID' |grep 'GPU' |wc -l)
TF_GPU_COUNT=$(rocm-smi -i | grep 'Device ID' | grep 'GPU' | wc -l)
fi
TF_TESTS_PER_GPU=1
N_TEST_JOBS=$(expr ${TF_GPU_COUNT} \* ${TF_TESTS_PER_GPU})
amdgpuname=(`rocminfo | grep gfx | head -n 1`)
amdgpuname=($(rocminfo | grep gfx | head -n 1))
AMD_GPU_GFX_ID=${amdgpuname[1]}
echo ""
echo "Bazel will use ${N_BUILD_JOBS} concurrent build job(s) and ${N_TEST_JOBS} concurrent test job(s) for gpu ${AMD_GPU_GFX_ID}."
echo ""

export PYTHON_BIN_PATH=`which python3`
export PYTHON_BIN_PATH=$(which python3)
export TF_NEED_ROCM=1
export ROCM_PATH="/opt/rocm"

Expand Down Expand Up @@ -99,12 +99,13 @@ BAZEL_DISK_CACHE_SIZE=100G
BAZEL_DISK_CACHE_DIR="/tf/disk_cache/rocm-jaxlib-v0.7.1"
mkdir -p ${BAZEL_DISK_CACHE_DIR}
if [ ! -d /tf/pkg ]; then
mkdir -p /tf/pkg
mkdir -p /tf/pkg
fi

SCRIPT_DIR=$(realpath $(dirname $0))
TAG_FILTERS=$($SCRIPT_DIR/rocm_tag_filters.sh),-multigpu,-multi_gpu_h100,requires-gpu-amd,-skip_rocprofiler_sdk,-no_oss,-oss_excluded,-oss_serial

RBE_OPTIONS=()
SANITIZER_ARGS=()
if [[ $1 == "asan" ]]; then
SANITIZER_ARGS+=("--config=asan")
Expand All @@ -121,6 +122,12 @@ elif [[ $1 == "tsan" ]]; then
HostExecuteStartThunkTest*
HostExecuteDoneThunkTest*
)

# TSan tests appear to be flaky under RBE because of the heavy load,
# so force them to run locally.
RBE_OPTIONS+=(
--strategy=TestRunner=local
)
shift
fi

Expand All @@ -139,16 +146,18 @@ bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
--flaky_test_attempts=3 \
--keep_going \
--local_test_jobs=${N_TEST_JOBS} \
--repo_env=TF_ROCM_AMDGPU_TARGETS=gfx908,gfx90a,gfx942,gfx1100 \
--test_env=TF_TESTS_PER_GPU=$TF_TESTS_PER_GPU \
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
--run_under=//build_tools/ci:parallel_gpu_execute \
--test_env=MIOPEN_FIND_ENFORCE=5 \
--test_env=MIOPEN_FIND_MODE=1 \
--test_filter=-$(IFS=: ; echo "${EXCLUDED_TESTS[*]}") \
"${SANITIZER_ARGS[@]}" \
"$@"
"$@" \
"${RBE_OPTIONS[@]}"

# clean up bazel disk_cache
bazel shutdown \
--disk_cache=${BAZEL_DISK_CACHE_DIR} \
--experimental_disk_cache_gc_max_size=${BAZEL_DISK_CACHE_SIZE}
--disk_cache=${BAZEL_DISK_CACHE_DIR} \
--experimental_disk_cache_gc_max_size=${BAZEL_DISK_CACHE_SIZE}
33 changes: 16 additions & 17 deletions build_tools/rocm/run_xla_multi_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,17 @@ if [ ! -d /tf/pkg ]; then
fi

EXCLUDED_TESTS=(
CollectiveOpsTestE2E.MemcpyP2pLargeMessage
RaggedAllToAllTest/RaggedAllToAllTest.RaggedAllToAll_8GPUs_2ReplicasPerGroups/sync_decomposer
RaggedAllToAllTest/RaggedAllToAllTest.RaggedAllToAll_8GPUs_2ReplicasPerGroups/async_decomposer
# //xla/tests:collective_ops_test_amdgpu_any
RaggedAllToAllTest*
AsyncCollectiveOps*
AsyncMemcpyCollectiveOps*
CollectiveOpsTest*
AllReduceTest*
Fp8CollectiveOpsTest*
# //xla/backends/gpu/codegen/triton:fusion_emitter_parametrized_legacy_test_amdgpu_any
ElementwiseTestSuiteF32/BinaryElementwiseTest.ElementwiseFusionExecutesCorrectly/f32_atan2
# //xla/tests:collective_ops_e2e_test_amdgpu_any
CollectiveOpsTestE2E.MemcpyP2pLargeMessage
CollectiveOpsTestE2EPipelinedNonPipelined.CollectivePipelinerBackward
CollectiveOpsTestE2EPipelinedNonPipelined.CollectivePipelinerBackwardStartFromOne
# //xla/tools/multihost_hlo_runner:functional_hlo_runner_test
Expand All @@ -76,6 +81,7 @@ EXCLUDED_TESTS=(
SCRIPT_DIR=$(realpath $(dirname $0))
TAG_FILTERS="$($SCRIPT_DIR/rocm_tag_filters.sh)"

RBE_OPTIONS=()
SANITIZER_ARGS=()
if [[ $1 == "asan" ]]; then
SANITIZER_ARGS+=("--run_under=//build_tools/rocm:sanitizer_wrapper")
Expand All @@ -86,18 +92,10 @@ elif [[ $1 == "tsan" ]]; then
SANITIZER_ARGS+=("--run_under=//build_tools/rocm:sanitizer_wrapper")
SANITIZER_ARGS+=("--config=tsan")
TAG_FILTERS="$TAG_FILTERS,-notsan"
# excluded from tsan
EXCLUDED_TESTS+=(
CollectiveOpsTest*
Fp8CollectiveOpsTest.AllGather_8BitFloat
Fp8CollectiveOpsTest.CollectivePermute_8BitFloat
Fp8CollectiveOpsTest.AllToAll_8BitFloat
AsyncCollectiveOps*
AllReduceTest*
RaggedAllToAllTest*
AsyncCollectiveOps*
AsyncMemcpyCollectiveOps*
RaggedAllToAllTest*
# TSan tests appear to be flaky under RBE because of the heavy load,
# so force them to run locally.
RBE_OPTIONS+=(
--strategy=TestRunner=local
)
shift
fi
Expand All @@ -116,14 +114,15 @@ bazel --bazelrc=build_tools/rocm/rocm_xla.bazelrc test \
--test_output=errors \
--flaky_test_attempts=3 \
--keep_going \
--test_strategy=exclusive \
--run_under=//build_tools/rocm:exclusive_local_wrapper \
--repo_env=TF_ROCM_AMDGPU_TARGETS=gfx908,gfx90a,gfx942,gfx1100 \
--action_env=XLA_FLAGS="--xla_gpu_enable_llvm_module_compilation_parallelism=true --xla_gpu_force_compilation_parallelism=16" \
--action_env=NCCL_MAX_NCHANNELS=1 \
--test_filter=-$(IFS=: ; echo "${EXCLUDED_TESTS[*]}") \
"${SANITIZER_ARGS[@]}" \
"$@" \
--strategy=TestRunner=local # execute multigpu tests locally as there is no gpu exclusive protection on rbe
"${RBE_OPTIONS[@]}"

# clean up bazel disk_cache
bazel shutdown \
Expand Down
Loading