From 114eff9f9bbf98924b4a1f1151f3ed6bc04a4ac9 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Thu, 4 Dec 2025 23:19:22 -0800 Subject: [PATCH] [Not Ready] Let CUDA and ROCm read different loss results As title. --- .github/workflows/integration_test_8gpu_features.yaml | 4 ++-- tests/assets/losses/{llama3.txt => cuda_llama3.txt} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename tests/assets/losses/{llama3.txt => cuda_llama3.txt} (100%) diff --git a/.github/workflows/integration_test_8gpu_features.yaml b/.github/workflows/integration_test_8gpu_features.yaml index a20cd22545..0c03b9ea63 100644 --- a/.github/workflows/integration_test_8gpu_features.yaml +++ b/.github/workflows/integration_test_8gpu_features.yaml @@ -90,11 +90,11 @@ jobs: sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded" sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded" - # Verify the accuracy first. + # Verify the accuracy first (only for CUDA). echo "Checking FSDP8 v.s. HSDP (4, 2) accuracy parity" export baseline_options="--parallelism.data_parallel_replicate_degree=1" export test_options="--parallelism.data_parallel_replicate_degree=4" - python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --import-result tests/assets/losses/llama3.txt + python3 scripts/loss_compare.py . . 
--baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps=10 --export-result ${RUNNER_TEMP}/artifacts-to-be-uploaded/loss_result.txt rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/* python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8 diff --git a/tests/assets/losses/llama3.txt b/tests/assets/losses/cuda_llama3.txt similarity index 100% rename from tests/assets/losses/llama3.txt rename to tests/assets/losses/cuda_llama3.txt