Skip to content

Commit 470cd3d

Browse files
committed
[Not Ready] Enable Async TP CI
NVLS seems to be one possible cause of the async TP CI issue. This PR tries to verify that.
ghstack-source-id: 0e31f15
Pull-Request: #2125
1 parent 575674a commit 470cd3d

File tree

3 files changed: 4 additions (+4) and 6 deletions (-6).

run_train.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}
2020
COMM_MODE=${COMM_MODE:-""}
2121

2222
TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
23+
export NCCL_NVLS_ENABLE=0
24+
export NVSHMEM_DISABLE_NVLS=0
25+
export TORCH_SHOW_CPP_STACKTRACES=1
26+
export TORCH_CPP_LOG_LEVEL=INFO
2327

2428
if [ -n "$COMM_MODE" ]; then
2529
# Communication mode specified: validate configuration or run in debug mode

tests/integration_tests/features.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
7878
"2D compile",
7979
"2d_compile",
8080
),
81-
# TODO: re-enable this test once the async TP CI issue is fixed
8281
OverrideDefinitions(
8382
[
8483
[
@@ -89,7 +88,6 @@ def build_features_test_list() -> list[OverrideDefinitions]:
8988
],
9089
"2D async TP compile",
9190
"2d_asynctp_compile",
92-
disabled=True,
9391
),
9492
OverrideDefinitions(
9593
[

tests/integration_tests/h100.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
1919
same root config file.
2020
"""
2121
integration_tests_flavors = [
22-
# TODO: re-enable this test once the async TP issue is fixed
2322
OverrideDefinitions(
2423
[
2524
[
@@ -30,7 +29,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
3029
],
3130
"2D async TP compile",
3231
"2d_asynctp_compile",
33-
disabled=True,
3432
),
3533
OverrideDefinitions(
3634
[
@@ -43,7 +41,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
4341
"Float8 test",
4442
"float8",
4543
),
46-
# TODO: re-enable this test once the async TP issue is fixed
4744
OverrideDefinitions(
4845
[
4946
[
@@ -60,7 +57,6 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
6057
"FSDP+async TP+PP+torch.compile+Float8",
6158
"fsdp+tp+cp+compile+float8",
6259
ngpu=8,
63-
disabled=True,
6460
),
6561
OverrideDefinitions(
6662
[

0 commit comments

Comments
 (0)