Skip to content

Commit 82f0287

Browse files
authored
[CI] Disable Async TP CI (#1756)
Async TP related CI started to fail on Sep 22, 2025. However, even after rolling back the nightly PyTorch to 0919, the tests still failed.

```
python -m pip install --force-reinstall torch==2.10.0.dev20250917+cu126 --index-url https://download.pytorch.org/whl/nightly/cu126
```

This is not an async TP issue but a symmetric memory issue. The following single line is enough to cause failures on the CI machine/docker:

```
symm_mem = get_symm_mem_workspace(torch.distributed.group.WORLD.group_name, min_size=1024*1024*64)
```
1 parent 0943771 commit 82f0287

File tree

5 files changed

+44
-33
lines changed

5 files changed

+44
-33
lines changed

tests/integration_tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ class OverrideDefinitions:
2222
test_descr: str = "default"
2323
test_name: str = "default"
2424
ngpu: int = 4
25+
disabled: bool = False
2526

2627
def __repr__(self):
2728
return self.test_descr

tests/integration_tests/features.py

Lines changed: 23 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -65,17 +65,18 @@ def build_features_test_list() -> list[OverrideDefinitions]:
6565
"2d_compile",
6666
),
6767
# TODO: re-enable this test once the async TP CI issue is fixed
68-
# OverrideDefinitions(
69-
# [
70-
# [
71-
# "--compile.enable",
72-
# "--parallelism.tensor_parallel_degree 2",
73-
# "--parallelism.enable_async_tensor_parallel",
74-
# ],
75-
# ],
76-
# "2D async TP compile",
77-
# "2d_asynctp_compile",
78-
# ),
68+
OverrideDefinitions(
69+
[
70+
[
71+
"--compile.enable",
72+
"--parallelism.tensor_parallel_degree 2",
73+
"--parallelism.enable_async_tensor_parallel",
74+
],
75+
],
76+
"2D async TP compile",
77+
"2d_asynctp_compile",
78+
disabled=True,
79+
),
7980
OverrideDefinitions(
8081
[
8182
[
@@ -432,16 +433,17 @@ def build_features_test_list() -> list[OverrideDefinitions]:
432433
"cpu_offload+opt_in_bwd+TP+DP+CP",
433434
ngpu=8,
434435
),
435-
# OverrideDefinitions(
436-
# [
437-
# [
438-
# "--memory_estimation.enable",
439-
# ]
440-
# ],
441-
# "FSDP2 Memory Tracking and Estimation",
442-
# "fsdp2_memory_estimation",
443-
# ngpu=2,
444-
# ),
436+
OverrideDefinitions(
437+
[
438+
[
439+
"--memory_estimation.enable",
440+
]
441+
],
442+
"FSDP2 Memory Tracking and Estimation",
443+
"fsdp2_memory_estimation",
444+
ngpu=2,
445+
disabled=True,
446+
),
445447
OverrideDefinitions(
446448
[
447449
[

tests/integration_tests/h100.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
1919
same root config file.
2020
"""
2121
integration_tests_flavors = [
22+
# TODO: re-enable this test once the async TP issue is fixed
2223
OverrideDefinitions(
2324
[
2425
[
@@ -29,6 +30,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
2930
],
3031
"2D async TP compile",
3132
"2d_asynctp_compile",
33+
disabled=True,
3234
),
3335
OverrideDefinitions(
3436
[
@@ -41,6 +43,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
4143
"Float8 test",
4244
"float8",
4345
),
46+
# TODO: re-enable this test once the async TP issue is fixed
4447
OverrideDefinitions(
4548
[
4649
[
@@ -57,6 +60,7 @@ def build_h100_tests_list() -> list[OverrideDefinitions]:
5760
"FSDP+async TP+PP+torch.compile+Float8",
5861
"fsdp+tp+cp+compile+float8",
5962
ngpu=8,
63+
disabled=True,
6064
),
6165
OverrideDefinitions(
6266
[

tests/integration_tests/run_tests.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,9 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
8484
if args.test_name != "all" and test_flavor.test_name != args.test_name:
8585
continue
8686

87+
if test_flavor.disabled:
88+
continue
89+
8790
# Check if we have enough GPUs
8891
if args.ngpu < test_flavor.ngpu:
8992
logger.info(

torchtitan/experiments/simple_fsdp/tests/integration_tests.py

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -63,18 +63,19 @@ def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
6363
"2d",
6464
),
6565
# TODO: re-enable this test once the async TP issue is fixed
66-
# OverrideDefinitions(
67-
# [
68-
# [
69-
# "--model.name simple_fsdp",
70-
# "--compile.enable",
71-
# "--parallelism.tensor_parallel_degree 2",
72-
# "--parallelism.enable_async_tensor_parallel",
73-
# ],
74-
# ],
75-
# "2D async TP",
76-
# "2d_asynctp",
77-
# ),
66+
OverrideDefinitions(
67+
[
68+
[
69+
"--model.name simple_fsdp",
70+
"--compile.enable",
71+
"--parallelism.tensor_parallel_degree 2",
72+
"--parallelism.enable_async_tensor_parallel",
73+
],
74+
],
75+
"2D async TP",
76+
"2d_asynctp",
77+
disabled=True,
78+
),
7879
OverrideDefinitions(
7980
[
8081
[

0 commit comments

Comments
 (0)