Skip to content

Commit ff34fc2

Browse files
committed
add CI to guard compiler optimization passes
1 parent 1b9cfda commit ff34fc2

File tree

4 files changed

+285
-30
lines changed

4 files changed

+285
-30
lines changed

.github/workflows/integration_test_8gpu_simple_fsdp.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,11 @@ jobs:
5050
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
5151
5252
mkdir artifacts-to-be-uploaded
53-
python -m torchtitan.experiments.simple_fsdp.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
53+
# Run front-end integration tests of SimpleFSDP
54+
python -m torchtitan.experiments.simple_fsdp.tests.frontend_integration_tests artifacts-to-be-uploaded --ngpu 8
55+
56+
# Run backend pass integration tests of SimpleFSDP
57+
python -m torchtitan.experiments.simple_fsdp.tests.compiler_pass_integration_tests artifacts-to-be-uploaded --ngpu 8 --comm_mode local_tensor
5458
5559
# Run the numerics unit tests of SimpleFSDP
5660
torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py -v

tests/integration_tests/run_tests.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,23 @@ def _run_cmd(cmd):
2929
return subprocess.run([cmd], text=True, shell=True)
3030

3131

32-
def run_single_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
32+
def run_single_test(
33+
test_flavor: OverrideDefinitions, full_path: str, output_dir: str, comm_mode: str
34+
):
3335
# run_test supports sequence of tests.
3436
test_name = test_flavor.test_name
3537
dump_folder_arg = f"--job.dump_folder {output_dir}/{test_name}"
3638

3739
all_ranks = ",".join(map(str, range(test_flavor.ngpu)))
3840

3941
for idx, override_arg in enumerate(test_flavor.override_args):
40-
cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh"
42+
if comm_mode == "default":
43+
cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_train.sh"
44+
elif comm_mode in ["fake_backend", "local_tensor"]:
45+
cmd = f"NGPU={test_flavor.ngpu} LOCAL_RANK=0 python3 -m torchtitan.train --job.config_file {full_path} --comm.mode={comm_mode} --training.steps=1"
46+
else:
47+
raise ValueError("Unsupported comm mode")
48+
4149
# dump compile trace for debugging purpose
4250
cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd
4351

@@ -102,14 +110,22 @@ def run_tests(args, test_list: list[OverrideDefinitions]):
102110
f" because --ngpu arg is {args.ngpu}"
103111
)
104112
else:
105-
run_single_test(test_flavor, args.config_path, args.output_dir)
113+
run_single_test(
    test_flavor,
    args.config_path,
    args.output_dir,
    # Entry points whose argument parser does not define comm_mode would
    # otherwise fail here with AttributeError; fall back to "default",
    # which selects the regular ./run_train.sh launch path.
    getattr(args, "comm_mode", "default"),
)
106116

107117

108118
def main():
109119
parser = argparse.ArgumentParser()
110120
parser.add_argument(
111121
"output_dir", help="Directory to dump results generated by tests"
112122
)
123+
# Registered as an optional flag: argparse ignores `default` on a plain
# positional (it stays required unless nargs="?" is given), so the
# positional form made every invocation without a comm mode fail.
parser.add_argument(
    "--comm_mode",
    default="default",
    choices=["default", "fake_backend", "local_tensor"],
    help="Communication mode to validate tests",
)
113129
parser.add_argument(
114130
"--gpu_arch_type",
115131
default="cuda",
Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import argparse
8+
import os
9+
10+
from tests.integration_tests import OverrideDefinitions
11+
from tests.integration_tests.run_tests import run_tests
12+
13+
14+
def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
    """
    Build the SimpleFSDP compiler-pass integration test flavors.

    Returns:
        A list of OverrideDefinitions, one per (topology, graph pass)
        combination. Each entry holds a single list of command-line
        overrides, a description, a test name and the GPU count it needs.
        All flavors are variations of the same root config file, which is
        selected by the caller.

    Every flavor uses the ``aot_eager`` compile backend together with
    exactly one bucketing graph pass.
    """
    llama3 = ["--model.name simple_fsdp.llama3", "--model.flavor 8B"]
    deepseek_v3 = ["--model.name simple_fsdp.deepseek_v3", "--model.flavor 16B"]
    compile_enable = "--compile.enable"
    tp2 = "--parallelism.tensor_parallel_degree 2"
    cp2 = "--parallelism.context_parallel_degree 2"

    # Overrides shared by every flavor; they follow the topology overrides
    # and precede the graph-pass selection.
    shared_overrides = [
        "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
        "--compile.backend aot_eager",
    ]

    # (value for --compile.graph_passes, label used in description / test name)
    graph_passes = [
        ("auto_bucketing", "autobucketing"),
        ("transformer_block_bucketing", "transformer_block_bucketing"),
    ]

    # (description prefix, test-name prefix, topology overrides, ngpu)
    # NOTE(review): the deepseek_v3 topologies do not pass --compile.enable,
    # unlike the llama3 ones -- confirm the graph passes still run for them.
    topologies = [
        ("1D", "1d", [*llama3, compile_enable], 8),
        ("2D", "2d", [*llama3, tp2, compile_enable], 8),
        # TODO(ruisizhang123): add back after passes + PP is supported
        # (
        #     "3D",
        #     "3d",
        #     [
        #         *llama3,
        #         tp2,
        #         "--parallelism.pipeline_parallel_degree 2",
        #         compile_enable,
        #     ],
        #     8,
        # ),
        ("FSDP+TP+CP", "fsdp+tp+cp", [*llama3, tp2, cp2, compile_enable], 8),
        (
            "FSDP+EP",
            "fsdp+ep",
            [
                *deepseek_v3,
                "--parallelism.data_parallel_shard_degree 4",
                "--parallelism.expert_parallel_degree 2",
            ],
            4,
        ),
        (
            "FSDP+TP+EP",
            "fsdp+tp+ep",
            [
                *deepseek_v3,
                "--parallelism.data_parallel_shard_degree 2",
                tp2,
                "--parallelism.expert_parallel_degree 4",
                "--parallelism.expert_tensor_parallel_degree 1",
            ],
            4,
        ),
    ]

    # Flavors that are currently switched off.
    # TODO(ruisizhang123): add back; this one was disabled together with the
    # PP flavors above (presumably for a related reason -- confirm).
    disabled_test_names = {"fsdp+tp+cp_autobucketing"}

    integration_tests_flavors = []
    for descr_prefix, name_prefix, topology_overrides, ngpu in topologies:
        for graph_pass, label in graph_passes:
            test_name = f"{name_prefix}_{label}"
            if test_name in disabled_test_names:
                continue
            integration_tests_flavors.append(
                OverrideDefinitions(
                    [
                        [
                            *topology_overrides,
                            *shared_overrides,
                            f"--compile.graph_passes {graph_pass}",
                        ],
                    ],
                    f"{descr_prefix}+{label}",
                    test_name,
                    ngpu=ngpu,
                )
            )
    return integration_tests_flavors
222+
223+
224+
# Maps a test-suite name to the zero-argument builder that returns the
# list of test flavors for that suite.
_TEST_SUITES_FUNCTION = dict(simple_fsdp=build_simple_fsdp_test_list)
227+
228+
229+
def main():
    """
    Parse command-line arguments and run the SimpleFSDP compiler-pass tests.

    The output directory is created when missing and must be empty; the
    parsed arguments and the "simple_fsdp" test list are then handed to
    run_tests.

    Raises:
        RuntimeError: if the output directory already contains entries.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "output_dir", help="Directory to dump results generated by tests"
    )
    parser.add_argument(
        "--comm_mode",
        default="default",
        choices=["default", "fake_backend", "local_tensor"],
        help="Communication mode to validate tests",
    )
    parser.add_argument(
        "--config_path",
        default="./tests/integration_tests/base_config.toml",
        help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
    )
    parser.add_argument(
        "--test_name",
        default="all",
        help="test to run, acceptable values: `test_name` in `build_simple_fsdp_test_list` (default: all)",
    )
    parser.add_argument("--ngpu", default=8, type=int)
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)
    if os.listdir(args.output_dir):
        raise RuntimeError("Please provide an empty output directory.")

    test_list = _TEST_SUITES_FUNCTION["simple_fsdp"]()
    run_tests(args, test_list)


if __name__ == "__main__":
    main()

torchtitan/experiments/simple_fsdp/tests/integration_tests.py renamed to torchtitan/experiments/simple_fsdp/tests/frontend_integration_tests.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -29,32 +29,6 @@ def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
2929
"1D",
3030
"1d",
3131
),
32-
OverrideDefinitions(
33-
[
34-
[
35-
"--model.name simple_fsdp.llama3",
36-
"--compile.enable",
37-
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
38-
"--compile.backend aot_eager",
39-
"--compile.graph_passes auto_bucketing",
40-
],
41-
],
42-
"1D+autobucketing",
43-
"1d_autobucketing",
44-
),
45-
OverrideDefinitions(
46-
[
47-
[
48-
"--model.name simple_fsdp.llama3",
49-
"--compile.enable",
50-
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
51-
"--compile.backend aot_eager",
52-
"--compile.graph_passes transformer_block_bucketing",
53-
],
54-
],
55-
"1D+transformer_block_bucketing",
56-
"1d_transformer_block_bucketing",
57-
),
5832
OverrideDefinitions(
5933
[
6034
[

0 commit comments

Comments
 (0)