6 changes: 5 additions & 1 deletion .github/workflows/integration_test_8gpu_simple_fsdp.yaml
@@ -50,7 +50,11 @@ jobs:
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126

mkdir artifacts-to-be-uploaded
python -m torchtitan.experiments.simple_fsdp.tests.integration_tests artifacts-to-be-uploaded --ngpu 8
# Run front-end integration tests of SimpleFSDP
python -m torchtitan.experiments.simple_fsdp.tests.frontend_integration_tests artifacts-to-be-uploaded --ngpu 8

# Run backend pass integration tests of SimpleFSDP
python -m torchtitan.experiments.simple_fsdp.tests.compiler_pass_integration_tests artifacts-to-be-uploaded --ngpu 8

# Run the numerics unit tests of SimpleFSDP
torchrun --nproc-per-node=8 -m pytest torchtitan/experiments/simple_fsdp/tests/test_numerics.py -v
15 changes: 13 additions & 2 deletions run_train.sh
@@ -16,12 +16,23 @@ NGPU=${NGPU:-"8"}
export LOG_RANK=${LOG_RANK:-0}
CONFIG_FILE=${CONFIG_FILE:-"./torchtitan/models/llama3/train_configs/debug_model.toml"}
TRAIN_FILE=${TRAIN_FILE:-"torchtitan.train"}

set +x
copy_args=("$@")
for ((i=0; i<${#copy_args[@]}; i++)); do
if [[ ${copy_args[i]} == --comm.mode ]]; then
CONFIG_COMM_MODE="${copy_args[i+1]}"
fi
done
set -x

CONFIG_COMM_MODE=${CONFIG_COMM_MODE:-"default"}
# COMM_MODE options: "fake_backend" (dry run), "local_tensor" (debug mode), or empty for normal training
COMM_MODE=${COMM_MODE:-""}
COMM_MODE=${COMM_MODE:-$CONFIG_COMM_MODE}

TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}

if [ -n "$COMM_MODE" ]; then
if [ "$COMM_MODE" != "default" ]; then
# Communication mode specified: validate configuration or run in debug mode
echo "Running with comm_mode=${COMM_MODE}"
NGPU="${NGPU}" LOCAL_RANK=0 python3 -m "${TRAIN_FILE}" --job.config_file "${CONFIG_FILE}" "$@" --comm.mode=${COMM_MODE} --training.steps=1
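For reference, a minimal usage sketch of the reworked comm-mode resolution above, assuming the rest of run_train.sh is unchanged:

# "fake_backend" via the new --comm.mode flag: the arg scan picks it up as the
# default COMM_MODE, so the script takes the single-step dry-run validation path.
./run_train.sh --comm.mode fake_backend

# An explicit COMM_MODE env var still takes precedence over the flag-derived default;
# "local_tensor" is the debug mode.
COMM_MODE=local_tensor ./run_train.sh

# With neither the flag nor the env var set, COMM_MODE resolves to "default" and
# normal training runs.
./run_train.sh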
Contributor:
Not sure if splitting into two tests will incur overhead.
@wwwjn does this incur any overhead to CI?

@@ -0,0 +1,255 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

from tests.integration_tests import OverrideDefinitions
from tests.integration_tests.run_tests import run_tests


def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
"""
key is the config file name and value is a list of OverrideDefinitions
that is used to generate variations of integration tests based on the
same root config file.
"""
integration_tests_flavors = [
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes auto_bucketing",
"--comm.mode fake_backend",
],
],
"1D+autobucketing",
"1d_autobucketing",
ngpu=8,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes transformer_block_bucketing",
"--comm.mode fake_backend",
],
],
"1D+transformer_block_bucketing",
"1d_transformer_block_bucketing",
ngpu=8,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--parallelism.tensor_parallel_degree 2",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes auto_bucketing",
"--comm.mode fake_backend",
],
],
"2D+autobucketing",
"2d_autobucketing",
ngpu=8,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--parallelism.tensor_parallel_degree 2",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes transformer_block_bucketing",
"--comm.mode fake_backend",
],
],
"2D+transformer_block_bucketing",
"2d_transformer_block_bucketing",
ngpu=8,
),
# TODO(ruisizhang123): add back after passes + PP is supported
# OverrideDefinitions(
# [
# [
# "--model.name simple_fsdp.llama3",
# "--parallelism.tensor_parallel_degree 2",
# "--parallelism.pipeline_parallel_degree 2",
# "--compile.enable",
# "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
# "--compile.backend aot_eager",
# "--compile.graph_passes auto_bucketing",
# "--comm.mode fake_backend",
# ],
# ],
# "3D+autobucketing",
# "3d_autobucketing",
# ngpu=8,
# ),
# OverrideDefinitions(
# [
# [
# "--model.name simple_fsdp.llama3",
# "--parallelism.tensor_parallel_degree 2",
# "--parallelism.pipeline_parallel_degree 2",
# "--compile.enable",
# "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
# "--compile.backend aot_eager",
# "--compile.graph_passes transformer_block_bucketing",
# "--comm.mode fake_backend",
# ],
# ],
# "3D+transformer_block_bucketing",
# "3d_transformer_block_bucketing",
# ngpu=8,
# ),
# OverrideDefinitions(
# [
# [
# "--model.name simple_fsdp.llama3",
# "--parallelism.tensor_parallel_degree 2",
# "--parallelism.context_parallel_degree 2",
# "--compile.enable",
# "--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
# "--compile.backend aot_eager",
# "--compile.graph_passes auto_bucketing",
# "--comm.mode fake_backend",
# ],
# ],
# "FSDP+TP+CP+autobucketing",
# "fsdp+tp+cp_autobucketing",
# ngpu=8,
# ),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.context_parallel_degree 2",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes transformer_block_bucketing",
"--comm.mode fake_backend",
],
],
"FSDP+TP+CP+transformer_block_bucketing",
"fsdp+tp+cp_transformer_block_bucketing",
ngpu=8,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.deepseek_v3",
"--parallelism.data_parallel_shard_degree 4",
"--parallelism.expert_parallel_degree 2",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes auto_bucketing",
"--comm.mode fake_backend",
],
],
"FSDP+EP+autobucketing",
"fsdp+ep_autobucketing",
ngpu=4,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.deepseek_v3",
"--parallelism.data_parallel_shard_degree 4",
"--parallelism.expert_parallel_degree 2",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes transformer_block_bucketing",
"--comm.mode fake_backend",
],
],
"FSDP+EP+transformer_block_bucketing",
"fsdp+ep_transformer_block_bucketing",
ngpu=4,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.deepseek_v3",
"--parallelism.data_parallel_shard_degree 2",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.expert_parallel_degree 4",
"--parallelism.expert_tensor_parallel_degree 1",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes auto_bucketing",
"--comm.mode fake_backend",
],
],
"FSDP+TP+EP+autobucketing",
"fsdp+tp+ep_autobucketing",
ngpu=4,
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.deepseek_v3",
"--parallelism.data_parallel_shard_degree 2",
"--parallelism.tensor_parallel_degree 2",
"--parallelism.expert_parallel_degree 4",
"--parallelism.expert_tensor_parallel_degree 1",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes transformer_block_bucketing",
"--comm.mode fake_backend",
],
],
"FSDP+TP+EP+transformer_block_bucketing",
"fsdp+tp+ep_transformer_block_bucketing",
ngpu=4,
),
]
return integration_tests_flavors


_TEST_SUITES_FUNCTION = {
"simple_fsdp": build_simple_fsdp_test_list,
}


def main():
parser = argparse.ArgumentParser()
parser.add_argument("output_dir")
parser.add_argument(
"--config_path",
default="./tests/integration_tests/base_config.toml",
help="Base config path for integration tests. This is the config that will be used as a base for all tests.",
)
parser.add_argument(
"--test_name",
default="all",
help="test to run, acceptable values: `test_name` in `build_test_list` (default: all)",
)
parser.add_argument("--ngpu", default=8, type=int)
args = parser.parse_args()

if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if os.listdir(args.output_dir):
raise RuntimeError("Please provide an empty output directory.")

test_list = _TEST_SUITES_FUNCTION["simple_fsdp"]()
run_tests(args, test_list)


if __name__ == "__main__":
main()
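Assuming this new file is the compiler-pass test driver that the workflow change above invokes as torchtitan.experiments.simple_fsdp.tests.compiler_pass_integration_tests (the filename itself is not shown in this excerpt), a single flavor could be run locally along these lines; the lower-case identifier in each OverrideDefinitions entry appears to be the `test_name` used for filtering:

# Hypothetical local invocation; the module path is assumed from the CI workflow above.
# The positional argument is an output directory that must be empty (or not yet exist);
# --test_name selects one flavor, e.g. 1d_autobucketing, and --ngpu should match its ngpu.
python -m torchtitan.experiments.simple_fsdp.tests.compiler_pass_integration_tests \
    /tmp/simple_fsdp_artifacts --test_name 1d_autobucketing --ngpu 8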
@@ -29,32 +29,6 @@ def build_simple_fsdp_test_list() -> list[OverrideDefinitions]:
"1D",
"1d",
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes auto_bucketing",
],
],
"1D+autobucketing",
"1d_autobucketing",
),
OverrideDefinitions(
[
[
"--model.name simple_fsdp.llama3",
"--compile.enable",
"--job.custom_config_module=torchtitan.experiments.simple_fsdp.job_config",
"--compile.backend aot_eager",
"--compile.graph_passes transformer_block_bucketing",
],
],
"1D+transformer_block_bucketing",
"1d_transformer_block_bucketing",
),
OverrideDefinitions(
[
[