Add joint_graph_runner experiment for deepseek_v3

yiming0416 · yiming0416 · commit 78da6445128a · 2025-10-16T12:41:20.000-07:00
diff --git a/torchtitan/experiments/__init__.py b/torchtitan/experiments/__init__.py
@@ -5,5 +5,11 @@
 # LICENSE file in the root directory of this source tree.
 
 _supported_experiments = frozenset(
-    ["flux", "simple_fsdp.llama3", "simple_fsdp.deepseek_v3", "vlm"]
+    [
+        "flux",
+        "simple_fsdp.llama3",
+        "simple_fsdp.deepseek_v3",
+        "vlm",
+        "joint_graph_runner.deepseek_v3",  # subpackage of torchtitan.experiments
+    ]
 )
diff --git a/torchtitan/experiments/joint_graph_runner/deepseek_v3/__init__.py b/torchtitan/experiments/joint_graph_runner/deepseek_v3/__init__.py
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) Meta Platforms, Inc. All Rights Reserved.
+
+from torchtitan.components.loss import build_cross_entropy_loss
+from torchtitan.components.lr_scheduler import build_lr_schedulers
+from torchtitan.components.optimizer import build_optimizers_with_moe_load_balancing
+from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.datasets.hf_datasets import build_hf_dataloader
+from torchtitan.distributed.pipeline_parallel import pipeline_llm
+
+from torchtitan.experiments.simple_fsdp.deepseek_v3.model import (
+    SimpleFSDPDeepSeekV3Model,
+)
+from torchtitan.models.deepseek_v3 import deepseekv3_args
+from torchtitan.protocols.train_spec import TrainSpec
+
+from .parallelize import parallelize_deepseekv3
+
+
+def get_train_spec() -> TrainSpec:  # TrainSpec for this experiment; takes no args
+    return TrainSpec(
+        model_cls=SimpleFSDPDeepSeekV3Model,  # reused from simple_fsdp.deepseek_v3
+        model_args=deepseekv3_args,  # from torchtitan.models.deepseek_v3
+        parallelize_fn=parallelize_deepseekv3,  # this package's .parallelize
+        pipelining_fn=pipeline_llm,  # torchtitan.distributed.pipeline_parallel
+        build_optimizers_fn=build_optimizers_with_moe_load_balancing,
+        build_lr_schedulers_fn=build_lr_schedulers,  # torchtitan.components
+        build_dataloader_fn=build_hf_dataloader,  # torchtitan.datasets.hf_datasets
+        build_tokenizer_fn=build_hf_tokenizer,  # torchtitan.components.tokenizer
+        build_loss_fn=build_cross_entropy_loss,  # torchtitan.components.loss
+    )
diff --git a/torchtitan/experiments/joint_graph_runner/deepseek_v3/parallelize.py b/torchtitan/experiments/joint_graph_runner/deepseek_v3/parallelize.py