
Commit 88b700b

enable DSV3 manual bucketing
1 parent 605a9a1 commit 88b700b

2 files changed: +29 −2 lines

torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py

Lines changed: 4 additions & 2 deletions
@@ -38,9 +38,11 @@ def convert_modules_to_fqns(modules, module_to_fqn_mapping):
     result = []
     for m in modules:
         if isinstance(m, list):
-            result.append(convert_modules_to_fqns(m, module_to_fqn_mapping))
+            if fqn_list := convert_modules_to_fqns(m, module_to_fqn_mapping):
+                result.append(fqn_list)
         else:
-            result.append(module_to_fqn_mapping.get(m, None))
+            if fqn := module_to_fqn_mapping.get(m):
+                result.append(fqn)
     return result

     module_to_name = {m: n for n, m in model.named_modules()}
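
The fix matters for manual bucketing: a module (or a nested group of modules) that has no entry in module_to_fqn_mapping previously leaked a None (or an empty sublist) into the bucket spec; now such entries are dropped. A minimal standalone sketch of the fixed helper, assuming modules is a possibly nested list of nn.Module objects and module_to_fqn_mapping maps each module to its fully qualified name (the toy model below is illustrative, not from the diff):

import torch.nn as nn

def convert_modules_to_fqns(modules, module_to_fqn_mapping):
    result = []
    for m in modules:
        if isinstance(m, list):
            # Recurse into nested bucket lists; keep the sublist only if
            # at least one of its modules resolved to an FQN.
            if fqn_list := convert_modules_to_fqns(m, module_to_fqn_mapping):
                result.append(fqn_list)
        else:
            # Skip modules with no known FQN instead of appending None.
            if fqn := module_to_fqn_mapping.get(m):
                result.append(fqn)
    return result

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
module_to_name = {m: n for n, m in model.named_modules()}
# The ad-hoc Linear below is absent from the mapping, so it no longer
# produces a None entry in the output.
print(convert_modules_to_fqns([[model[0], nn.Linear(4, 4)], model[1]], module_to_name))
# -> [['0'], '1']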

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 25 additions & 0 deletions
@@ -76,6 +76,31 @@
         attn_mask_type="block_causal",
     ),
     "16B": DeepSeekV3ModelArgs(
+        vocab_size=102400,
+        dim=2048,
+        inter_dim=10944,
+        moe_inter_dim=1408,
+        n_layers=27,
+        n_dense_layers=1,
+        n_heads=16,
+        moe_args=MoEArgs(
+            num_experts=64,
+            num_shared_experts=2,
+            top_k=6,
+            score_func="softmax",
+            route_norm=False,
+            score_before_experts=False,
+        ),
+        q_lora_rank=0,
+        kv_lora_rank=512,
+        qk_nope_head_dim=128,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        mscale=0.70,
+        use_flex_attn=False,
+        attn_mask_type="block_causal",
+    ),
+    "16B_flex_attn": DeepSeekV3ModelArgs(
         vocab_size=102400,
         dim=2048,
         inter_dim=10944,
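
Note the shape of this hunk: the existing "16B" key is kept as context, the 25 added lines give it a new body with use_flex_attn=False, and a new "16B_flex_attn" key is introduced so that the old body (resuming at vocab_size=102400 above) now belongs to the flex-attention variant. A minimal lookup sketch, assuming the enclosing registry is a dict named deepseekv3_configs exported from torchtitan.models.deepseek_v3 (that name is an assumption, not shown in this diff):

from torchtitan.models.deepseek_v3 import deepseekv3_configs  # assumed name

# Select the new eager-attention flavor or the renamed flex-attention one.
eager_args = deepseekv3_configs["16B"]
flex_args = deepseekv3_configs["16B_flex_attn"]
print(eager_args.use_flex_attn)  # False, per the added lines above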
