Commit f13c235

Author: hyunsooha
Commit message: Revert removal of eos_id_args
Parent: 50b10b8

6 files changed: +6 −0 lines changed

torchtitan/experiments/deterministic_vllm_rl/models/qwen3/model_vllm_compat.py

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
+        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
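For context on why the model keeps eos_id: storing the EOS token id on the module makes it available to whatever drives generation. A minimal sketch of that use, assuming a hypothetical greedy sampling loop (step stands in for one forward pass returning next-token logits; none of these names come from this repo):

import torch

def greedy_decode(step, prompt_ids: torch.Tensor, eos_id: int, max_new_tokens: int = 64) -> torch.Tensor:
    """Append tokens one at a time, stopping once `eos_id` is produced."""
    tokens = prompt_ids
    for _ in range(max_new_tokens):
        logits = step(tokens)                # (seq_len, vocab_size) next-token logits
        next_id = int(logits[-1].argmax())   # greedy pick from the last position
        tokens = torch.cat([tokens, torch.tensor([next_id])])
        if next_id == eos_id:                # the id the model now carries as self.eos_id
            break
    return tokens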

torchtitan/experiments/deterministic_vllm_rl/simple_rl.py

Lines changed: 1 addition & 0 deletions
@@ -332,6 +332,7 @@ def load_model(checkpoint_path: str, model_path: str, use_vllm_compat: bool = True
         max_seq_len=getattr(hf_config, "max_position_embeddings", 32768),
         qk_norm=True,
         depth_init=True,
+        eos_id=getattr(hf_config, "eos_token_id", 151645),
     )

     # state_dict is in standard TorchTitan format (w1, w2, w3)
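The getattr call falls back to 151645 when the Hugging Face config does not expose an eos_token_id (151645 is the Qwen-family <|im_end|> id, matching the Qwen3ModelArgs default further down). A minimal illustration of that fallback behavior, using stand-in config objects:

from types import SimpleNamespace

# Stand-in configs (hypothetical) -- the real code reads a Hugging Face config here.
cfg_with_field = SimpleNamespace(eos_token_id=2)
cfg_without_field = SimpleNamespace()

print(getattr(cfg_with_field, "eos_token_id", 151645))     # 2: the config's own value wins
print(getattr(cfg_without_field, "eos_token_id", 151645))  # 151645: fallback when the field is absent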

torchtitan/experiments/transformers_backend/model/args.py

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs):
             "n_kv_heads": "num_key_value_heads",
             "norm_eps": "rms_norm_eps",
             "max_seq_len": "max_position_embeddings",
+            "eos_id": "eos_token_id",
         }
     }

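The new entry aliases the TorchTitan name eos_id to the Hugging Face field eos_token_id. A minimal sketch of how such an alias table can be consumed, in the spirit of transformers.PretrainedConfig's attribute_map (a hypothetical standalone class, not the repo's actual lookup code):

class AliasedConfig:
    # TorchTitan-style name -> Hugging Face config field name
    attribute_map = {"eos_id": "eos_token_id"}

    def __init__(self, eos_token_id: int = 0):
        self.eos_token_id = eos_token_id

    def __getattr__(self, name):
        # Called only when normal attribute lookup fails; resolve via the alias table.
        mapped = type(self).attribute_map.get(name)
        if mapped is not None and mapped != name:
            return getattr(self, mapped)
        raise AttributeError(name)

cfg = AliasedConfig(eos_token_id=151645)
print(cfg.eos_id)  # 151645, resolved through the "eos_id" -> "eos_token_id" alias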
torchtitan/models/llama3/model/args.py

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ class TransformerModelArgs(BaseModelArgs):

     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
+    eos_id: int = 0

     def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         seq_len = job_config.training.seq_len
torchtitan/models/qwen3/model/args.py

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ class Qwen3ModelArgs(BaseModelArgs):

     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
+    eos_id: int = 151645

     enable_weight_tying: bool = False
torchtitan/models/qwen3/model/model.py

Lines changed: 1 addition & 0 deletions
@@ -384,6 +384,7 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
+        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
