Commit 50b10b8

Remove unused eos_id from model args
1 parent faa2122
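
The dropped field was dead weight on the model side: nothing on the forward path reads an EOS id, since end-of-sequence handling belongs to the sampling loop, which can take the id straight from the tokenizer or HF config. A minimal sketch of that pattern, assuming a standard autoregressive model mapping (batch, seq) token ids to logits; the sample_until_eos helper is hypothetical, not part of torchtitan:

import torch

def sample_until_eos(model, tokens: torch.Tensor, eos_id: int, max_new_tokens: int = 256) -> torch.Tensor:
    # Hypothetical greedy decode loop: the EOS id comes from the caller
    # (e.g. tokenizer.eos_token_id), not from the model's args.
    for _ in range(max_new_tokens):
        logits = model(tokens)  # (batch, seq, vocab)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        tokens = torch.cat([tokens, next_token], dim=-1)
        if (next_token == eos_id).all():  # every sequence has emitted EOS
            break
    return tokens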

File tree: 6 files changed, +0 −6 lines

torchtitan/experiments/deterministic_vllm_rl/models/qwen3/model_vllm_compat.py

Lines changed: 0 additions & 1 deletion

@@ -288,7 +288,6 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)

torchtitan/experiments/deterministic_vllm_rl/simple_rl.py

Lines changed: 0 additions & 1 deletion

@@ -332,7 +332,6 @@ def load_model(checkpoint_path: str, model_path: str, use_vllm_compat: bool = True):
         max_seq_len=getattr(hf_config, "max_position_embeddings", 32768),
         qk_norm=True,
         depth_init=True,
-        eos_id=getattr(hf_config, "eos_token_id", 151645),
     )

     # state_dict is in standard TorchTitan format (w1, w2, w3)
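
Call sites that still need the id at sampling time can recover it from the already-loaded Hugging Face config rather than threading it through the model args; a hedged one-liner, assuming hf_config is the transformers config object constructed inside load_model:

# Assumption: hf_config is the transformers config loaded in load_model;
# the 151645 fallback mirrors the default the removed line used.
eos_id = getattr(hf_config, "eos_token_id", 151645)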

torchtitan/experiments/transformers_backend/model/args.py

Lines changed: 0 additions & 1 deletion

@@ -54,7 +54,6 @@ class HFTransformerModelArgs(PretrainedConfig, BaseModelArgs):
             "n_kv_heads": "num_key_value_heads",
             "norm_eps": "rms_norm_eps",
             "max_seq_len": "max_position_embeddings",
-            "eos_id": "eos_token_id",
         }
     }


torchtitan/models/llama3/model/args.py

Lines changed: 0 additions & 1 deletion

@@ -45,7 +45,6 @@ class TransformerModelArgs(BaseModelArgs):

     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 0

     def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         seq_len = job_config.training.seq_len

torchtitan/models/qwen3/model/args.py

Lines changed: 0 additions & 1 deletion

@@ -38,7 +38,6 @@ class Qwen3ModelArgs(BaseModelArgs):

     use_flex_attn: bool = False
     attn_mask_type: str = "causal"
-    eos_id: int = 151645

     enable_weight_tying: bool = False

torchtitan/models/qwen3/model/model.py

Lines changed: 0 additions & 1 deletion

@@ -384,7 +384,6 @@ def __init__(self, model_args: Qwen3ModelArgs):
         self.model_args = model_args
         self.vocab_size = model_args.vocab_size
         self.n_layers = model_args.n_layers
-        self.eos_id = model_args.eos_id
         self.head_dim = model_args.head_dim

         self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
