# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

8- """
9- Example CLI to run TorchTitan Qwen3 model inference with vLLM:
10-
11- # Run inference
12- python torchtitan/experiments/deterministic_vllm_rl/infer.py
13- """
14-
import argparse

from vllm import LLM, SamplingParams

# Import models module - this automatically registers TorchTitan models with vLLM.
# NOTE(jianiw): We could use the vLLM plug-in system instead:
# https://docs.vllm.ai/en/latest/design/plugin_system/
from torchtitan.experiments.deterministic_vllm_rl import models  # noqa: F401
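# (`# noqa: F401` silences the unused-import lint warning, since the module is
# imported only for its registration side effect)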


def parse_args():
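    # NOTE: the body of parse_args() is not shown in this excerpt; the lines
    # below are a minimal reconstruction inferred from how `args` is used in
    # main(). Argument names match that usage, but the defaults and help
    # strings are assumptions rather than the original values.
    parser = argparse.ArgumentParser(
        description="Run TorchTitan Qwen3 model inference with vLLM"
    )
    parser.add_argument("--model", type=str, required=True,
                        help="Path to the model checkpoint directory")
    parser.add_argument("--tensor-parallel-size", type=int, default=1,
                        help="Number of GPUs to use for tensor parallelism")
    parser.add_argument("--prompt", type=str, default="Hello, my name is",
                        help="Prompt to generate from")
    parser.add_argument("--temperature", type=float, default=0.8,
                        help="Sampling temperature")
    parser.add_argument("--max-tokens", type=int, default=128,
                        help="Maximum number of tokens to generate")
    return parser.parse_args()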


def main():
    args = parse_args()

    print("=" * 80)
    print("INITIALIZING vLLM WITH TORCHTITAN QWEN3 MODEL")
    print("=" * 80)
    print(f"Model: {args.model}")
    print(f"Tensor Parallel Size: {args.tensor_parallel_size}")
    print()

    # Build hf_overrides with checkpoint path
    hf_overrides = {
        "checkpoint_dir": args.model,
    }
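    # vLLM forwards hf_overrides into the HuggingFace model config, making
    # `checkpoint_dir` visible to the TorchTitan model code at load time.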

    # Initialize vLLM with custom TorchTitan Qwen3 model.
    # The LLM initialization will internally:
    # 1. Load the TrainSpec for Qwen3 (registered via the models import above)
    # 2. Create a TorchTitanVLLMModel instance
    # 3. Process parallelism settings via process_parallelism_settings()
    # 4. Build the device mesh and apply parallelization via build_device_mesh_and_parallelize()
    # 5. Load model weights and prepare for inference
    print("Initializing vLLM engine...")
    llm = LLM(
        model=args.model,  # Model checkpoint path
        hf_overrides=hf_overrides,
        dtype="bfloat16",
        trust_remote_code=True,
        enforce_eager=True,  # Eager mode only (skips CUDA graph capture)
        enable_prefix_caching=False,  # Disable prefix caching (required for now)
        tensor_parallel_size=args.tensor_parallel_size,  # Multi-GPU support
    )

    print("=" * 80)
    print("vLLM ENGINE INITIALIZED - CONFIGURATION DETAILS")
    print("=" * 80)
    print(f"Prompt: {args.prompt}")
    print()

    # Prepare prompt and sampling parameters
    prompts = [args.prompt]
    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_p=0.95,
        max_tokens=args.max_tokens,
    )
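    # top_p=0.95 restricts sampling to the smallest set of tokens whose
    # cumulative probability exceeds 0.95 (nucleus sampling); a temperature
    # of 0 would make vLLM decode greedily instead.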

    # Generate text
    outputs = llm.generate(
        prompts=prompts,
        sampling_params=sampling_params,
    )
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text

        print(f"Prompt: {prompt}")
        print(f"Generated text: {generated_text!r}")
        print()


if __name__ == "__main__":
    main()