13 changes: 12 additions & 1 deletion examples/ray_orchestrator/llm_inference_async_ray.py
@@ -1,19 +1,30 @@
 # Generate text asynchronously with Ray orchestrator.
+import argparse
 import asyncio
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.llmapi import KvCacheConfig
 
 
 def main():
+    parser = argparse.ArgumentParser(
+        description="Generate text asynchronously with Ray orchestrator.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        help=
+        "HuggingFace model name or path to local HF model (default: TinyLlama/TinyLlama-1.1B-Chat-v1.0)"
+    )
+    args = parser.parse_args()
     # Configure KV cache memory usage fraction.
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
                                     max_tokens=4096,
                                     enable_block_reuse=True)
 
     # model could accept HF model name or a path to local HF model.
     llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        model=args.model,
         kv_cache_config=kv_cache_config,
         max_seq_len=1024,
         max_batch_size=1,
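The hunk above is truncated before the generation logic, so for review context here is a minimal sketch of the async usage pattern the rest of the example presumably follows with the LLM API. The prompt list, helper name, and sampling settings are illustrative assumptions and not part of this diff; only LLM and SamplingParams come from the changed file.

# Illustrative sketch only (not part of this diff): how the constructed `llm`
# is presumably driven asynchronously further down in the example.
import asyncio

from tensorrt_llm import SamplingParams


async def generate_all(llm, prompts):
    # Assumed sampling settings for the sketch.
    sampling_params = SamplingParams(max_tokens=32)

    async def one(prompt):
        # generate_async submits a request and can be awaited for its result.
        output = await llm.generate_async(prompt, sampling_params)
        print(f"{prompt!r} -> {output.outputs[0].text!r}")

    await asyncio.gather(*(one(p) for p in prompts))

With the change above, the same script can be launched either with the default HF model name or pointed at a local checkpoint, e.g. python llm_inference_async_ray.py --model /path/to/TinyLlama-1.1B-Chat-v1.0 (path illustrative).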
3 changes: 2 additions & 1 deletion tests/integration/defs/examples/test_ray.py
@@ -14,7 +14,8 @@ def ray_example_root(llm_root):
 
 def test_llm_inference_async_ray(ray_example_root, llm_venv):
     script_path = os.path.join(ray_example_root, "llm_inference_async_ray.py")
-    venv_check_call(llm_venv, [script_path])
+    model_path = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
+    venv_check_call(llm_venv, [script_path, "--model", model_path])
 
 
 @pytest.mark.skip_less_device(2)
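For reviewers less familiar with the integration-test helpers, here is a rough sketch of what the updated test line amounts to. The subprocess-based body is a simplified assumption for illustration, not the actual implementation of venv_check_call or llm_models_root.

# Simplified sketch (assumption): resolve the shared model cache root, then run
# the example inside the test venv with the new --model flag, failing the test
# on a non-zero exit code.
import os
import subprocess


def run_ray_async_example(example_root: str, models_root: str,
                          venv_python: str) -> None:
    script_path = os.path.join(example_root, "llm_inference_async_ray.py")
    model_path = os.path.join(models_root, "llama-models-v2",
                              "TinyLlama-1.1B-Chat-v1.0")
    subprocess.check_call([venv_python, script_path, "--model", model_path])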