13 changes: 12 additions & 1 deletion examples/ray_orchestrator/llm_inference_async_ray.py
@@ -1,19 +1,30 @@
 # Generate text asynchronously with Ray orchestrator.
+import argparse
 import asyncio
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.llmapi import KvCacheConfig
 
 
 def main():
+    parser = argparse.ArgumentParser(
+        description="Generate text asynchronously with Ray orchestrator.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        help=
+        "HuggingFace model name or path to local HF model (default: TinyLlama/TinyLlama-1.1B-Chat-v1.0)"
+    )
+    args = parser.parse_args()
     # Configure KV cache memory usage fraction.
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
                                     max_tokens=4096,
                                     enable_block_reuse=True)
 
     # model could accept HF model name or a path to local HF model.
     llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        model=args.model,
         kv_cache_config=kv_cache_config,
         max_seq_len=1024,
         max_batch_size=1,
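The hunk above is truncated before the generation logic, so for review context here is a minimal sketch of the async usage pattern the rest of the example presumably follows with the LLM API. The prompt list, helper name, and sampling settings are illustrative assumptions and not part of this diff; only LLM and SamplingParams come from the changed file.

# Illustrative sketch only (not part of this diff): how the constructed `llm`
# is presumably driven asynchronously further down in the example.
import asyncio

from tensorrt_llm import SamplingParams


async def generate_all(llm, prompts):
    # Assumed sampling settings for the sketch.
    sampling_params = SamplingParams(max_tokens=32)

    async def one(prompt):
        # generate_async submits a request and can be awaited for its result.
        output = await llm.generate_async(prompt, sampling_params)
        print(f"{prompt!r} -> {output.outputs[0].text!r}")

    await asyncio.gather(*(one(p) for p in prompts))

With the change above, the same script can be launched either with the default HF model name or pointed at a local checkpoint, e.g. python llm_inference_async_ray.py --model /path/to/TinyLlama-1.1B-Chat-v1.0 (path illustrative).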
3 changes: 2 additions & 1 deletion tests/integration/defs/examples/test_ray.py
@@ -14,7 +14,8 @@ def ray_example_root(llm_root):
 
 def test_llm_inference_async_ray(ray_example_root, llm_venv):
     script_path = os.path.join(ray_example_root, "llm_inference_async_ray.py")
-    venv_check_call(llm_venv, [script_path])
+    model_path = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
+    venv_check_call(llm_venv, [script_path, "--model", model_path])
 
 
 @pytest.mark.skip_less_device(2)
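For reviewers less familiar with the integration-test helpers, here is a rough sketch of what the updated test line amounts to. The subprocess-based body is a simplified assumption for illustration, not the actual implementation of venv_check_call or llm_models_root.

# Simplified sketch (assumption): resolve the shared model cache root, then run
# the example inside the test venv with the new --model flag, failing the test
# on a non-zero exit code.
import os
import subprocess


def run_ray_async_example(example_root: str, models_root: str,
                          venv_python: str) -> None:
    script_path = os.path.join(example_root, "llm_inference_async_ray.py")
    model_path = os.path.join(models_root, "llama-models-v2",
                              "TinyLlama-1.1B-Chat-v1.0")
    subprocess.check_call([venv_python, script_path, "--model", model_path])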