
Commit 43b8dd1

add more enqueue checks
1 parent 6e9802a commit 43b8dd1

File tree

11 files changed: +226 -76 lines changed


examples/llm-api/llm_inference_async.py
Lines changed: 19 additions & 3 deletions

@@ -5,22 +5,32 @@
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm._tmp_utils import (analyze_average_timestamps,
-                                     dump_timestamps_to_json)
+                                     dump_timestamps_to_json,
+                                     print_enqueue_statistics)
+from tensorrt_llm.llmapi import KvCacheConfig
 
 
 def main():
     # model could accept HF model name or a path to local HF model.
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
+                                    max_tokens=4096,
+                                    enable_block_reuse=True)
+
     llm = LLM(
         #model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
         model="/scratch/llm-models/llama-3.2-models/Llama-3.2-3B-Instruct-FP8",
-        tensor_parallel_size=2)
+        # tensor_parallel_size=2
+        max_seq_len=1024,
+        kv_cache_config=kv_cache_config
+        # max_batch_size=1,
+    )
 
     # Sample prompts.
     prompts = [
         "Hello, my name is",
         "The capital of France is",
         "The future of AI is",
-    ] * 100
+    ] * 1000
 
     # Create a sampling params.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
@@ -43,6 +53,12 @@ async def main():
     analyze_average_timestamps(all_timestamps)
     dump_timestamps_to_json(all_timestamps, "timestamps_output.json")
 
+    print(
+        f"executor type = {type(llm._executor)}, has enqueue_timings = {hasattr(llm._executor, 'enqueue_timings')}"
+    )
+    if hasattr(llm._executor, 'enqueue_timings'):
+        print_enqueue_statistics(llm._executor.enqueue_timings)
+
     # Got output like follows:
     # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
     # Prompt: 'The capital of France is', Generated text: 'Paris.'
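The hasattr guard above exists because not every executor implementation collects enqueue_timings, so the script only prints the statistics when the attribute is present. A minimal, self-contained sketch of that collect-then-report pattern, using a hypothetical ToyExecutor rather than any real TensorRT-LLM class:

import time


class ToyExecutor:
    """Hypothetical executor that records per-request enqueue latency in ms."""

    def __init__(self):
        self.enqueue_timings = []

    def enqueue_request(self, request):
        start = time.perf_counter()
        # ... real work would hand the request to a worker queue here ...
        self.enqueue_timings.append((time.perf_counter() - start) * 1000)


executor = ToyExecutor()
for i in range(5):
    executor.enqueue_request({"prompt": f"request {i}"})

# The hasattr guard mirrors the example: other executor types may not expose
# enqueue_timings, so the reporting degrades gracefully instead of raising.
if hasattr(executor, 'enqueue_timings'):
    print([f"{t:.3f}" for t in executor.enqueue_timings])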

examples/ray_orchestrator/llm_inference_async_ray.py
Lines changed: 14 additions & 4 deletions

@@ -6,13 +6,14 @@
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm._tmp_utils import (analyze_average_timestamps,
                                      dump_timestamps_to_json,
+                                     print_enqueue_statistics,
                                      print_fetch_statistics)
 from tensorrt_llm.llmapi import KvCacheConfig
 
 
 def main():
     # Configure KV cache memory usage fraction.
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8,
                                     max_tokens=4096,
                                     enable_block_reuse=True)
 
@@ -22,18 +23,20 @@ def main():
         # model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
         kv_cache_config=kv_cache_config,
         max_seq_len=1024,
-        max_batch_size=1,
+        # max_batch_size=1,
         orchestrator_type="ray",  # Enable Ray orchestrator
         # Enable 2-way tensor parallelism
-        tensor_parallel_size=2
+        # tensor_parallel_size=2
     )
 
     # Sample prompts.
     prompts = [
         "Hello, my name is",
         "The capital of France is",
         "The future of AI is",
-    ]
+    ] * 1000
+
+    #* 100
 
     # Create a sampling params.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
@@ -48,6 +51,10 @@ async def task(prompt: str):
         if output.outputs[0].timestamps:
             all_timestamps.append(output.outputs[0].timestamps)
 
+        # print(
+        #     f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
+        # )
+
     async def main():
         tasks = [task(prompt) for prompt in prompts]
         await asyncio.gather(*tasks)
@@ -57,6 +64,9 @@ async def main():
     analyze_average_timestamps(all_timestamps)
     dump_timestamps_to_json(all_timestamps, "timestamps_output.json")
 
+    if hasattr(llm._executor, 'enqueue_timings'):
+        print_enqueue_statistics(llm._executor.enqueue_timings)
+
     if hasattr(llm._executor, 'workers'):
         for i, worker in enumerate(llm._executor.workers):
            try:
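Both examples fan the prompts out as one coroutine per prompt and wait on them with asyncio.gather, which is what makes the enqueue path worth timing once the prompt list is multiplied by 1000. A stripped-down, runnable sketch of that fan-out pattern; the asyncio.sleep stands in for llm.generate_async and the timing fields are illustrative only:

import asyncio
import random
import time

prompts = ["Hello, my name is", "The capital of France is",
           "The future of AI is"] * 4
all_timestamps = []


async def task(prompt: str):
    # Stand-in for: output = await llm.generate_async(prompt, sampling_params)
    start = time.time()
    await asyncio.sleep(random.uniform(0.01, 0.05))
    all_timestamps.append({"prompt": prompt, "e2e_s": time.time() - start})


async def main():
    # One task per prompt, all submitted concurrently and gathered at once.
    tasks = [task(prompt) for prompt in prompts]
    await asyncio.gather(*tasks)


asyncio.run(main())
print(f"collected {len(all_timestamps)} timestamp records")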

tensorrt_llm/_tmp_utils.py
Lines changed: 61 additions & 9 deletions

@@ -37,23 +37,22 @@ def calculate_latencies(timestamps):
     latencies['post_processing_time'] = timestamps['post_processing_time']
 
     latencies['execution_time'] = (timestamps['response_created'] -
-                                  timestamps['request_fetched']) * 1000
+                                   timestamps['request_fetched']) * 1000
 
     latencies['response_handling'] = (timestamps['response_enqueued'] -
-                                     timestamps['response_created']) * 1000
+                                      timestamps['response_created']) * 1000
 
     latencies['enqueue_response_to_handle'] = (
-        timestamps['handle_response'] -
-        timestamps['response_enqueued']) * 1000
+        timestamps['handle_response'] - timestamps['response_enqueued']) * 1000
 
     latencies['total_e2e'] = (timestamps['handle_response'] -
                               timestamps['executor_submit_request']) * 1000
 
     latencies['communication_overhead'] = (
         (timestamps['worker_enqueue_request'] -
-        timestamps['executor_submit_request']) +
+         timestamps['executor_submit_request']) +
         (timestamps['handle_response'] -
-        timestamps['response_enqueued'])) * 1000
+         timestamps['response_enqueued'])) * 1000
 
     return latencies
 
@@ -79,7 +78,20 @@ def analyze_average_timestamps(all_timestamps):
         return
 
     # Calculate averages
-    print(f"\n=== [{mode}] Latency Breakdown (milliseconds) - Average over {len(all_timestamps)} request ===")
+    print(
+        f"\n=== [{mode}] Latency Breakdown (milliseconds) - Average over {len(all_timestamps)} request ==="
+    )
+
+    # Print first 20 submit_request_to_enqueue values
+    submit_to_enqueue_values = [
+        lat['submit_request_to_enqueue'] for lat in all_latencies
+        if 'submit_request_to_enqueue' in lat
+    ]
+    if submit_to_enqueue_values:
+        first_20 = ', '.join(
+            [f"{x:.2f}" for x in submit_to_enqueue_values[:20]])
+        print(f"  Submit to enqueue (first 20, ms): {first_20}", flush=True)
+        print(flush=True)
 
     metrics = [
         ('submit_request_to_enqueue', 'Submit to enqueue'),
@@ -108,9 +120,11 @@ def analyze_average_timestamps(all_timestamps):
         min_val = min(values)
         max_val = max(values)
         variance = sum((x - avg)**2 for x in values) / len(values)
-
+
         if metric_key == 'num_iterations':
-            print(f"  {metric_name:48s}: {avg:8.1f} (min: {min_val:8.1f}, max: {max_val:9.1f})")
+            print(
+                f"  {metric_name:48s}: {avg:8.1f} (min: {min_val:8.1f}, max: {max_val:9.1f})"
+            )
         else:
             print(
                 f"  {metric_name:48s}: {avg:8.3f} ms (min: {min_val:8.3f}, max: {max_val:9.3f}, var: {variance:10.3f})"
@@ -156,4 +170,42 @@ def print_fetch_statistics(num_fetched_requests, fetch_call_count, rank=None):
         percentage = (count / len(num_fetched_requests)) * 100
         print(f"  {size:3d} requests: {count:5d} times ({percentage:5.1f}%)")
 
+    print(f"\n  Num fetched requests (all iterations): {num_fetched_requests}")
+
+    print("=" * 70)
+
+
+def print_enqueue_statistics(enqueue_timings):
+    if not is_timestamp_debug_enabled():
+        return
+
+    if not enqueue_timings:
+        return
+
+    mode = "[Ray]" if mpi_disabled() else "[MPI]"
+    num_requests = len(enqueue_timings)
+
+    print(
+        f"\n=== {mode} Enqueue Request Timing Statistics ({num_requests} requests) ==="
+    )
+    first_20_enqueue = ', '.join([f"{x:.2f}" for x in enqueue_timings[:20]])
+    print(f"  Direct enqueue (first 20, ms): {first_20_enqueue}", flush=True)
+
+    avg = sum(enqueue_timings) / num_requests
+    min_val = min(enqueue_timings)
+    max_val = max(enqueue_timings)
+
+    # Calculate percentiles
+    sorted_timings = sorted(enqueue_timings)
+    p10 = sorted_timings[int(num_requests *
+                             0.1)] if num_requests > 1 else sorted_timings[0]
+    p50 = sorted_timings[num_requests // 2]
+    p90 = sorted_timings[int(num_requests * 0.9)]
+
+    print(f"  Avg: {avg:.2f} ms")
+    print(f"  Min: {min_val:.2f} ms")
+    print(f"  Max: {max_val:.2f} ms")
+    print(f"  P10: {p10:.2f} ms")
+    print(f"  P50: {p50:.2f} ms")
+    print(f"  P90: {p90:.2f} ms")
     print("=" * 70)

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
Lines changed: 5 additions & 2 deletions

@@ -235,9 +235,12 @@ def _enqueue_impl(
                     child_req_ids=child_req_ids,
                     query=query))
 
-            if hasattr(request, 'py_timestamps') and request.py_timestamps is not None:
+            if hasattr(
+                    request,
+                    'py_timestamps') and request.py_timestamps is not None:
                 if 'request_queued' not in request.py_timestamps:
-                    request.py_timestamps['request_queued'] = request_queued_time
+                    request.py_timestamps[
+                        'request_queued'] = request_queued_time
 
             req_ids.append(req_id)
         return req_ids
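The guarded write in _enqueue_impl records 'request_queued' only when the key is absent, so a request that passes through the queue more than once keeps its first timestamp. A self-contained illustration of that write-once rule, using a hypothetical FakeRequest and dict.setdefault as shorthand for the explicit membership check:

import time


class FakeRequest:
    """Hypothetical stand-in for an executor request carrying py_timestamps."""

    def __init__(self):
        self.py_timestamps = {}


def record_queued(request, request_queued_time):
    # Same effect as the explicit 'request_queued' not-in check: write only once.
    if getattr(request, 'py_timestamps', None) is not None:
        request.py_timestamps.setdefault('request_queued', request_queued_time)


req = FakeRequest()
record_queued(req, time.time())
first = req.py_timestamps['request_queued']
record_queued(req, first + 10.0)  # a later call must not overwrite the first value
assert req.py_timestamps['request_queued'] == first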

tensorrt_llm/_torch/pyexecutor/llm_request.py
Lines changed: 11 additions & 10 deletions

@@ -593,7 +593,8 @@ def create_response(self,
         result, is_final = super().create_serialized_result(
             use_fast_logits, mpi_world_rank)
 
-        response_timestamps = self.py_timestamps.copy() if self.py_timestamps is not None else None
+        response_timestamps = self.py_timestamps.copy(
+        ) if self.py_timestamps is not None else None
         if response_timestamps is not None:
             response_timestamps['response_created'] = time.time()
 
@@ -775,15 +776,15 @@ def executor_request_to_llm_request(
         arrival_time=getattr(executor_request, "py_arrival_time", None),
         py_multimodal_data=getattr(executor_request, "py_multimodal_data",
                                    None),
-        py_timestamps=getattr(executor_request, "py_timestamps",
-                              {
-                                  'scheduling_wait_time': 0.0,
-                                  'pre_forward_overhead': 0.0,
-                                  'forward_step_time': 0.0,
-                                  'post_processing_time': 0.0,
-                                  'num_iterations': 0,
-                                  'last_iteration_end': None,
-                              } if is_timestamp_debug_enabled() else None))
+        py_timestamps=getattr(
+            executor_request, "py_timestamps", {
+                'scheduling_wait_time': 0.0,
+                'pre_forward_overhead': 0.0,
+                'forward_step_time': 0.0,
+                'post_processing_time': 0.0,
+                'num_iterations': 0,
+                'last_iteration_end': None,
+            } if is_timestamp_debug_enabled() else None))
     if child_req_ids:
         for child_id in child_req_ids:
             llm_request.create_child_request(child_id)
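The reformatted line in create_response still copies py_timestamps before stamping 'response_created', presumably so the response carries a snapshot rather than a live reference to the request's dict. A tiny illustration of the aliasing that a plain assignment would allow, using plain dicts only and no TensorRT-LLM types:

import time

# Live, request-side timestamp dict that keeps being updated during execution.
request_timestamps = {'scheduling_wait_time': 0.0, 'num_iterations': 3}

# Copy first, then stamp the response-only field on the copy.
response_timestamps = request_timestamps.copy()
response_timestamps['response_created'] = time.time()

# Later request-side updates do not leak into the already-built response,
# and the response-only key never appears on the request object.
request_timestamps['num_iterations'] = 4
assert response_timestamps['num_iterations'] == 3
assert 'response_created' not in request_timestamps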
