
Commit 42c78db

req/response latency breakdown

Signed-off-by: Erin Ho <[email protected]>

update

Signed-off-by: Erin Ho <[email protected]>

change to sum over iterations

1 parent 7b6803b commit 42c78db

File tree: 13 files changed, +510 / -28 lines

examples/llm-api/llm_inference_async.py

Lines changed: 15 additions & 5 deletions
@@ -4,35 +4,45 @@
 import asyncio
 
 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm._tmp_utils import (analyze_average_timestamps,
+                                     dump_timestamps_to_json)
 
 
 def main():
     # model could accept HF model name or a path to local HF model.
-    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    llm = LLM(
+        #model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        model="/scratch/llm-models/llama-3.2-models/Llama-3.2-3B-Instruct-FP8",
+        tensor_parallel_size=2)
 
     # Sample prompts.
     prompts = [
         "Hello, my name is",
         "The capital of France is",
         "The future of AI is",
-    ]
+    ] * 100
 
     # Create a sampling params.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
+    all_timestamps = []
+
     # Async based on Python coroutines
     async def task(prompt: str):
        output = await llm.generate_async(prompt, sampling_params)
-        print(
-            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-        )
+
+        if output.outputs[0].timestamps:
+            all_timestamps.append(output.outputs[0].timestamps)
 
     async def main():
         tasks = [task(prompt) for prompt in prompts]
         await asyncio.gather(*tasks)
 
     asyncio.run(main())
 
+    analyze_average_timestamps(all_timestamps)
+    dump_timestamps_to_json(all_timestamps, "timestamps_output.json")
+
     # Got output like follows:
     # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
     # Prompt: 'The capital of France is', Generated text: 'Paris.'
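
Note: the new _tmp_utils helpers are gated on the TIMESTAMP_DEBUG environment variable (see tensorrt_llm/_tmp_utils.py below), so this example only prints a breakdown when that flag is set. A minimal sketch of the same collection path using the synchronous API; the model name is a placeholder and it assumes outputs[0].timestamps is populated by this commit's instrumentation:

    # Sketch only: set the debug flag before constructing the LLM so the
    # analysis helpers are active; the model below is a placeholder.
    import os
    os.environ["TIMESTAMP_DEBUG"] = "1"

    from tensorrt_llm import LLM, SamplingParams
    from tensorrt_llm._tmp_utils import (analyze_average_timestamps,
                                         dump_timestamps_to_json)

    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    all_timestamps = []
    for output in llm.generate(["Hello, my name is"] * 8, sampling_params):
        # timestamps is assumed to be filled in by this commit's changes
        if output.outputs[0].timestamps:
            all_timestamps.append(output.outputs[0].timestamps)

    analyze_average_timestamps(all_timestamps)   # prints the averaged breakdown
    dump_timestamps_to_json(all_timestamps, "timestamps_output.json")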

examples/ray_orchestrator/llm_inference_async_ray.py

Lines changed: 29 additions & 5 deletions
@@ -1,7 +1,12 @@
 # Generate text asynchronously with Ray orchestrator.
 import asyncio
 
+import ray
+
 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm._tmp_utils import (analyze_average_timestamps,
+                                     dump_timestamps_to_json,
+                                     print_fetch_statistics)
 from tensorrt_llm.llmapi import KvCacheConfig
 
 
@@ -13,13 +18,14 @@ def main():
 
     # model could accept HF model name or a path to local HF model.
     llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+        model="/scratch/llm-models/llama-3.2-models/Llama-3.2-3B-Instruct-FP8",
+        # model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
         kv_cache_config=kv_cache_config,
         max_seq_len=1024,
         max_batch_size=1,
         orchestrator_type="ray",  # Enable Ray orchestrator
         # Enable 2-way tensor parallelism
-        # tensor_parallel_size=2
+        tensor_parallel_size=2
     )
 
     # Sample prompts.
@@ -32,19 +38,37 @@ def main():
     # Create a sampling params.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
+    # Collect all timestamps
+    all_timestamps = []
+
     # Async based on Python coroutines
     async def task(prompt: str):
         output = await llm.generate_async(prompt, sampling_params)
-        print(
-            f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}"
-        )
+
+        if output.outputs[0].timestamps:
+            all_timestamps.append(output.outputs[0].timestamps)
 
     async def main():
         tasks = [task(prompt) for prompt in prompts]
         await asyncio.gather(*tasks)
 
     asyncio.run(main())
 
+    analyze_average_timestamps(all_timestamps)
+    dump_timestamps_to_json(all_timestamps, "timestamps_output.json")
+
+    if hasattr(llm._executor, 'workers'):
+        for i, worker in enumerate(llm._executor.workers):
+            try:
+                stats = worker.call_worker_method.remote('get_fetch_statistics')
+                result = ray.get(stats)
+                if result:
+                    print_fetch_statistics(result['num_fetched_requests'],
+                                           result['fetch_call_count'],
+                                           rank=result['rank'])
+            except Exception as e:
+                print(f"Could not get fetch statistics from worker {i}: {e}")
+
     # Got output like follows:
     # Prompt: 'Hello, my name is', Generated text: '\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming'
     # Prompt: 'The capital of France is', Generated text: 'Paris.'
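
The Ray-only block above pulls per-worker fetch counters through call_worker_method('get_fetch_statistics') and forwards them to print_fetch_statistics. A hypothetical illustration of the payload shape that call is expected to return; the numbers are invented, not taken from a real run:

    from tensorrt_llm._tmp_utils import print_fetch_statistics

    # Invented example payload mirroring the keys read above: one list entry
    # per fetch call, recording how many requests that call returned.
    result = {
        'rank': 0,
        'fetch_call_count': 4,
        'num_fetched_requests': [1, 1, 2, 4],
    }

    # With TIMESTAMP_DEBUG=1 this prints the total number of fetch calls and a
    # fetch-size histogram, e.g. "1 requests: 2 times ( 50.0%)".
    print_fetch_statistics(result['num_fetched_requests'],
                           result['fetch_call_count'],
                           rank=result['rank'])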

tensorrt_llm/_tmp_utils.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
+"""
+Temporary utilities for timestamp analysis and Ray vs MPI latency comparison.
+"""
+import json
+import os
+from collections import Counter
+
+from tensorrt_llm._utils import mpi_disabled
+
+
+def is_timestamp_debug_enabled():
+    return os.environ.get('TIMESTAMP_DEBUG', '0') == '1'
+
+
+def calculate_latencies(timestamps):
+    """
+    Calculate latency metrics from a single set of timestamps.
+    Returns a dict of latencies in milliseconds, or None if timestamps missing.
+    """
+    if not timestamps:
+        return None
+
+    latencies = {}
+
+    latencies['submit_request_to_enqueue'] = (
+        timestamps['worker_enqueue_request'] -
+        timestamps['executor_submit_request']) * 1000
+
+    # only for the first fetch
+    latencies['queue_wait_time'] = (timestamps['request_fetched'] -
+                                    timestamps['request_queued']) * 1000
+
+    latencies['num_iterations'] = timestamps['num_iterations']
+    latencies['scheduling_wait_time'] = timestamps['scheduling_wait_time']
+    latencies['pre_forward_overhead'] = timestamps['pre_forward_overhead']
+    latencies['forward_step_time'] = timestamps['forward_step_time']
+    latencies['post_processing_time'] = timestamps['post_processing_time']
+
+    latencies['execution_time'] = (timestamps['response_created'] -
+                                   timestamps['request_fetched']) * 1000
+
+    latencies['response_handling'] = (timestamps['response_enqueued'] -
+                                      timestamps['response_created']) * 1000
+
+    latencies['enqueue_response_to_handle'] = (
+        timestamps['handle_response'] -
+        timestamps['response_enqueued']) * 1000
+
+    latencies['total_e2e'] = (timestamps['handle_response'] -
+                              timestamps['executor_submit_request']) * 1000
+
+    latencies['communication_overhead'] = (
+        (timestamps['worker_enqueue_request'] -
+         timestamps['executor_submit_request']) +
+        (timestamps['handle_response'] -
+         timestamps['response_enqueued'])) * 1000
+
+    return latencies
+
+
+def analyze_average_timestamps(all_timestamps):
+    if not is_timestamp_debug_enabled():
+        return
+
+    if not all_timestamps:
+        print("No timestamps available")
+        return
+
+    mode = "[Ray]" if mpi_disabled() else "[MPI]"
+    # Calculate latencies for each request
+    all_latencies = []
+    for ts in all_timestamps:
+        latencies = calculate_latencies(ts)
+        if latencies:
+            all_latencies.append(latencies)
+
+    if not all_latencies:
+        print("No valid latencies calculated")
+        return
+
+    # Calculate averages
+    print(f"\n=== {mode} Latency Breakdown (milliseconds) - Average over {len(all_timestamps)} requests ===")
+
+    metrics = [
+        ('submit_request_to_enqueue', 'Submit to enqueue'),
+        ('queue_wait_time', 'Request Queue wait (1st fetch)'),
+        ('execution_time', 'Time in executor loop (sum of all iterations)'),
+        ('scheduling_wait_time', ' ├─ Scheduling wait'),
+        ('pre_forward_overhead', ' ├─ Pre-forward overhead'),
+        ('forward_step_time', ' ├─ Forward step'),
+        ('post_processing_time', ' └─ Post-processing'),
+        ('response_handling', 'Response handling (once)'),
+        ('enqueue_response_to_handle', 'Enqueue to handle (once)'),
+        # ('num_iterations', 'Avg iterations per request'),
+        # ('total_e2e', 'Total E2E latency'),
+        # ('communication_overhead', 'Total communication overhead'),
+    ]
+
+    for metric_key, metric_name in metrics:
+        if metric_key == 'num_iterations':
+            print("")
+        if metric_key == 'total_e2e':
+            print(" " + "-" * 68)
+
+        values = [lat[metric_key] for lat in all_latencies if metric_key in lat]
+        if values:
+            avg = sum(values) / len(values)
+            min_val = min(values)
+            max_val = max(values)
+            variance = sum((x - avg)**2 for x in values) / len(values)
+
+            if metric_key == 'num_iterations':
+                print(f" {metric_name:48s}: {avg:8.1f} (min: {min_val:8.1f}, max: {max_val:9.1f})")
+            else:
+                print(
+                    f" {metric_name:48s}: {avg:8.3f} ms (min: {min_val:8.3f}, max: {max_val:9.3f}, var: {variance:10.3f})"
+                )
+
+    print("=" * 70)
+
+
+def dump_timestamps_to_json(all_timestamps,
+                            output_file="timestamps_output.json"):
+    if not is_timestamp_debug_enabled():
+        return
+
+    if not all_timestamps:
+        print("No timestamps to dump")
+        return
+
+    print(
+        f"\nDumping {len(all_timestamps)} timestamp records to {output_file}..."
+    )
+    with open(output_file, 'w') as f:
+        json.dump(all_timestamps, f, indent=2)
+    print(f"Timestamps saved to {output_file}")
+
+
+def print_fetch_statistics(num_fetched_requests, fetch_call_count, rank=None):
+    if not is_timestamp_debug_enabled():
+        return
+
+    if not num_fetched_requests:
+        return
+
+    rank_str = f"[Rank {rank}]" if rank is not None else ""
+    mode = "[Ray]" if mpi_disabled() else "[MPI]"
+
+    print(f"\n=== {mode}{rank_str} Fetch Request Statistics ===")
+    print(f" Total fetch calls: {fetch_call_count}")
+
+    size_distribution = Counter(num_fetched_requests)
+    print("\n Fetch Size Distribution:")
+    for size in sorted(size_distribution.keys()):
+        count = size_distribution[size]
+        percentage = (count / len(num_fetched_requests)) * 100
+        print(f" {size:3d} requests: {count:5d} times ({percentage:5.1f}%)")
+
+    print("=" * 70)

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 53 additions & 8 deletions
@@ -71,6 +71,10 @@ def __init__(self, dist: Distributed, enable_attention_dp: bool,
 
         self._disable_mpi = mpi_disabled()
 
+        # DIAGNOSTIC: Track iteration count and timing per rank
+        # self.iteration_count = 0
+        # self.last_iteration_time = None
+
     def _get_from_request_queue(
             self,
             timeout: Optional[datetime.timedelta]) -> List[RequestQueueItem]:
@@ -218,6 +222,7 @@ def _enqueue_impl(
         with self.enqueue_lock:
             assert self.active, "PyExecutor has already been shutdown."
             start_time = time.time()
+            request_queued_time = time.time()
             for request, query in requests_and_queries:
                 req_id = self._get_request_id()
                 if self.enable_iter_perf_stats:
@@ -229,6 +234,11 @@ def _enqueue_impl(
                         request,
                         child_req_ids=child_req_ids,
                         query=query))
+
+                if hasattr(request, 'py_timestamps') and request.py_timestamps is not None:
+                    if 'request_queued' not in request.py_timestamps:
+                        request.py_timestamps['request_queued'] = request_queued_time
+
                 req_ids.append(req_id)
             return req_ids
 
@@ -268,24 +278,49 @@ def _fetch_and_process_requests(
         all_ranks_num_active_requests: Optional[List[int]] = None
     ) -> List[RequestQueueItem]:
         """Common logic for fetching and processing requests from the queue."""
+        # # DIAGNOSTIC: Track iteration timing
+        # import time as time_module
+        # fetch_start = time_module.time()
+        # self.iteration_count += 1
+
+        # # Track time between iterations
+        # if self.last_iteration_time is not None:
+        #     iteration_gap_ms = (fetch_start - self.last_iteration_time) * 1000
+        # else:
+        #     iteration_gap_ms = 0
+        # self.last_iteration_time = fetch_start
+
         # Calculate timeout
-        idle = (total_num_active_requests == 0) and len(self.waiting_queue) == 0
-        if idle:
-            # In Ray path (TLLM_DISABLE_MPI=1), use a periodic heartbeat timeout so rank 0
-            # reaches the broadcast path regularly to prevent trtllm-serve timeout when idle.
-            timeout = datetime.timedelta(
-                seconds=1200) if self._disable_mpi else None
-        else:
-            timeout = datetime.timedelta(0)
+
+        # Tentatively revert this to rule this out.
+        timeout = None if (total_num_active_requests == 0) and len(
+            self.waiting_queue) == 0 else datetime.timedelta(0)
+        # idle = (total_num_active_requests == 0) and len(self.waiting_queue) == 0
+        # if idle:
+        #     # In Ray path (TLLM_DISABLE_MPI=1), use a periodic heartbeat timeout so rank 0
+        #     # reaches the broadcast path regularly to prevent trtllm-serve timeout when idle.
+        #     timeout = datetime.timedelta(
+        #         seconds=1200) if self._disable_mpi else None
+        # else:
+        #     timeout = datetime.timedelta(0)
 
         # Fetch requests from rank 0
         new_requests = []
         if self.dist.rank == 0:
             new_requests = self._get_from_request_queue(timeout)
 
         # Broadcast requests and handle Python objects
+        # DIAGNOSTIC: Measure broadcast time
+        # import time as time_module
+        # broadcast_start = time_module.time()
         new_requests, py_request_objects = self._handle_request_broadcasting(
             new_requests)
+        # broadcast_end = time_module.time()
+        # broadcast_duration_ms = (broadcast_end - broadcast_start) * 1000
+        # if broadcast_duration_ms > 100:  # Log if > 100ms from BOTH ranks
+        #     print(
+        #         f"[BROADCAST_DELAY][Rank {self.dist.rank}][Iter {self.iteration_count}] Broadcast took {broadcast_duration_ms:.2f} ms, num_requests={len(new_requests)}",
+        #         flush=True)
 
         # Validate and filter requests
         new_requests = self._validate_and_filter_requests(new_requests)
@@ -307,6 +342,16 @@ def _fetch_and_process_requests(
         if self.enable_iter_perf_stats and self.dist.rank == 0:
             self._update_new_active_requests_queue_latency(new_requests)
 
+        # DIAGNOSTIC: Log total fetch time
+        # fetch_end = time_module.time()
+        # fetch_total_ms = (fetch_end - fetch_start) * 1000
+        # if fetch_total_ms > 100 or self.iteration_count % 10 == 0:  # Log if > 100ms or every 10 iterations from BOTH ranks
+        #     print(
+        #         f"[FETCH_TIMING][Rank {self.dist.rank}][Iter {self.iteration_count}] "
+        #         f"gap_since_last_iter={iteration_gap_ms:.2f}ms, fetch_took={fetch_total_ms:.2f}ms, "
+        #         f"active_reqs={total_num_active_requests}, fetched={len(new_requests)}, queue_size={self.request_queue.qsize()}",
+        #         flush=True)
+
         return new_requests
 
     @nvtx_range("_fetch_new_requests")
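
The 'request_queued' stamp recorded in _enqueue_impl is the enqueue-side half of the "Request Queue wait (1st fetch)" metric; the matching 'request_fetched' stamp is recorded elsewhere in this commit and is not shown above. A standalone sketch of how the pair combines, using an invented py_timestamps dict:

    import time

    # Invented stand-in for request.py_timestamps, only to show how the two
    # stamps become queue_wait_time in calculate_latencies().
    py_timestamps = {}
    py_timestamps['request_queued'] = time.time()   # stamped in _enqueue_impl
    time.sleep(0.01)                                # request waits in the queue
    py_timestamps['request_fetched'] = time.time()  # stamped at first fetch

    queue_wait_ms = (py_timestamps['request_fetched'] -
                     py_timestamps['request_queued']) * 1000
    print(f"queue_wait_time = {queue_wait_ms:.2f} ms")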
