Skip to content

Commit fea5bfb

Browse files
authored
[None][feat] add detailed KV cache transfer time breakdown (#8521)
Signed-off-by: zhengd-nv <[email protected]>
1 parent f444fe2 commit fea5bfb

File tree

12 files changed

+129
-104
lines changed

12 files changed

+129
-104
lines changed

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 14 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1691,22 +1691,22 @@ class GenericLlmRequest
16911691
mDecodingIter = iter;
16921692
}
16931693

1694-
void setKvCacheTransferStart(TimePoint const& time)
1694+
void setKvCacheTransferStart(TimePoint time) const
16951695
{
16961696
mPerfMetrics.timingMetrics.kvCacheTransferStart = maybeToGlobalSteadyClock(time);
16971697
}
16981698

1699-
void setKvCacheTransferEnd(TimePoint const& time)
1699+
void setKvCacheTransferEnd(TimePoint time) const
17001700
{
17011701
mPerfMetrics.timingMetrics.kvCacheTransferEnd = maybeToGlobalSteadyClock(time);
17021702
}
17031703

1704-
TimePoint getKvCacheTransferStart()
1704+
TimePoint getKvCacheTransferStart() const
17051705
{
17061706
return mPerfMetrics.timingMetrics.kvCacheTransferStart;
17071707
}
17081708

1709-
TimePoint getKvCacheTransferEnd()
1709+
TimePoint getKvCacheTransferEnd() const
17101710
{
17111711
return mPerfMetrics.timingMetrics.kvCacheTransferEnd;
17121712
}
@@ -1865,13 +1865,11 @@ class GenericLlmRequest
18651865
return mUseDraftModel;
18661866
}
18671867

1868-
// If mGlobalSteadyClockOffset is set, return a global steady clock time point, otherwise return local steady clock
1868+
// If sGlobalSteadyClockOffset is set, return a global steady clock time point, otherwise return local steady clock
18691869
// time point
1870-
[[nodiscard]] TimePoint getSteadyClockNow() const
1870+
[[nodiscard]] static TimePoint getSteadyClockNow()
18711871
{
1872-
const TimePoint time_point = std::chrono::steady_clock::now();
1873-
1874-
return maybeToGlobalSteadyClock(time_point);
1872+
return maybeToGlobalSteadyClock(std::chrono::steady_clock::now());
18751873
}
18761874

18771875
RequestIdType mRequestId;
@@ -1894,7 +1892,7 @@ class GenericLlmRequest
18941892
SizeType32 mPtableCurrentPosition{0};
18951893

18961894
// The offset between local steady clock and global steady clock (at rank 0)
1897-
inline static std::optional<Duration> mGlobalSteadyClockOffset{std::nullopt};
1895+
inline static std::optional<Duration> sGlobalSteadyClockOffset{std::nullopt};
18981896

18991897
protected:
19001898
bool mIsStreaming;
@@ -2028,9 +2026,9 @@ class GenericLlmRequest
20282026

20292027
std::optional<TensorPtr> mSkipCrossAttnBlocks{std::nullopt};
20302028

2031-
// Performance metrics.
2029+
// Performance metrics. Should be updatable even from a const LlmRequest reference.
20322030
bool mReturnPerfMetrics{false};
2033-
executor::RequestPerfMetrics mPerfMetrics;
2031+
mutable executor::RequestPerfMetrics mPerfMetrics;
20342032

20352033
// Guided decoding params.
20362034
std::optional<executor::GuidedDecodingParams> mGuidedDecodingParams{std::nullopt};
@@ -2183,16 +2181,13 @@ class GenericLlmRequest
21832181
return tensor;
21842182
}
21852183

2186-
TimePoint maybeToGlobalSteadyClock(TimePoint const& time_point) const
2184+
static TimePoint maybeToGlobalSteadyClock(TimePoint const& time_point)
21872185
{
2188-
if (mGlobalSteadyClockOffset.has_value())
2189-
{
2190-
return time_point + *mGlobalSteadyClockOffset;
2191-
}
2192-
else
2186+
if (sGlobalSteadyClockOffset.has_value())
21932187
{
2194-
return time_point;
2188+
return time_point + *sGlobalSteadyClockOffset;
21952189
}
2190+
return time_point;
21962191
}
21972192
};
21982193

cpp/include/tensorrt_llm/executor/types.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -451,7 +451,7 @@ struct RequestPerfMetrics
451451
/// @brief End time of the KV cache transfer for disaggregated serving
452452
TimePoint kvCacheTransferEnd;
453453
/// @brief KV Cache size transfer for disaggregated serving
454-
mutable size_t kvCacheSize = 0;
454+
size_t kvCacheSize = 0;
455455
};
456456

457457
struct KvCacheMetrics

cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp

Lines changed: 14 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -227,6 +227,7 @@ std::vector<size_t> CacheFormatter::pickRecvConnections(
227227
void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& session)
228228
{
229229
NVTX3_SCOPED_RANGE(CacheFormatter_format);
230+
session.setTime(TransferSession::kTimeFormatter);
230231
auto const& llmRequest = session.getLlmRequest();
231232
TLLM_LOG_DEBUG(
232233
mpi::MpiComm::world().getRank(), "Start sending KV cache for request ID: %ld.", llmRequest.mRequestId);
@@ -249,9 +250,6 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
249250
auto const numPools = blockManager.getNumPools();
250251
// TODO(oargov): are we sure the other side has the same number of pools? this might not hold for pp_size>1...
251252

252-
auto lastTokenTime = llmRequest.getPerfMetrics().timingMetrics.lastTokenTime;
253-
bool recordDelay = lastTokenTime != std::chrono::steady_clock::time_point();
254-
255253
bool layerWise = common::getEnvDisaggLayerwise() && numPools == 1;
256254
if (layerWise)
257255
{
@@ -420,6 +418,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
420418
inputKvCacheBlocksPerWindow, outputSplitCaches, destConfig, selfConfig, selfIdx, bufferManager);
421419

422420
bufferManager.getStream().synchronize();
421+
session.setTime(TransferSession::kTimePreprocess);
423422

424423
auto preAllocSendBuffer = mCacheTransBufferManager->getSendBuffer(cacheBufferId);
425424
if (preAllocSendBuffer != nullptr)
@@ -434,7 +433,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
434433
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
435434
TLLM_CHECK(connections.size() > (processIdx / peerDuplicateHeadFactor));
436435
TLLM_CHECK(outputSplitCaches.size() > (processIdx / peerDuplicateHeadFactor));
437-
auto startTime = llmRequest.getSteadyClockNow();
436+
auto startTime = LlmRequest::getSteadyClockNow();
438437

439438
size_t ppDomainSize = targetInfo.mDomainPPSize;
440439
size_t bufferTpRank = (processIdx / ppDomainSize) / peerDuplicateHeadFactor;
@@ -481,15 +480,8 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
481480
}
482481
}
483482

484-
auto endTime = llmRequest.getSteadyClockNow();
485-
double delay = 0.0;
486-
if (recordDelay)
487-
{
488-
delay = std::chrono::duration<double, std::milli>(startTime - lastTokenTime).count();
489-
}
490-
double cacheTransferTime
491-
= std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
492-
session.appendMeasure(delay, cacheTransferTime, size);
483+
auto endTime = LlmRequest::getSteadyClockNow();
484+
session.appendMeasure(startTime, endTime, size);
493485
};
494486

495487
if (connections.size() > 1)
@@ -534,8 +526,10 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
534526
{
535527
sendBufferFun(deviceId, 0);
536528
}
529+
session.setTime(TransferSession::kTimeTransmissions);
537530

538531
mCacheTransBufferManager->freeBufferIndexForSend(cacheBufferId);
532+
session.setTime(TransferSession::kTimePostprocess);
539533
}
540534
TLLM_LOG_DEBUG(
541535
mpi::MpiComm::world().getRank(), "End the sending of KV cache for the request ID:%ld ", llmRequest.mRequestId);
@@ -544,6 +538,7 @@ void CacheFormatter::format(tensorrt_llm::batch_manager::TransferSession& sessio
544538
void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& session)
545539
{
546540
NVTX3_SCOPED_RANGE(CacheFormatter_unformat);
541+
session.setTime(TransferSession::kTimeFormatter);
547542
auto const& llmRequest = session.getLlmRequest();
548543
auto const ctxReqId = llmRequest.getContextPhaseParams().value().getReqId();
549544
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
@@ -555,9 +550,6 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
555550
auto& bufferManager = session.getBufferManager();
556551
auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest, destConfig.getEnableBlockReuse());
557552

558-
auto arrivalTime = llmRequest.getPerfMetrics().timingMetrics.arrivalTime;
559-
bool recordDelay = arrivalTime != std::chrono::steady_clock::time_point();
560-
561553
auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
562554

563555
TLLM_LOG_DEBUG("pickUpConnections size: %d connections size: %d", pickUpConnections.size(), connections.size());
@@ -779,6 +771,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
779771
// sync to alloc buffer
780772
bufferManager.getStream().synchronize();
781773
}
774+
session.setTime(TransferSession::kTimePreprocess);
782775

783776
runtime::ITensor::SharedPtr preAllocRecvBuffer = nullptr;
784777
if (cacheBufferId.has_value())
@@ -794,7 +787,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
794787
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
795788
TLLM_CHECK(pickUpConnections.size() > processIdx);
796789
TLLM_CHECK(recvSplitCaches.size() > processIdx);
797-
auto startTime = llmRequest.getSteadyClockNow();
790+
auto startTime = LlmRequest::getSteadyClockNow();
798791
size_t size = 0;
799792

800793
if (processIdx >= remainNoCoverTargetNum)
@@ -835,15 +828,8 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
835828
}
836829
}
837830

838-
auto endTime = llmRequest.getSteadyClockNow();
839-
double delay = 0.0;
840-
if (recordDelay)
841-
{
842-
delay = std::chrono::duration<double, std::milli>(startTime - arrivalTime).count();
843-
}
844-
double cacheTransferTime
845-
= std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
846-
session.appendMeasure(delay, cacheTransferTime, size);
831+
auto endTime = LlmRequest::getSteadyClockNow();
832+
session.appendMeasure(startTime, endTime, size);
847833
};
848834
if (pickUpConnections.size() > 1)
849835
{
@@ -891,6 +877,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
891877
{
892878
recvBufferFun(deviceId, 0);
893879
}
880+
session.setTime(TransferSession::kTimeTransmissions);
894881

895882
{
896883
NVTX3_SCOPED_RANGE(formatInputConcatenate);
@@ -904,6 +891,7 @@ void CacheFormatter::unformat(tensorrt_llm::batch_manager::TransferSession& sess
904891
mCacheTransBufferManager->freeBufferIndexForRecv(cacheBufferId);
905892
}
906893
}
894+
session.setTime(TransferSession::kTimePostprocess);
907895
}
908896
}
909897

cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -603,7 +603,7 @@ void CacheTransceiver::checkGenTransferStatus(std::optional<int> const& atLeastR
603603
it->first->setState(LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE);
604604

605605
// Gather the kv cache transfer time from all workers and update to leader rank
606-
if (!common::getEnvKVCacheTransferOutputPath().empty())
606+
if (!common::getEnvKVCacheTimeOutputPath().empty())
607607
{
608608
auto syncComm = mCacheState->getParallelConfig().mEnableAttentionDP ? mGroupDataComm : mGroupComm;
609609
updateKVCacheTransferBW(syncComm, it->first);

cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp

Lines changed: 48 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
#include "tensorrt_llm/executor/cache_transmission/agent_utils/connection.h"
2929
#include "tensorrt_llm/runtime/common.h"
3030
#include "tensorrt_llm/runtime/utils/mpiUtils.h"
31+
#include <chrono>
3132
#include <future>
3233
#include <map>
3334
#include <memory>
@@ -105,39 +106,65 @@ void TransferSession::setLlmRequest(LlmRequest const& llmRequest)
105106
mRequest = &llmRequest;
106107
}
107108

108-
void TransferSession::appendMeasure(double delay, double duration, size_t size)
109+
void TransferSession::setTime(TimeNames name)
109110
{
110-
if (!mRecordMeasure)
111+
if (mTimes)
111112
{
112-
return;
113+
mTimes->times.at(name) = LlmRequest::getSteadyClockNow();
114+
}
115+
}
116+
117+
void TransferSession::appendMeasure(LlmRequest::TimePoint start, LlmRequest::TimePoint end, size_t size)
118+
{
119+
if (mTimes)
120+
{
121+
mTimes->measures.emplace_back(Measure{start, end, size});
113122
}
114-
auto bandwidth = size * 8 / (duration / 1000) / 1e9; // byte, ms => Gbps
115-
mMeasures.emplace_back(Measure{delay, duration, bandwidth});
116123
}
117124

118125
void TransferSession::exportMeasure(std::ofstream& outFile, bool isContext) const
119126
{
120-
if (mMeasures.empty())
127+
if (!mTimes || mTimes->measures.empty())
121128
{
122129
return;
123130
}
124131
// write header if not exist
125132
if (outFile.tellp() == 0)
126133
{
127-
outFile << "RequestID";
128-
for (size_t i = 0; i < mMeasures.size(); i++)
134+
outFile << "RequestID,RequestInfo,Preparation,Preprocess,Transmissions,Postprocess";
135+
for (size_t i = 0; i < mTimes->measures.size(); i++)
129136
{
130-
outFile << ",Delay(ms),Duration(ms),Bandwidth(Gbps)";
137+
outFile << ",Delay,Duration,Bandwidth(Gbps)";
131138
}
132139
outFile << '\n';
133140
}
134-
// write measures
141+
auto transferStart = mRequest->getPerfMetrics().timingMetrics.kvCacheTransferStart;
142+
using Milliseconds = std::chrono::duration<double, std::milli>;
143+
144+
// write measures, time is in milliseconds
135145
TLLM_CHECK(isContext || mRequest->getContextPhaseParams().has_value());
136146
auto reqId = isContext ? mRequest->mRequestId : mRequest->getContextPhaseParams().value().getReqId();
137147
outFile << reqId;
138-
for (auto const& measure : mMeasures)
148+
auto previousTime = transferStart;
149+
for (auto time : mTimes->times)
150+
{
151+
if (time == LlmRequest::TimePoint())
152+
{
153+
// timepoint is unset, skip
154+
outFile << ",0.0";
155+
continue;
156+
}
157+
double delay = Milliseconds(time - previousTime).count();
158+
previousTime = time;
159+
outFile << "," << delay;
160+
}
161+
previousTime = mTimes->times[kTimePreprocess];
162+
for (auto const& measure : mTimes->measures)
139163
{
140-
outFile << "," << measure.delay << "," << measure.duration << "," << measure.bandwidth;
164+
double delay = Milliseconds(measure.start - previousTime).count();
165+
double duration = Milliseconds(measure.end - measure.start).count();
166+
double bandwidth = static_cast<double>(measure.size) * 8.0 / duration / 1e6; // byte, ms => Gbps
167+
outFile << "," << delay << "," << duration << "," << bandwidth;
141168
}
142169
outFile << '\n' << std::flush;
143170
}
@@ -158,7 +185,7 @@ int32_t tagFromRequestId(LlmRequest::RequestIdType requestId)
158185
std::filesystem::path getTransferOutputPath(char const* tag)
159186
{
160187
namespace fs = std::filesystem;
161-
auto outputPath = common::getEnvKVCacheTransferOutputPath();
188+
auto outputPath = common::getEnvKVCacheTimeOutputPath();
162189
if (!outputPath.empty())
163190
{
164191
auto rank = mpi::MpiComm::world().getRank();
@@ -273,6 +300,7 @@ class CacheSender::Impl
273300
{
274301
std::promise<void> promise;
275302
auto future = promise.get_future();
303+
llmRequest.setKvCacheTransferStart(LlmRequest::getSteadyClockNow());
276304
{
277305
{
278306
std::scoped_lock lkResp(mSenderMutex);
@@ -309,7 +337,7 @@ class CacheSender::Impl
309337
std::unique_lock<std::mutex> lk(mMtxForMap);
310338
auto it = mRequestToSession.find(requestId);
311339
TLLM_CHECK(it != mRequestToSession.end());
312-
if (!common::getEnvKVCacheTransferOutputPath().empty())
340+
if (!common::getEnvKVCacheTimeOutputPath().empty())
313341
{
314342
if (!mMeasuresFile.is_open())
315343
{
@@ -363,7 +391,8 @@ class CacheSender::Impl
363391
auto session = TransferSession(std::vector<Connection const*>(peerRelativeRanks.size(), nullptr),
364392
DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager,
365393
info.getIndexFromEnd(), info.getLastBlockKey(), nullptr,
366-
!common::getEnvKVCacheTransferOutputPath().empty());
394+
!common::getEnvKVCacheTimeOutputPath().empty());
395+
session.setTime(TransferSession::kTimeRequestInfo);
367396
it = mRequestToSession.emplace(requestId, std::move(session)).first;
368397
}
369398
it->second.setConnection(peerIdx, connection);
@@ -382,6 +411,7 @@ class CacheSender::Impl
382411
}
383412
session->setLlmRequest(llmRequest);
384413
mFormatter->format(*session);
414+
llmRequest.setKvCacheTransferEnd(LlmRequest::getSteadyClockNow());
385415
}
386416

387417
bool cancelRequest(LlmRequest const& llmRequest)
@@ -751,7 +781,7 @@ class CacheReceiver::Impl
751781
void receiveSync(TransferSession& session)
752782
{
753783
mFormatter->unformat(session);
754-
if (!common::getEnvKVCacheTransferOutputPath().empty())
784+
if (!common::getEnvKVCacheTimeOutputPath().empty())
755785
{
756786
std::unique_lock<std::mutex> lock(mMeasuresFileMutex);
757787
if (!mMeasuresFile.is_open())
@@ -846,7 +876,7 @@ class CacheReceiver::Impl
846876
auto const& resource = getReceiveCacheResource(llmRequest);
847877
return TransferSession(std::move(counterPartConnections), DataContext{tagFromRequestId(requestId)}, mSelfState,
848878
contextState, resource->mBufferManager, requestInfo.getIndexFromEnd(), requestInfo.getLastBlockKey(),
849-
&llmRequest, !common::getEnvKVCacheTransferOutputPath().empty());
879+
&llmRequest, !common::getEnvKVCacheTimeOutputPath().empty());
850880
}
851881

852882
std::unique_ptr<ReceiveCacheResource> const& getReceiveCacheResource(LlmRequest const& llmRequest)
@@ -957,6 +987,7 @@ class CacheReceiver::Impl
957987
llmRequest.setKvCacheTransferStart(std::chrono::steady_clock::now());
958988
TLLM_CUDA_CHECK(cudaSetDevice(mDeviceId));
959989
auto session = sendRequestInfo(llmRequest);
990+
session.setTime(TransferSession::kTimeRequestInfo);
960991
bool isReady = receiveReadySignal(session);
961992
if (!isReady)
962993
{

0 commit comments

Comments (0)