diff --git a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h index 98296c8a03c..a02f523c7fd 100644 --- a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h +++ b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h @@ -26,6 +26,8 @@ #include #include #include +#include +#include using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -43,6 +45,102 @@ class BaseKVCacheManager; class CacheSender; class CacheReceiver; +struct UniqueIdSendMessage +{ +public: + UniqueIdSendMessage(RequestIdType generationRequestId, std::string const& serverUuid) + : mGenerationRequestId(generationRequestId) + , mServerUuid(serverUuid) + { + } + + serializedSize() const + { + return sizeof(RequestIdType) + mServerUuid.size(); + } + + void serialize(std::ostream& os) const + { + os.write(reinterpret_cast(&mGenerationRequestId), sizeof(RequestIdType)); + os.write(mServerUuid.c_str(), mServerUuid.size()); + } + + static UniqueIdSendMessage deserialize(std::istream& is) + { + is.read(reinterpret_cast(&mGenerationRequestId), sizeof(RequestIdType)); + mServerUuid.resize(is.readsome()); + is.read(mServerUuid.data(), mServerUuid.size()); + return UniqueIdSendMessage(mGenerationRequestId, mServerUuid); + } + + RequestIdType mGenerationRequestId; + std::string mServerUuid; +}; + +class UniqueIdGenerator +{ +public: + static int get() + { + std::lock_guard lock(mMutex); + if (!mReleasedIds.empty()) + { + int id = *mReleasedIds.begin(); + mReleasedIds.erase(mReleasedIds.begin()); + return id; + } + return mNextId++; + } + + static void release(int id) + { + std::lock_guard lock(mMutex); + if (id < mNextId) + { + mReleasedIds.insert(id); + } + } + +private: + static std::mutex mMutex; + static int mNextId; + static std::set mReleasedIds; +}; + +class UniqueIdServer +{ +public: + UniqueIdServer() + { + mThread = std::thread( + [this]() + { + int id = UniqueIdGenerator::get(); + while (true) + { + int command; + 
mpi::MpiComm::session().sendRecv( + &id, &command, 1, mpi::MpiType::kINT32, 0, mpi::MpiTag::kUNIQUE_ID_TAG); + if (command != 0) + { + UniqueIdGenerator::release(command); + } + else + { + id = UniqueIdGenerator::get(); + } + } + }); + } + +private: + std::thread mThread; +}; + +inline std::mutex UniqueIdGenerator::mMutex; +inline int UniqueIdGenerator::mNextId = 1; +inline std::set UniqueIdGenerator::mReleasedIds; + class CacheTransceiverFactory { public: @@ -132,6 +230,8 @@ class CacheTransceiver : public BaseCacheTransceiver // this is used to defer dependency resolution until needed. static std::mutex mDllMutex; void* mWrapperLibHandle{nullptr}; + std::string mUuid; + std::unique_ptr mUniqueIdServer; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h b/cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h index 5d570f03f16..e4526974164 100644 --- a/cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h +++ b/cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h @@ -360,6 +360,8 @@ class MpiComm void sendRawTag(void const* buffer, std::size_t size, MpiType dtype, int dest, int tag) const; void send(void const* buffer, std::size_t size, MpiType dtype, int dest, MpiTag tag) const; void send(runtime::IBuffer const& buf, int dest, MpiTag tag) const; + void sendRecv( + void const* sendbuf, void* recvbuf, int sendCount, int recvCount, MpiType dtype, int dest, MpiTag tag) const; template void sendValue(T const& value, int dest, MpiTag tag) const diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index d832a80b358..0bb23713249 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -49,6 +49,9 @@ #include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" #include +#include +#include +#include #include #include #include @@ -117,6 +120,21 @@ 
CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa : mMpiGroupComm(std::addressof(tensorrt_llm::mpi::MpiComm::session())) , mCacheTransceiverConfig{cacheTransceiverConfig} { + // Broadcast rank 0 UUID to all other ranks + if (worldConfig.getRank() == 0) + { + boost::uuids::random_generator uuidGen; + mUuid = boost::uuids::to_string(uuidGen()); + std::vector uuidVec(mUuid.begin(), mUuid.end()); + mMpiGroupComm->bcast(uuidVec, 0); + mUniqueIdServer = std::make_unique(); + } + else + { + std::vector uuidVec; + mMpiGroupComm->bcast(uuidVec, 0); + mUuid.assign(uuidVec.begin(), uuidVec.end()); + } using tensorrt_llm::batch_manager::kv_cache_manager::CacheFormatter; if (worldConfig.isTensorParallel()) { @@ -199,9 +217,10 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa auto makeFormatter = [cacheManager, isMLA, this]() { return createCacheFormatter(cacheManager, mCacheTransBufferManager.get(), isMLA); }; - mCacheSender = std::make_unique(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()); + mCacheSender + = std::make_unique(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter(), mUuid); mCacheReceiver - = std::make_unique(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()); + = std::make_unique(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter(), mUuid); initializeCommState(); } diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp index fe30046df98..38e743ec448 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp @@ -178,30 +178,42 @@ struct ReceiveCacheResource } }; -RequestInfo::RequestInfo(LlmRequest::RequestIdType requestId, executor::DataTransceiverState transState) - : mRequestId{requestId} +RequestInfo::RequestInfo(LlmRequest::RequestIdType contextRequestId, LlmRequest::RequestIdType generationRequestId, 
+ executor::DataTransceiverState transState, std::string const& serverUuid) + : mContextRequestId{contextRequestId} + , mGenerationRequestId{generationRequestId} , mTransState{std::move(transState)} + , mServerUuid{serverUuid} { } -RequestInfo::RequestInfo(LlmRequest::RequestIdType requestId, executor::DataTransceiverState transState, - int32_t indexFromEnd, BlockKey const& lastBlockKey) - : mRequestId{requestId} +RequestInfo::RequestInfo(LlmRequest::RequestIdType contextRequestId, LlmRequest::RequestIdType generationRequestId, + executor::DataTransceiverState transState, int32_t indexFromEnd, BlockKey const& lastBlockKey, + std::string const& serverUuid) + : mContextRequestId{contextRequestId} + , mGenerationRequestId{generationRequestId} , mIndexFromEnd{indexFromEnd} , mLastBlockKey{lastBlockKey} , mTransState{std::move(transState)} + , mServerUuid{serverUuid} { } bool RequestInfo::operator==(RequestInfo const& rhs) const { - return mRequestId == rhs.mRequestId && mIndexFromEnd == rhs.mIndexFromEnd && mLastBlockKey == rhs.mLastBlockKey - && mTransState == rhs.mTransState; + return mContextRequestId == rhs.mContextRequestId && mGenerationRequestId == rhs.mGenerationRequestId + && mIndexFromEnd == rhs.mIndexFromEnd && mLastBlockKey == rhs.mLastBlockKey && mTransState == rhs.mTransState + && mServerUuid == rhs.mServerUuid; } -LlmRequest::RequestIdType RequestInfo::getRequestId() const noexcept +LlmRequest::RequestIdType RequestInfo::getContextRequestId() const noexcept { - return mRequestId; + return mContextRequestId; +} + +LlmRequest::RequestIdType RequestInfo::getGenerationRequestId() const noexcept +{ + return mGenerationRequestId; } executor::DataTransceiverState const& RequestInfo::getTransState() const noexcept @@ -212,30 +224,37 @@ executor::DataTransceiverState const& RequestInfo::getTransState() const noexcep void RequestInfo::serialize(RequestInfo const& requestInfo, std::ostream& os) { namespace su = executor::serialize_utils; - 
su::serialize(requestInfo.mRequestId, os); + su::serialize(requestInfo.mContextRequestId, os); + su::serialize(requestInfo.mGenerationRequestId, os); su::serialize(requestInfo.mIndexFromEnd, os); su::serialize(requestInfo.mLastBlockKey, os); su::serialize(requestInfo.mTransState, os); + su::serialize(requestInfo.mServerUuid, os); } RequestInfo RequestInfo::deserialize(std::istream& is) { namespace su = executor::serialize_utils; - auto requestId = su::deserialize(is); + auto contextRequestId = su::deserialize(is); + auto generationRequestId = su::deserialize(is); auto indexFromEnd = su::deserialize(is); auto lastBlockKey = su::deserialize(is); auto transState = su::deserialize(is); - return RequestInfo{requestId, std::move(transState), indexFromEnd, lastBlockKey}; + auto serverUuid = su::deserialize(is); + return RequestInfo{ + contextRequestId, generationRequestId, std::move(transState), indexFromEnd, lastBlockKey, serverUuid}; } std::size_t RequestInfo::serializedSize(RequestInfo const& requestInfo) { namespace su = executor::serialize_utils; std::size_t totalSize = 0; - totalSize += su::serializedSize(requestInfo.mRequestId); + totalSize += su::serializedSize(requestInfo.mContextRequestId); + totalSize += su::serializedSize(requestInfo.mGenerationRequestId); totalSize += su::serializedSize(requestInfo.mIndexFromEnd); totalSize += su::serializedSize(requestInfo.mLastBlockKey); totalSize += su::serializedSize(requestInfo.mTransState); + totalSize += su::serializedSize(requestInfo.mServerUuid); return totalSize; } @@ -245,11 +264,12 @@ class CacheSender::Impl using RequestIdType = LlmRequest::RequestIdType; Impl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState, - SizeType32 selfIndex, std::unique_ptr formatter) + SizeType32 selfIndex, std::unique_ptr formatter, std::string const& serverUuid) : mManager{manager} , mSelfState{std::move(selfCacheState), executor::kv_cache::CommState{manager->getCommState()}} , 
mFormatter{std::move(formatter)} , mBufferManager{std::make_shared()} + , mServerUuid{serverUuid} { TLLM_CHECK(mManager); TLLM_CHECK(mManager->getCommState().getSelfIdx() == selfIndex); @@ -338,7 +358,8 @@ class CacheSender::Impl info = RequestInfo::deserialize(iss); } - auto requestId = info.getRequestId(); + auto requestId = info.getContextRequestId(); + TLLM_CHECK_WITH_INFO(mFormatter->inquireSupport( mSelfState.getCacheState().value(), info.getTransState().getCacheState().value()), "Disagg server does not currently support these cacheState, please check the cacheState of the context and " @@ -352,14 +373,17 @@ class CacheSender::Impl peerRelativeRanks.begin(), peerRelativeRanks.end(), info.getTransState().getCommState()->getSelfIdx())); { std::unique_lock lk(mMtxForMap); - auto it = mRequestToSession.find(requestId); - if (it == mRequestToSession.end()) + auto key = std::make_pair(info.getGenerationRequestId(), info.getServerUuid()); + auto it = mUniqueIdToSession.find(key); + if (it == mUniqueIdToSession.end()) { + // TODO: get the unique ID from the server. 
+ int uniqueId = -1; auto session = TransferSession(std::vector(peerRelativeRanks.size(), nullptr), DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager, info.getIndexFromEnd(), info.getLastBlockKey(), nullptr, - !common::getEnvKVCacheTransferOutputPath().empty()); - it = mRequestToSession.emplace(requestId, std::move(session)).first; + !common::getEnvKVCacheTransferOutputPath().empty(), uniqueId); + it = mUniqueIdToSession.emplace(key, std::move(session)).first; } it->second.setConnection(peerIdx, connection); } @@ -371,9 +395,10 @@ class CacheSender::Impl TransferSession* session = nullptr; { std::unique_lock lk(mMtxForMap); - auto it = mRequestToSession.find(llmRequest.mRequestId); - TLLM_CHECK(it != mRequestToSession.end()); - session = std::addressof(it->second); + auto it = mRequestIdToUniqueId.find(llmRequest.mRequestId); + TLLM_CHECK(it != mRequestIdToUniqueId.end()); + auto key = std::make_pair(it->second.first, it->second.second); /* mUniqueIdToSession is keyed by the stored pair's (first, second), not by a pair nested in a pair */ + session = std::addressof(mUniqueIdToSession.find(key)->second); } session->setLlmRequest(llmRequest); mFormatter->format(*session); @@ -486,16 +511,13 @@ class CacheSender::Impl { break; } - if (!mReadyResponses.empty()) - { - auto const& requestInfo = recvRequestInfo(); - auto reqId = requestInfo.getRequestId(); + auto const& requestInfo = recvRequestInfo(); + auto reqId = requestInfo.getContextRequestId(); + auto key = std::make_pair(requestInfo.getGenerationRequestId(), requestInfo.getServerUuid()); - mCurrentRequest = reqId; - if (mRemainSendCount.find(reqId) == mRemainSendCount.end()) - { - mRemainSendCount[reqId] = getCounterpartsCount(reqId); - } + if (mRemainSendCount.find(key) == mRemainSendCount.end()) + { + mRemainSendCount[key] = getCounterpartsCount(key); } auto it = getCurrentResponse(); if (it != mReadyResponses.end()) @@ -577,13 +599,14 @@ class CacheSender::Impl std::atomic mAnyReady{false}, mTerminate{false}; std::condition_variable mSenderCv, mResponderCv; std::future 
mResponseFuture; - std::unordered_map mRemainSendCount; + std::unordered_map, int> mRemainSendCount; AsyncSendResource mAsyncSendResource; std::vector> mAsyncSendFutures; int mDeviceId{-1}; executor::kv_cache::ConnectionManager* mManager; - std::map mRequestToSession; + std::map, TransferSession> mUniqueIdToSession; + std::map> mRequestIdToUniqueId; executor::DataTransceiverState mSelfState; std::unique_ptr mFormatter; std::mutex mMtxForMap; @@ -595,11 +618,12 @@ class CacheReceiver::Impl { public: Impl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState, - SizeType32 selfIndex, std::unique_ptr formatter) + SizeType32 selfIndex, std::unique_ptr formatter, std::string const& serverUuid) : mManager{manager} , mSelfState{std::move(selfCacheState), executor::kv_cache::CommState{manager->getCommState()}} , mFormatter{std::move(formatter)} , mBufferManager{std::make_shared()} + , mServerUuid{serverUuid} { TLLM_CHECK(mManager); TLLM_CHECK(mManager->getCommState().getSelfIdx() == selfIndex); @@ -672,7 +696,7 @@ class CacheReceiver::Impl TLLM_CHECK_WITH_INFO(mFormatter->inquireSupport(mSelfState.getCacheState().value(), destCacheState), "Disagg server does not currently support these cacheState."); - RequestInfo requestInfo(requestId, mSelfState); + RequestInfo requestInfo(requestId, mSelfState, mServerUuid); if (mFormatter->getCacheManager()->getBlockManager().getNumPools() == 1) { @@ -698,7 +722,7 @@ class CacheReceiver::Impl TLLM_CHECK_WITH_INFO(requestedBlockSize > 0, "requestedBlockSize must be > 0"); int32_t indexFromEnd = requestedBlockSize - 1; - requestInfo = RequestInfo(requestId, mSelfState, indexFromEnd, lastBlockKey); + requestInfo = RequestInfo(requestId, mSelfState, indexFromEnd, lastBlockKey, mServerUuid); } auto* agentConnectionManager = dynamic_cast(mManager); @@ -938,8 +962,9 @@ void CacheReceiver::ImplDeleter::operator()(Impl* ptr) } CacheSender::CacheSender(executor::kv_cache::ConnectionManager* manager, 
executor::kv_cache::CacheState selfCacheState, - SizeType32 selfIndex, std::unique_ptr formatter) - : mImpl{std::unique_ptr(new Impl(manager, selfCacheState, selfIndex, std::move(formatter)))} + SizeType32 selfIndex, std::unique_ptr formatter, std::string const& serverUuid) + : mImpl{std::unique_ptr( + new Impl(manager, selfCacheState, selfIndex, std::move(formatter), serverUuid))} { } @@ -971,8 +996,10 @@ RequestInfo CacheSender::recvRequestInfo() } CacheReceiver::CacheReceiver(executor::kv_cache::ConnectionManager* manager, - executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr formatter) - : mImpl{std::unique_ptr(new Impl(manager, selfCacheState, selfIndex, std::move(formatter)))} + executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr formatter, + std::string const& serverUuid) + : mImpl{std::unique_ptr( + new Impl(manager, selfCacheState, selfIndex, std::move(formatter), serverUuid))} { } diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h index 47f1a9bc1dd..0cc5189a76a 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h @@ -65,7 +65,7 @@ class TransferSession TransferSession(std::vector connections, DataContext dataContext, executor::DataTransceiverState const& selfState, executor::DataTransceiverState otherState, runtime::BufferManager const& bufferManager, int32_t indexFromEnd, BlockKey const& lastBlockKey, - LlmRequest const* llmRequest = nullptr, bool recordMeasure = false) + LlmRequest const* llmRequest = nullptr, bool recordMeasure = false, int32_t uniqueId = -1) : mConnections(std::move(connections)) , mDataContext(std::move(dataContext)) , mSelfState(&selfState) @@ -76,6 +76,7 @@ class TransferSession , mRecordMeasure(recordMeasure) , mIndexFromEnd(indexFromEnd) , mLastBlockKey(lastBlockKey) + , mUniqueId(uniqueId) { TLLM_CHECK(!mConnections.empty()); } @@ -117,6 
+118,11 @@ class TransferSession return mLastBlockKey; } + [[nodiscard]] int32_t getUniqueId() const + { + return mUniqueId; + } + private: std::vector mConnections; DataContext mDataContext; @@ -128,6 +134,7 @@ class TransferSession bool mRecordMeasure{false}; int32_t mIndexFromEnd{0}; BlockKey mLastBlockKey{}; + int32_t mUniqueId{-1}; }; using UniqueToken = tensorrt_llm::runtime::UniqueToken; @@ -144,6 +151,7 @@ struct TransceiverTag static constexpr int32_t kID_TAG{19}; static constexpr int32_t kINFO_SIZE_TAG{22}; static constexpr int32_t kINFO_TAG{32}; + static constexpr int32_t kUNIQUE_ID_TAG{42}; }; // Used to store the information that needs to be sent to the context executor to ensure the generation @@ -152,21 +160,25 @@ class RequestInfo { public: /// @brief Constructor. - /// @param requestId The ID used in the context phase of the current request. + /// @param contextRequestId,generationRequestId The IDs used in the context phase and the generation phase of the current request, respectively. /// @param transState The state of the data transceiver. - RequestInfo(LlmRequest::RequestIdType requestId, executor::DataTransceiverState transState); + /// @param serverUuid The generation server UUID. + RequestInfo(LlmRequest::RequestIdType contextRequestId, LlmRequest::RequestIdType generationRequestId, + executor::DataTransceiverState transState, std::string const& serverUuid); - RequestInfo(LlmRequest::RequestIdType requestId, executor::DataTransceiverState transState, int32_t indexFromEnd, - BlockKey const& lastBlockKey); + RequestInfo(LlmRequest::RequestIdType contextRequestId, LlmRequest::RequestIdType generationRequestId, + executor::DataTransceiverState transState, int32_t indexFromEnd, BlockKey const& lastBlockKey, + std::string const& serverUuid); RequestInfo() = default; /// @brief Equality comparison operator. /// @param rhs The right operand of the operator. [[nodiscard]] bool operator==(RequestInfo const& rhs) const; - /// @brief Return the ID used in the context phase of the current request. 
+ /// @brief Return the ID used in the context phase of the current request (getGenerationRequestId returns the generation-phase ID). /// @return The request ID. - [[nodiscard]] LlmRequest::RequestIdType getRequestId() const noexcept; + [[nodiscard]] LlmRequest::RequestIdType getContextRequestId() const noexcept; + [[nodiscard]] LlmRequest::RequestIdType getGenerationRequestId() const noexcept; [[nodiscard]] int32_t getIndexFromEnd() const noexcept { @@ -197,8 +209,9 @@ class RequestInfo [[nodiscard]] static std::size_t serializedSize(RequestInfo const& requestInfo); private: - // The ID used in the context phase of the current request. - LlmRequest::RequestIdType mRequestId; + // The IDs used in the context phase and the generation phase of the current request, respectively. + LlmRequest::RequestIdType mContextRequestId; + LlmRequest::RequestIdType mGenerationRequestId; // Index from end indicating how many trailing blocks to transfer (index+1) int32_t mIndexFromEnd{0}; @@ -207,6 +220,9 @@ class RequestInfo // The state of the data transceiver. executor::DataTransceiverState mTransState; + + // The server UUID. + std::string mServerUuid; }; class CacheSender @@ -214,7 +230,7 @@ class CacheSender public: /// @brief Constructor. CacheSender(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState, - SizeType32 selfIndex, std::unique_ptr formatter); + SizeType32 selfIndex, std::unique_ptr formatter, std::string const& serverUuid); CacheSender() = default; @@ -259,7 +275,7 @@ class CacheReceiver public: /// @brief Constructor. 
CacheReceiver(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState, - SizeType32 selfIndex, std::unique_ptr formatter); + SizeType32 selfIndex, std::unique_ptr formatter, std::string const& serverUuid); CacheReceiver() = default; diff --git a/cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp b/cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp index ad44d885608..82e66c21f15 100644 --- a/cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp @@ -325,6 +325,22 @@ void MpiComm::sendRawTag(void const* buffer, size_t size, MpiType dtype, int des TLLM_LOG_DEBUG("end MPI_Send with dest %d, tag %d, size %d", dest, tag, size); } +void MpiComm::sendRecv( + void const* sendbuf, void* recvbuf, int sendCount, int recvCount, MpiType dtype, int dest, MpiTag tag) const +{ + TLLM_LOG_DEBUG( + "start MPI_Sendrecv with dest %d, tag %d, sendCount %d, recvCount %d", dest, tag, sendCount, recvCount); +#if ENABLE_MULTI_DEVICE + MPI_Status status{}; + invokeChunked(MPI_Sendrecv, sendbuf, sendCount, getMpiDtype(dtype), dest, static_cast(tag), recvbuf, recvCount, + getMpiDtype(dtype), dest, static_cast(tag), mComm, &status); +#else + TLLM_THROW("Multi device support is disabled."); +#endif // ENABLE_MULTI_DEVICE + TLLM_LOG_DEBUG( + "end MPI_Sendrecv with dest %d, tag %d, sendCount %d, recvCount %d", dest, tag, sendCount, recvCount); +} + void MpiComm::send(void const* buffer, size_t size, MpiType dtype, int dest, MpiTag tag) const { sendRawTag(buffer, size, dtype, dest, static_cast(tag));