From 3565bfdf6d37e769f334ca0e2290f5927b3a9725 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 24 Jan 2025 14:25:31 -0800
Subject: [PATCH] Renaming channels (#436)

Renamed `ProxyChannel` to `PortChannel` and `SmChannel` to `MemoryChannel`

---
 README.md                                     |  12 +-
 apps/nccl/src/allgather.hpp                   |  62 ++--
 apps/nccl/src/allreduce.hpp                   |  50 +--
 apps/nccl/src/broadcast.hpp                   |  76 ++--
 apps/nccl/src/nccl.cu                         | 136 +++---
 docs/design/design.md                         |  38 +-
 docs/design/mscclpp-dsl.md                    |   8 +-
 docs/getting-started/tutorials/index.rst      |   4 +-
 .../tutorials/initialization.md               |  16 +-
 .../tutorials/memory-channel.md               |   3 +
 .../getting-started/tutorials/port-channel.md |   3 +
 .../tutorials/proxy-channel.md                |   3 -
 docs/getting-started/tutorials/python-api.md  |   8 +-
 docs/getting-started/tutorials/sm-channel.md  |   3 -
 include/mscclpp/memory_channel.hpp            |  50 +++
 ...l_device.hpp => memory_channel_device.hpp} |  15 +-
 .../{proxy_channel.hpp => port_channel.hpp}   |  70 ++--
 ...nel_device.hpp => port_channel_device.hpp} |  40 +-
 include/mscclpp/semaphore.hpp                 |  16 +-
 include/mscclpp/semaphore_device.hpp          |   8 +-
 include/mscclpp/sm_channel.hpp                |  47 ---
 python/examples/allgather_barrier.py          |   6 +-
 python/examples/send_recv_packet.py           |   6 +-
 python/examples/send_recv_proxy.py            |  10 +-
 python/mscclpp/__init__.py                    |  89 ++++-
 python/mscclpp/comm.py                        |  44 ++-
 python/mscclpp/core_py.cpp                    |   8 +-
 python/mscclpp/language/collectives.py        |   3 -
 .../mscclpp/language/dag/instruction_dag.py   |   2 +-
 python/mscclpp/language/dag/optimizer.py      |  22 +-
 python/mscclpp/language/ir.py                 |   2 +-
 python/mscclpp/language/program.py            |  26 +-
 python/mscclpp/language/types.py              |   8 +-
 python/mscclpp/memory_channel_py.cpp          |  35 ++
 ...oxy_channel_py.cpp => port_channel_py.cpp} |  40 +-
 python/mscclpp/semaphore_py.cpp               |  18 +-
 python/mscclpp/sm_channel_py.cpp              |  35 --
 python/mscclpp_benchmark/allreduce.cu         | 186 +++++-----
 python/mscclpp_benchmark/mscclpp_op.py        |  68 ++--
 python/test/d2d_semaphore_test.cu             |   2 +-
 ...channel_test.cu => memory_channel_test.cu} |   5 +-
 python/test/nvls_test.cu                      |   2 +-
 ...y_channel_test.cu => port_channel_test.cu} |   6 +-
 python/test/test_mscclpp.py                   |  34 +-
 src/executor/execution_plan.cc                |  40 +-
 src/executor/executor.cc                      |  50 +--
 src/include/execution_common.hpp              |  16 +-
 src/include/execution_kernel.hpp              | 132 +++----
 src/include/execution_plan.hpp                |   4 +-
 src/{sm_channel.cc => memory_channel.cc}      |  10 +-
 src/{proxy_channel.cc => port_channel.cc}     |  31 +-
 src/semaphore.cc                              |   8 +-
 test/allgather_test_cpp.cu                    |  68 ++--
 test/mp_unit/CMakeLists.txt                   |   4 +-
 ...annel_tests.cu => memory_channel_tests.cu} | 144 ++++----
 test/mp_unit/mp_unit_tests.hpp                |  17 +-
 ...channel_tests.cu => port_channel_tests.cu} | 140 +++----
 test/mscclpp-test/allgather_test.cu           | 200 +++++-----
 test/mscclpp-test/allreduce_test.cu           | 342 +++++++++---------
 test/mscclpp-test/alltoall_test.cu            |  30 +-
 test/mscclpp-test/common.cc                   |  41 ++-
 test/mscclpp-test/common.hpp                  |  12 +-
 test/mscclpp-test/sendrecv_test.cu            |  32 +-
 63 files changed, 1373 insertions(+), 1273 deletions(-)
 create mode 100644 docs/getting-started/tutorials/memory-channel.md
 create mode 100644 docs/getting-started/tutorials/port-channel.md
 delete mode 100644 docs/getting-started/tutorials/proxy-channel.md
 delete mode 100644 docs/getting-started/tutorials/sm-channel.md
 create mode 100644 include/mscclpp/memory_channel.hpp
 rename include/mscclpp/{sm_channel_device.hpp => memory_channel_device.hpp} (97%)
 rename include/mscclpp/{proxy_channel.hpp => port_channel.hpp} (59%)
 rename include/mscclpp/{proxy_channel_device.hpp =>
port_channel_device.hpp} (87%)
 delete mode 100644 include/mscclpp/sm_channel.hpp
 create mode 100644 python/mscclpp/memory_channel_py.cpp
 rename python/mscclpp/{proxy_channel_py.cpp => port_channel_py.cpp} (54%)
 delete mode 100644 python/mscclpp/sm_channel_py.cpp
 rename python/test/{sm_channel_test.cu => memory_channel_test.cu} (83%)
 rename python/test/{proxy_channel_test.cu => port_channel_test.cu} (85%)
 rename src/{sm_channel.cc => memory_channel.cc} (54%)
 rename src/{proxy_channel.cc => port_channel.cc} (68%)
 rename test/mp_unit/{sm_channel_tests.cu => memory_channel_tests.cu} (64%)
 rename test/mp_unit/{proxy_channel_tests.cu => port_channel_tests.cu} (74%)

diff --git a/README.md b/README.md
index 4127f8b8e..03d894fca 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,8 @@ The following highlights key concepts of MSCCL++.
 MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as device functions to be called from GPU kernel code. For example, the `put()` method in the following example copies 1KB of data from the local GPU to a remote GPU.
 ```cpp
-// `ProxyChannel` will be explained in the following section.
-__device__ mscclpp::DeviceHandle<mscclpp::ProxyChannel> channel;
+// `PortChannel` will be explained in the following section.
+__device__ mscclpp::DeviceHandle<mscclpp::PortChannel> channel;
 __global__ void gpuKernel() {
   ...
   // Only one thread is needed for this method.
@@ -79,15 +79,15 @@ __device__ void barrier() {
 MSCCL++ provides consistent interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand).
-### ProxyChannel and SmChannel
+### PortChannel and MemoryChannel
-MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel**. `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `ProxyChannel` requires only a single GPU thread to call its methods. See all `ProxyChannel` methods from [here](./include/mscclpp/proxy_channel_device.hpp).
+MSCCL++ delivers two types of channels, **PortChannel** and **MemoryChannel**. `PortChannel` provides port-mapping-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy, which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `PortChannel` requires only a single GPU thread to call its methods. See all `PortChannel` methods from [here](./include/mscclpp/port_channel_device.hpp).
-On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space.
Comparing against `ProxyChannel`, `SmChannel` is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all `SmChannel` methods from [here](./include/mscclpp/sm_channel_device.hpp). +On the other hand, `MemoryChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Comparing against `PortChannel`, `MemoryChannel` is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all `MemoryChannel` methods from [here](./include/mscclpp/memory_channel_device.hpp). ### Host-Side Communication Proxy -MSCCL++ provides a default implementation of a host-side proxy for ProxyChannels, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++. +MSCCL++ provides a default implementation of a host-side proxy for PortChannels, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++. ```cpp // Bootstrap: initialize control-plane connections between all ranks diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp index 59aedbb49..af4a6808a 100644 --- a/apps/nccl/src/allgather.hpp +++ b/apps/nccl/src/allgather.hpp @@ -7,14 +7,14 @@ #include #include #include -#include -#include +#include +#include #include "common.hpp" template __global__ void __launch_bounds__(1024, 1) - allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, + allgather6(void* sendbuff, mscclpp::DeviceHandle* memoryChannels, size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; @@ -24,11 +24,11 @@ __global__ void __launch_bounds__(1024, 1) const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = smChannels + chanOffset; + auto memChans = memoryChannels + chanOffset; if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } __syncthreads(); @@ -49,16 +49,16 @@ __global__ void __launch_bounds__(1024, 1) const size_t peerIdx = wid % nPeer; const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); + char* src = reinterpret_cast(memChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); + memChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + 
memChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { - smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); + memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } } @@ -67,16 +67,16 @@ __global__ void __launch_bounds__(1024, 1) const size_t peerIdx = gWid % nPeer; const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); + char* src = reinterpret_cast(memChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); + memChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + memChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { - smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); + memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } } @@ -90,15 +90,15 @@ __global__ void __launch_bounds__(1024, 1) : unitBytesPerWarp; if (remainBytes > 0) { if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); + char* src = reinterpret_cast(memChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); - smChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, - WARP_SIZE); - smChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, - WARP_SIZE); + memChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); + memChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); } else { - smChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE); + memChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE); } } } @@ -106,14 +106,14 @@ __global__ void __launch_bounds__(1024, 1) deviceSyncer.sync(gridDim.x); if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } } template cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff, - mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, int rank, + mscclpp::DeviceHandle* memoryChannels, size_t channelOutOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { int nBlocks = 28; if (nelems <= 4096) { @@ -123,7 +123,7 @@ cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* } else if (nelems >= 2097152) { nBlocks = 35; } - allgather6<<>>((void*)buff, smChannels, channelOutOffset, rank, worldSize, + 
allgather6<<>>((void*)buff, memoryChannels, channelOutOffset, rank, worldSize, nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); } diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 41342413b..e53cd1f11 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -8,9 +8,9 @@ #include #include #include +#include +#include #include -#include -#include #if defined(ENABLE_NPKIT) #include @@ -196,7 +196,7 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { template __global__ void __launch_bounds__(32, 1) - allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes @@ -213,10 +213,10 @@ __global__ void __launch_bounds__(32, 1) uint32_t* src = (uint32_t*)((char*)buff); uint32_t* dst = (uint32_t*)((char*)resultBuff); - __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; const int lid = tid % WARP_SIZE; if (lid < nPeers) { - channels[lid] = smChannels[lid]; + channels[lid] = memoryChannels[lid]; } __syncwarp(); @@ -240,7 +240,7 @@ __global__ void __launch_bounds__(32, 1) template __global__ void __launch_bounds__(1024, 1) - allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag #if defined(ENABLE_NPKIT) @@ -304,10 +304,10 @@ __global__ void __launch_bounds__(1024, 1) uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // Put channels into shared memory, read channel info from global memory is unexpectable slow. 
- __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; const int lid = tid % WARP_SIZE; if (lid < nPeers) { - channels[lid] = smChannels[lid]; + channels[lid] = memoryChannels[lid]; } __syncwarp(); @@ -361,16 +361,16 @@ __global__ void __launch_bounds__(1024, 1) template __global__ void __launch_bounds__(512, 1) - allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, + allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, + mscclpp::DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems) { const int nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); const size_t nInt4PerRank = nInt4 / worldSize; - auto smChans = smChannels + chanOffset; - auto smOutChans = smOutChannels + chanOffset; + auto memoryChans = memoryChannels + chanOffset; + auto memoryOutChans = memoryOutChannels + chanOffset; int4* buff4 = reinterpret_cast(buff); int4* scratch4 = reinterpret_cast((char*)scratch + channelScratchOffset); @@ -396,12 +396,12 @@ __global__ void __launch_bounds__(512, 1) const size_t scratchChunkRankOffset = chunkSizePerRank * rank; const size_t scratchBaseOffsetInt4 = channelScratchOffset / sizeof(int4); - __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; - __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; const int lid = threadIdx.x % WARP_SIZE; if (lid < nPeer) { - channels[lid] = smChans[lid]; - outChannels[lid] = smOutChans[lid]; + channels[lid] = memoryChans[lid]; + outChannels[lid] = memoryOutChans[lid]; } __syncwarp(); @@ -496,8 +496,8 @@ __global__ void __launch_bounds__(512, 1) } template -cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, size_t channelInOffset, +cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, + mscclpp::DeviceHandle* memoryOutChannels, size_t channelInOffset, size_t channelOutOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; @@ -505,9 +505,9 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< if (sizeof(T) * nelems < worldSize * sizeof(int)) { int nBlocks = 7; int nThreadsPerBlock = 32; - allreduceAllToAll<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, - channelScratchOffset, rank, nRanksPerNode, worldSize, - nelems, flag++); + allreduceAllToAll<<>>(buff, scratch, resultBuff, memoryChannels, + channelInOffset, channelScratchOffset, rank, + nRanksPerNode, worldSize, nelems, flag++); } else if (sizeof(T) * nelems <= (1 << 20)) { int nBlocks = 28; int nThreadsPerBlock = 1024; @@ -518,17 +518,17 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< #if defined(ENABLE_NPKIT) size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent); allreduce7<<>>( - buff, scratch, resultBuff, smChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, - nelems, 
flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); + buff, scratch, resultBuff, memoryChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, + worldSize, nelems, flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, + allreduce7<<>>(buff, scratch, resultBuff, memoryChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, flag++); #endif } else { int nBlocks = 35; int nThreadsPerBlock = 512; - allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, + allreduce8<<>>(buff, scratch, resultBuff, memoryChannels, memoryOutChannels, channelOutOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems); } diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index e9a9111f6..6d52c963d 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -7,25 +7,25 @@ #include #include #include -#include -#include +#include +#include #include "common.hpp" template __global__ void __launch_bounds__(1024, 1) - broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle* smChannels, - [[maybe_unused]] size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root, - size_t nRanksPerNode, size_t nelemsPerGPU) { + broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, + mscclpp::DeviceHandle* memoryChannels, [[maybe_unused]] size_t channelOutOffset, + size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nThread = blockDim.x * gridDim.x; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - __shared__ mscclpp::DeviceHandle smChans[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle memChans[NRANKS_PER_NODE - 1]; if (threadIdx.x < nPeer) { - smChans[threadIdx.x] = smChannels[chanOffset + threadIdx.x]; - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } __syncthreads(); @@ -55,23 +55,23 @@ __global__ void __launch_bounds__(1024, 1) if (rank == root) { char* send_ = reinterpret_cast(sendbuff); for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); // Peer's scratchbuff. - smChans[peerIdx].copy<16, false>(dst + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); // Peer's scratchbuff. + memChans[peerIdx].copy<16, false>(dst + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); __syncthreads(); - if (threadIdx.x == peerIdx) smChans[peerIdx].signal(); + if (threadIdx.x == peerIdx) memChans[peerIdx].signal(); } if constexpr (IsOutOfPlace) { char* recv_ = reinterpret_cast(recvbuff); - smChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); + memChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); } } else { // rank != root. - if (threadIdx.x == peerRootIdx) smChans[peerRootIdx].wait(); + if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait(); __syncthreads(); char* recv_ = reinterpret_cast(recvbuff); char* scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. 
- smChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset, unitBytesPerBlock, threadIdx.x, - blockDim.x); + memChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset, unitBytesPerBlock, threadIdx.x, + blockDim.x); } } @@ -81,30 +81,30 @@ __global__ void __launch_bounds__(1024, 1) scratchSub = -i * unitBytes; deviceSyncer.sync(gridDim.x); if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } } if (rank == root) { char* send_ = reinterpret_cast(sendbuff); for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); // Peer's scratchbuff. - smChans[peerIdx].copy<16, false>(dst + offset + scratchSub, send_ + offset, unitBytesPerBlock, threadIdx.x, - blockDim.x); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); // Peer's scratchbuff. + memChans[peerIdx].copy<16, false>(dst + offset + scratchSub, send_ + offset, unitBytesPerBlock, threadIdx.x, + blockDim.x); __syncthreads(); - if (threadIdx.x == peerIdx) smChans[peerIdx].signal(); + if (threadIdx.x == peerIdx) memChans[peerIdx].signal(); } if constexpr (IsOutOfPlace) { char* recv_ = reinterpret_cast(recvbuff); - smChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); + memChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); } } else { // rank != root. - if (threadIdx.x == peerRootIdx) smChans[peerRootIdx].wait(); + if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait(); __syncthreads(); char* recv_ = reinterpret_cast(recvbuff); char* scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. - smChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset + scratchSub, unitBytesPerBlock, - threadIdx.x, blockDim.x); + memChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset + scratchSub, unitBytesPerBlock, + threadIdx.x, blockDim.x); } } @@ -116,23 +116,23 @@ __global__ void __launch_bounds__(1024, 1) if (rank == root) { char* send_ = reinterpret_cast(sendbuff); for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); // Peer's scratchbuff. - smChans[peerIdx].copy<16, true>(dst + offset + scratchSub, send_ + offset, remainBytes, threadIdx.x, - blockDim.x); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); // Peer's scratchbuff. + memChans[peerIdx].copy<16, true>(dst + offset + scratchSub, send_ + offset, remainBytes, threadIdx.x, + blockDim.x); __syncthreads(); - if (threadIdx.x == peerIdx) smChans[peerIdx].signal(); + if (threadIdx.x == peerIdx) memChans[peerIdx].signal(); } if constexpr (IsOutOfPlace) { char* recv_ = reinterpret_cast(recvbuff); - smChans[0].copy<16, true>(recv_ + offset, send_ + offset, remainBytes, threadIdx.x, blockDim.x); + memChans[0].copy<16, true>(recv_ + offset, send_ + offset, remainBytes, threadIdx.x, blockDim.x); } } else { // rank != root. - if (threadIdx.x == peerRootIdx) smChans[peerRootIdx].wait(); + if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait(); __syncthreads(); char* recv_ = reinterpret_cast(recvbuff); char* scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. 
- smChans[peerRootIdx].copy<16, true>(recv_ + offset, scratch_ + offset + scratchSub, remainBytes, threadIdx.x, - blockDim.x); + memChans[peerRootIdx].copy<16, true>(recv_ + offset, scratch_ + offset + scratchSub, remainBytes, threadIdx.x, + blockDim.x); } } // remainBytes > 0. } @@ -140,13 +140,13 @@ __global__ void __launch_bounds__(1024, 1) deviceSyncer.sync(gridDim.x); if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } } template -cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, +cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelOutOffset, int rank, int nRanksPerNode, int root, int worldSize, size_t nelems, cudaStream_t stream) { int nBlocks = 7; @@ -157,7 +157,7 @@ cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< // } else if (nelems >= 2097152) { // nBlocks = 35; // } - broadcast6<<>>((void*)buff, (void*)scratch, (void*)resultBuff, smChannels, + broadcast6<<>>((void*)buff, (void*)scratch, (void*)resultBuff, memoryChannels, channelOutOffset, rank, worldSize, root, nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index f91d15e69..3daadf8a3 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -7,8 +7,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -69,14 +69,14 @@ struct hash { } // namespace std struct ChannelInfo { - std::vector smChannels; - std::shared_ptr> smChannelDeviceHandles; + std::vector memoryChannels; + std::shared_ptr> memoryChannelDeviceHandles; }; struct ncclComm { std::shared_ptr comm; std::vector> connections; - std::vector> smSemaphores; + std::vector> memorySemaphores; std::shared_ptr executor; std::unordered_map> executionPlans; @@ -148,16 +148,15 @@ static std::vector setupRemoteMemories(std::shared_pt return remoteMemories; } -static std::vector setupSmChannels(ncclComm_t comm, - const std::vector& remoteMemories, - void* src) { - std::vector channels; - std::vector>& smSemaphores = comm->smSemaphores; +static std::vector setupMemoryChannels( + ncclComm_t comm, const std::vector& remoteMemories, void* src) { + std::vector channels; + std::vector>& memorySemaphores = comm->memorySemaphores; size_t nConnections = comm->connections.size(); for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { for (size_t cid = 0; cid < nConnections; ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr); + channels.emplace_back(memorySemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr); } } } @@ -171,15 +170,16 @@ static std::pair loadExecutionPlan(const std return std::make_pair(collective, executionPlanInstance{key, plan}); } -static std::shared_ptr> setupSmChannelDeviceHandles( - const std::vector& smChannels) { - std::vector> smChannelDeviceHandles; - std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> ptr = - mscclpp::detail::gpuCallocShared>(smChannelDeviceHandles.size()); - mscclpp::gpuMemcpy>(ptr.get(), smChannelDeviceHandles.data(), - smChannelDeviceHandles.size(), 
cudaMemcpyHostToDevice); +static std::shared_ptr> setupMemoryChannelDeviceHandles( + const std::vector& memoryChannels) { + std::vector> memoryChannelDeviceHandles; + std::transform(memoryChannels.begin(), memoryChannels.end(), std::back_inserter(memoryChannelDeviceHandles), + [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); }); + std::shared_ptr> ptr = + mscclpp::detail::gpuCallocShared>( + memoryChannelDeviceHandles.size()); + mscclpp::gpuMemcpy>( + ptr.get(), memoryChannelDeviceHandles.data(), memoryChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } @@ -211,28 +211,28 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, int rank = comm->comm->bootstrap()->getRank(); channelKey sendKey{(void*)sendBasePtr, sendBytes}; channelKey recvKey{(void*)recvBasePtr, recvBytes}; - mscclpp::DeviceHandle* smChannels = nullptr; - mscclpp::DeviceHandle* smOutChannels = nullptr; + mscclpp::DeviceHandle* memoryChannels = nullptr; + mscclpp::DeviceHandle* memoryOutChannels = nullptr; // Creating the channels if (count * ncclTypeSize(datatype) <= (1 << 20)) { auto sendIt = comm->channelScratchInfos.find(sendKey); if (sendIt == comm->channelScratchInfos.end()) { - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + std::vector channels = + setupMemoryChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; sendIt = comm->channelScratchInfos.emplace(sendKey, channelInfo).first; } - smChannels = sendIt->second.smChannelDeviceHandles.get(); + memoryChannels = sendIt->second.memoryChannelDeviceHandles.get(); } else { std::vector remoteMemories; auto sendIt = comm->channelInInfos.find(sendKey); if (sendIt == comm->channelInInfos.end()) { - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + std::vector channels = + setupMemoryChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first; } @@ -240,37 +240,37 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, if (recvIt == comm->channelOutInfos.end()) { remoteMemories = setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc); - std::vector outChannels = - setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); - ChannelInfo channelInfo{outChannels, setupSmChannelDeviceHandles(outChannels)}; + std::vector outChannels = + setupMemoryChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + ChannelInfo channelInfo{outChannels, setupMemoryChannelDeviceHandles(outChannels)}; recvIt = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } - smChannels = sendIt->second.smChannelDeviceHandles.get(); - smOutChannels = recvIt->second.smChannelDeviceHandles.get(); + memoryChannels = sendIt->second.memoryChannelDeviceHandles.get(); + memoryOutChannels = recvIt->second.memoryChannelDeviceHandles.get(); } switch (datatype) { case ncclFloat16: - CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, 
smOutChannels, - offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, memoryChannels, + memoryOutChannels, offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: - CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, smChannels, - smOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), + CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, memoryChannels, + memoryOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclBfloat16: CUDACHECK(allreduce((__bfloat16*)sendbuff, (__bfloat16*)comm->scratchBuff.get(), (__bfloat16*)recvbuff, - smChannels, smOutChannels, offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, + memoryChannels, memoryOutChannels, offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclInt32: case ncclUint32: - CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, - offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, - comm->comm->bootstrap()->getNranks(), count, stream)); + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, memoryChannels, + memoryOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), + NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; default: WARN("datatype is invalid"); @@ -304,27 +304,27 @@ static ncclResult_t ncclAllGatherFallback(const void* sendbuff, void* recvbuff, channelKey recvKey{(void*)recvBasePtr, recvBytes}; int rank = comm->comm->bootstrap()->getRank(); int nRank = comm->comm->bootstrap()->getNranks(); - mscclpp::DeviceHandle* smChannels = nullptr; + mscclpp::DeviceHandle* memoryChannels = nullptr; auto it = comm->channelOutInfos.find(recvKey); if (it == comm->channelOutInfos.end()) { std::vector remoteMemories = setupRemoteMemories( comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, mscclpp::Transport::CudaIpc); - std::vector channels = - setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + std::vector channels = + setupMemoryChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + std::vector> memoryChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(memoryChannelDeviceHandles), + [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); }); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; it = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } - smChannels = it->second.smChannelDeviceHandles.get(); + memoryChannels = it->second.memoryChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { - CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + 
CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, memoryChannels, offsetOut, rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } else { - CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, memoryChannels, offsetOut, rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } @@ -346,19 +346,19 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections), [](const auto& future) { return future.get(); }); - std::vector> smSemaphores; + std::vector> memorySemaphores; for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smSemaphores.emplace_back( - std::make_shared(*(mscclppComm), connections[cid])); + memorySemaphores.emplace_back( + std::make_shared(*(mscclppComm), connections[cid])); } } } mscclppComm->setup(); commPtr->connections = std::move(connections); - commPtr->smSemaphores = std::move(smSemaphores); + commPtr->memorySemaphores = std::move(memorySemaphores); commPtr->buffFlag = 0; commPtr->numScratchBuff = 2; commPtr->scratchBuff = mscclpp::GpuBuffer(SCRATCH_SIZE).memory(); @@ -584,29 +584,29 @@ NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff channelKey recvKey{(void*)0x0, 0}; // Just create the channel once. int rank = comm->comm->bootstrap()->getRank(); int nRank = comm->comm->bootstrap()->getNranks(); - mscclpp::DeviceHandle* smChannels = nullptr; + mscclpp::DeviceHandle* memoryChannels = nullptr; auto it = comm->channelOutInfos.find(recvKey); if (it == comm->channelOutInfos.end()) { // std::vector remoteMemories = setupRemoteMemories( // comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, mscclpp::Transport::CudaIpc); - // std::vector channels = - // setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)recvBasePtr)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + // std::vector channels = + // setupMemoryChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + std::vector channels = + setupMemoryChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)recvBasePtr)); + std::vector> memoryChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(memoryChannelDeviceHandles), + [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); }); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; it = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } - smChannels = it->second.smChannelDeviceHandles.get(); + memoryChannels = it->second.memoryChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff) { - CUDACHECK(broadcast((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut, + CUDACHECK(broadcast((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, memoryChannels, offsetOut, rank, NRANKS_PER_NODE, root, nRank, 
bytes / sizeof(int), stream));
   } else {
-    CUDACHECK(broadcast<true>((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut,
+    CUDACHECK(broadcast<true>((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, memoryChannels, offsetOut,
                               rank, NRANKS_PER_NODE, root, nRank, bytes / sizeof(int), stream));
   }

diff --git a/docs/design/design.md b/docs/design/design.md
index c67e4d62a..eb0b59327 100644
--- a/docs/design/design.md
+++ b/docs/design/design.md
@@ -33,17 +33,17 @@ __global__ void gpuKernel() {
 ```
 MSCCL++ also provides efficient synchronization methods, `signal()`, `flush()`, and `wait()`. We will discuss these methods in the following sections.
-#### SmChannel & ProxyChannel
-MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel**. `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, ProxyChannel requires only a single GPU thread to call its methods. See all `ProxyChannel` methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/proxy_channel_device.hpp).
+#### MemoryChannel & PortChannel
+MSCCL++ delivers two types of channels, **PortChannel** and **MemoryChannel**. `PortChannel` provides port-mapping-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy, which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, PortChannel requires only a single GPU thread to call its methods. See all `PortChannel` methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/port_channel_device.hpp).
-On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Comparing against ProxyChannel, SmChannel is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all SmChannel methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/sm_channel_device.hpp).
+On the other hand, `MemoryChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Compared with PortChannel, MemoryChannel is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all MemoryChannel methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/memory_channel_device.hpp).
 ### Fifo & Trigger
 One of the key features of MSCCL++ is to offload the communication logic from the GPU to the CPU. To achieve this, MSCCL++ introduces the concepts of `Fifo` and `Trigger`. A Fifo is a circular buffer that is shared between the GPU and the CPU; it is used to store `Trigger`s. A `Trigger` is a signal sent from the GPU to the CPU to notify the CPU that there are commands in the Fifo that need to be processed. The CPU then processes the commands in the Fifo and sends a signal back to the GPU to notify it that the commands have been processed. The implementation details of Fifo and Trigger can be found in the following sections.
 ### ProxyService
-Proxy service is a persistent service that resides in the CPU side. It functions as a polling service that receives the message `Trigger` from the GPU side and then transfers data according to the command. When we use `ProxyChannel` for communication, a `Trigger` is sent from the GPU side to the `ProxyService`. Then `ProxyService` will invoke `cudaMemcpy*` or `IB verbs` to transfer data to the targe device.
+The proxy service is a persistent service that resides on the CPU side. It functions as a polling service that receives `Trigger` messages from the GPU side and then transfers data according to the command. When we use `PortChannel` for communication, a `Trigger` is sent from the GPU side to the `ProxyService`. Then `ProxyService` invokes `cudaMemcpy*` or IB verbs to transfer data to the target device.
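For intuition, the following is a minimal sketch of this trigger/FIFO flow. All names here (`TriggerSketch`, `FifoSketch`, `postTrigger`, `proxyLoopSketch`) and the field layout are invented for illustration; the library's actual `Fifo`/`Trigger` types pack a command into a compact entry and handle wrap-around and flushing more carefully.

```cpp
// GPU-to-CPU command flow, simplified. A trigger describes one copy command.
struct TriggerSketch {
  uint64_t srcOffset;  // read offset in the source buffer
  uint64_t dstOffset;  // write offset in the destination buffer
  uint64_t size;       // bytes to transfer
  uint64_t seq;        // nonzero while the slot holds a pending command
};

// Ring buffer allocated in host-pinned memory, visible to both GPU and CPU.
struct FifoSketch {
  TriggerSketch* slots;  // `capacity` entries, zero-initialized
  uint64_t capacity;
};

__device__ unsigned long long gTail;  // next slot to produce into

// Device side: a single GPU thread posts a command and returns immediately.
__device__ void postTrigger(FifoSketch fifo, TriggerSketch trig) {
  unsigned long long tail = atomicAdd(&gTail, 1);
  volatile TriggerSketch* slot = &fifo.slots[tail % fifo.capacity];
  slot->srcOffset = trig.srcOffset;
  slot->dstOffset = trig.dstOffset;
  slot->size = trig.size;
  __threadfence_system();  // make the payload visible to the CPU first
  slot->seq = tail + 1;    // publish last; nonzero marks the slot full
}

// Host side: the proxy thread busy-polls the FIFO and executes each command.
void proxyLoopSketch(FifoSketch fifo, char* src, char* dst, cudaStream_t stream) {
  for (uint64_t head = 0;; ++head) {
    volatile TriggerSketch* slot = &fifo.slots[head % fifo.capacity];
    while (slot->seq == 0) {  // busy poll, as the ProxyService does
    }
    cudaMemcpyAsync(dst + slot->dstOffset, src + slot->srcOffset, slot->size,
                    cudaMemcpyDeviceToDevice, stream);
    cudaStreamSynchronize(stream);  // complete the transfer ("flush" semantics)
    slot->seq = 0;                  // mark the slot reusable
  }
}
```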
 ## Implementation
@@ -60,18 +60,18 @@ MSCCL++ offers one-sided communication methods directly callable from a GPU kernel
 This operation is executed within a kernel launched with a single block.
 ```cpp
 // Running on rank 0
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
-  smChannel[0].put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memoryChannel) {
+  memoryChannel[0].put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
   __syncthreads();
   if (threadIdx.x == 0) {
-    smChannel[0].signal();
+    memoryChannel[0].signal();
   }
 }
 // Running on rank 1
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memoryChannel) {
   if (threadIdx.x == 0) {
-    smChannel[0].wait();
+    memoryChannel[0].wait();
   }
   __syncthreads();
   // Data is ready to use
@@ -81,14 +81,14 @@ __device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
 Similar to the LL protocol offered by NCCL, MSCCL++ introduces a `Packet` structure designed to facilitate the transfer of both data and flags within a single instruction, proving particularly beneficial for applications where latency is a critical concern. The following code shows the basic usage of the `Packet` structure. The flag should be the same on the sender and receiver sides.
 ```cpp
 // Running on rank 0
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChans, int flag) {
-  smChans[0].putPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memChans, int flag) {
+  memChans[0].putPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
                          /*flag=*/ flag);
 }
 // Running on rank 1
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChans, int flag) {
-  smChans[0].getPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memChans, int flag) {
+  memChans[0].getPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
                          /*flag=*/ flag);
   // Data is ready to use
 }
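To make the `Packet` idea concrete, here is a simplified sketch of such a packet. The type below is invented for illustration and is not the library's actual packet definition; the key point is that two data words and two copies of the flag travel in one 16-byte volatile store, so polling the flags is enough to know the data has arrived.

```cpp
struct PacketSketch {
  uint32_t data1, flag1, data2, flag2;

  // Sender: one vectorized volatile store keeps data and flags inseparable.
  __device__ void write(uint32_t d1, uint32_t d2, uint32_t flag) {
    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(this), "r"(d1), "r"(flag),
                 "r"(d2), "r"(flag));
  }

  // Receiver: poll until both flag copies match, then the data words are valid.
  __device__ uint2 read(uint32_t flag) {
    uint32_t d1, f1, d2, f2;
    do {
      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
                   : "=r"(d1), "=r"(f1), "=r"(d2), "=r"(f2)
                   : "l"(this));
    } while (f1 != flag || f2 != flag);
    return make_uint2(d1, d2);
  }
};
```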
@@ -117,11 +117,11 @@ In this section, we will discuss several use cases that demonstrate the capabilities of MSCCL++.
 MSCCL++ enables the offloading of communication logic from the GPU to the CPU, facilitating the overlapping of communication and computation processes. The code snippet provided illustrates this overlapping technique. In the depicted scenario, the GPU emits a signal to the CPU indicating readiness for data transfer. Subsequently, while the GPU continues to execute computation tasks, the CPU initiates the data transfer to the designated target device.
 ```cpp
-__device__ void gpuKernel(mscclpp::ProxyChannelDeviceHandle* proxyChannel) {
+__device__ void gpuKernel(mscclpp::PortChannelDeviceHandle* portChannel) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   // Send a trigger to the CPU
   if (tid == 0) {
-    proxyChannel[0].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
+    portChannel[0].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
   }
   // Continue computation
   matrixMul()
@@ -138,18 +138,18 @@ Traditional communication libraries enforce a separation between communication and computation.
 MSCCL++ offers a low-level communication API, allowing users to design customized collective communication algorithms. The following code demonstrates how to implement a customized All2All algorithm using MSCCL++.
 ```cpp
 using DeviceHandle = mscclpp::DeviceHandle<mscclpp::ProxyChannel>;
-__device__ void localAlltoall(DeviceHandle* proxyChans, int rank,
+__device__ void localAlltoall(DeviceHandle* portChans, int rank,
                               int nRanksPerNode, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   for (int i = 1; i < nRanksPerNode; i++) {
-    DeviceHandle proxyChan = proxyChans[blockIdx.x];
+    DeviceHandle portChan = portChans[blockIdx.x];
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
-      proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+      portChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
                                      nElements * sizeof(int));
     }
     // wait for the data from GPU (rank-i) % nranksPerNode to arrive
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank - i + nRanksPerNode) % nRanksPerNode) {
-      proxyChan.wait();
+      portChan.wait();
     }
     deviceSyncer.sync(nRanksPerNode - 1);
   }

diff --git a/docs/design/mscclpp-dsl.md b/docs/design/mscclpp-dsl.md
index 9b34b29f0..9b6955e81 100644
--- a/docs/design/mscclpp-dsl.md
+++ b/docs/design/mscclpp-dsl.md
@@ -72,11 +72,11 @@ The operation can only be applied to the chunks.
 We provide a set of communication operations.
 ***Please notice***: MSCCLPPLang only provides one-sided communication operations. The user needs to make sure that the data is ready to be sent or received before calling the communication operations. Also, we provide `wait/signal` operations to synchronize the communication across GPUs.
 #### Channel
-A channel is a communication channel between two GPUs. It is used to send and receive data between GPUs. We supports three types of channel: `ChannelType.sm`, `ChannelType.proxy` and `ChannelType.nvls`.
+A channel is a communication channel between two GPUs. It is used to send and receive data between GPUs. We support three types of channels: `ChannelType.memory`, `ChannelType.port` and `ChannelType.nvls`.
-`ChannelType.sm` is used for communication between GPUs on the same node. This channel uses GPU processors to transfer data.
+`ChannelType.memory` is used for communication between GPUs on the same node. This channel uses GPU processors to transfer data.
-`ChannelType.proxy` is used for communication between GPUs, whether they are on different nodes or the same node. This channel will offload the data transfer to CPU processors, which can provide better throughput compared to `ChannelType.sm`. However, this comes at the cost of higher latency compared to `ChannelType.sm`.
+`ChannelType.port` is used for communication between GPUs, whether they are on different nodes or the same node. This channel offloads the data transfer to CPU processors, which can provide better throughput compared to `ChannelType.memory`. However, this comes at the cost of higher latency compared to `ChannelType.memory`.
 `ChannelType.nvls` is used for communication between GPUs on the same node. This feature offloads the data processing task to the switch, requiring specific hardware support. Refer to the [NVIDIA documentation](https://www.nvidia.com/en-us/data-center/nvlink/) for more details.
@@ -85,7 +85,7 @@ We can assign operations to a thread block. The thread block is a group of threads.
 #### Instance
 An instance is a parallel execution of the program. For example, if a collective algorithm is designed to run on `n` chunks with `m` thread blocks, setting the instance to 2 will run the algorithm on `2n` chunks with `2m` thread blocks. Several replication policies are supported, including `duplicated` and `interleaved`.
-- `duplicated`: Each chunk is split into smaller parts based on the number of instances, duplicating the same instructions for all parts. For example, ChunkA is split into ChunkA0 and ChunkA1, while ChunkB is split into ChunkB0 and ChunkB1. Both ChunkA0 and ChunkA1 belong to Instance 0, and both ChunkB0 and ChunkB1 belong to Instance 1. 
+- `duplicated`: Each chunk is split into smaller parts based on the number of instances, duplicating the same instructions for all parts. For example, ChunkA is split into ChunkA0 and ChunkA1, while ChunkB is split into ChunkB0 and ChunkB1. Both ChunkA0 and ChunkA1 belong to Instance 0, and both ChunkB0 and ChunkB1 belong to Instance 1.
 - `interleaved`: Assign chunks to instances in an interleaved manner. For example, ChunkA and ChunkB are split into ChunkA0, ChunkA1, ChunkB0, and ChunkB1. ChunkA0 and ChunkB0 belong to Instance 0, while ChunkA1 and ChunkB1 belong to Instance 1.
 #### Instruction Fusion

diff --git a/docs/getting-started/tutorials/index.rst b/docs/getting-started/tutorials/index.rst
index 7ee91b194..c43207edd 100644
--- a/docs/getting-started/tutorials/index.rst
+++ b/docs/getting-started/tutorials/index.rst
@@ -9,8 +9,8 @@ This tutorial section provides a step-by-step guide to help you get started with MSCCL++.
    :hidden:
    initialization
-   proxy-channel
-   sm-channel
+   port-channel
+   memory-channel
    packet-api
    customized-proxy-service
    python-api

diff --git a/docs/getting-started/tutorials/initialization.md b/docs/getting-started/tutorials/initialization.md
index b1d4c1d26..4b9327fa6 100644
--- a/docs/getting-started/tutorials/initialization.md
+++ b/docs/getting-started/tutorials/initialization.md
@@ -13,7 +13,7 @@ We will setup a mesh topology with eight GPUs. Each GPU will be connected to its
 ```cpp
 #include
 #include
-#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/port_channel.hpp>
 #include
 #include
@@ -21,7 +21,7 @@ We will setup a mesh topology with eight GPUs. Each GPU will be connected to its
 template <typename T>
 using DeviceHandle = mscclpp::DeviceHandle<T>;
-__constant__ DeviceHandle<mscclpp::ProxyChannel> constProxyChans[8];
+__constant__ DeviceHandle<mscclpp::PortChannel> constPortChans[8];
 void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
   std::string ip_port = "10.0.0.4:50000";
@@ -55,17 +55,17 @@ void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
   comm.setup();
-  std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannels;
+  std::vector<DeviceHandle<mscclpp::PortChannel>> portChannels;
   for (size_t i = 0; i < semaphoreIds.size(); ++i) {
-    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::ProxyChannel(
-        proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
+    portChannels.push_back(mscclpp::deviceHandle(mscclpp::PortChannel(
+        proxyService.portChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
         proxyService.addMemory(localMemories[i]))));
   }
-  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::ProxyChannel>)) {
+  if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle<mscclpp::PortChannel>)) {
     std::runtime_error("unexpected error");
   }
-  CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
-                               sizeof(DeviceHandle<mscclpp::ProxyChannel>) * proxyChannels.size()));
+  CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(),
+                               sizeof(DeviceHandle<mscclpp::PortChannel>) * portChannels.size()));
 }
 ```

diff --git a/docs/getting-started/tutorials/memory-channel.md b/docs/getting-started/tutorials/memory-channel.md
new file mode 100644
index 000000000..d6f78e32e
--- /dev/null
+++ b/docs/getting-started/tutorials/memory-channel.md
@@ -0,0 +1,3 @@
+# Using MemoryChannel for Intra-Node Communication
+
+TBU

diff --git a/docs/getting-started/tutorials/port-channel.md b/docs/getting-started/tutorials/port-channel.md
new file mode 100644
index 000000000..a4db69854
--- /dev/null
+++ b/docs/getting-started/tutorials/port-channel.md
@@ -0,0 +1,3 @@
+# Offload communication to CPU with PortChannel
+
+TBU

diff --git a/docs/getting-started/tutorials/proxy-channel.md b/docs/getting-started/tutorials/proxy-channel.md
deleted file mode 100644
index fec5c4cc0..000000000
--- a/docs/getting-started/tutorials/proxy-channel.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Offload commnunication to CPU with ProxyChannel
-
-TBU

diff --git a/docs/getting-started/tutorials/python-api.md b/docs/getting-started/tutorials/python-api.md
index c2f26c23f..cac195c93 100644
--- a/docs/getting-started/tutorials/python-api.md
+++ b/docs/getting-started/tutorials/python-api.md
@@ -35,7 +35,7 @@ if __name__ == "__main__":
     nelems = 1024
     memory = GpuBuffer(nelems, dtype=cp.int32)
proxy_service = ProxyService() - simple_channels = group.make_proxy_channels(proxy_service, memory, connections) + simple_channels = group.make_port_channels(proxy_service, memory, connections) proxy_service.start_proxy() mscclpp_group.barrier() launch_kernel(mscclpp_group.my_rank, mscclpp_group.nranks, simple_channels, memory) @@ -48,7 +48,7 @@ We provide some Python utils to help you launch kernel via python. Here is a exa ```python from mscclpp.utils import KernelBuilder, pack -def launch_kernel(my_rank: int, nranks: int, simple_channels: List[ProxyChannel], memory: cp.ndarray): +def launch_kernel(my_rank: int, nranks: int, simple_channels: List[PortChannel], memory: cp.ndarray): file_dir = os.path.dirname(os.path.abspath(__file__)) kernel = KernelBuilder(file="test.cu", kernel_name="test", file_dir=file_dir).get_compiled_kernel() params = b"" @@ -74,11 +74,11 @@ def launch_kernel(my_rank: int, nranks: int, simple_channels: List[ProxyChannel] The test kernel is defined in `test.cu` as follows: ```cuda #include -#include <mscclpp/proxy_channel_device.hpp> +#include <mscclpp/port_channel_device.hpp> // be careful about using channels[my_rank] as it is invalid and is there just for simplicity of indexing extern "C" __global__ void __launch_bounds__(1024, 1) - proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks, + port_channel(mscclpp::PortChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements) { int tid = threadIdx.x; int nthreads = blockDim.x; diff --git a/docs/getting-started/tutorials/sm-channel.md b/docs/getting-started/tutorials/sm-channel.md deleted file mode 100644 index 191e47b36..000000000 --- a/docs/getting-started/tutorials/sm-channel.md +++ /dev/null @@ -1,3 +0,0 @@ -# Using SmChannel for Intra-Node Communication - -TBU diff --git a/include/mscclpp/memory_channel.hpp b/include/mscclpp/memory_channel.hpp new file mode 100644 index 000000000..533907b9c --- /dev/null +++ b/include/mscclpp/memory_channel.hpp @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_MEMORY_CHANNEL_HPP_ +#define MSCCLPP_MEMORY_CHANNEL_HPP_ + +#include + +#include "core.hpp" +#include "memory_channel_device.hpp" +#include "semaphore.hpp" + +namespace mscclpp { + +/// Channel for accessing peer memory directly from GPU threads. +struct MemoryChannel { + private: + std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore_; + RegisteredMemory dst_; + void* src_; + void* getPacketBuffer_; + + public: + /// Constructor. + MemoryChannel() = default; + + /// Constructor. + /// @param semaphore The semaphore used to synchronize the communication. + /// @param dst Registered memory of the destination. + /// @param src The source memory address. + /// @param getPacketBuffer The optional buffer used for @ref getPackets(). + MemoryChannel(std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore, RegisteredMemory dst, void* src, + void* getPacketBuffer = nullptr); + + /// Device-side handle for @ref MemoryChannel. + using DeviceHandle = MemoryChannelDeviceHandle; + + /// Returns the device-side handle. + /// + /// User should make sure the MemoryChannel is not released when using the returned handle. + /// + DeviceHandle deviceHandle() const; +}; + +/// @deprecated Use @ref MemoryChannel instead.
+[[deprecated("Use MemoryChannel instead.")]] typedef MemoryChannel SmChannel; + +} // namespace mscclpp + +#endif // MSCCLPP_MEMORY_CHANNEL_HPP_ diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/memory_channel_device.hpp similarity index 97% rename from include/mscclpp/sm_channel_device.hpp rename to include/mscclpp/memory_channel_device.hpp index e49a431b7..d49eb4def 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/memory_channel_device.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCLPP_SM_CHANNEL_DEVICE_HPP_ -#define MSCCLPP_SM_CHANNEL_DEVICE_HPP_ +#ifndef MSCCLPP_MEMORY_CHANNEL_DEVICE_HPP_ +#define MSCCLPP_MEMORY_CHANNEL_DEVICE_HPP_ #include "semaphore_device.hpp" #if defined(MSCCLPP_DEVICE_COMPILE) @@ -42,9 +42,9 @@ MSCCLPP_DEVICE_INLINE void copy(T* dst, T* src, uint64_t numElems, uint32_t thre #endif // defined(MSCCLPP_DEVICE_COMPILE) -/// Channel for accessing peer memory directly from SM. -struct SmChannelDeviceHandle { - SmDevice2DeviceSemaphoreDeviceHandle semaphore_; +/// Device-side handle of a MemoryChannel. +struct MemoryChannelDeviceHandle { + MemoryDevice2DeviceSemaphoreDeviceHandle semaphore_; void* src_; void* dst_; void* getPacketBuffer_; @@ -276,6 +276,9 @@ struct SmChannelDeviceHandle { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; +/// @deprecated Use @ref MemoryChannelDeviceHandle instead. +[[deprecated("Use MemoryChannelDeviceHandle instead.")]] typedef MemoryChannelDeviceHandle SmChannelDeviceHandle; + } // namespace mscclpp -#endif // MSCCLPP_SM_CHANNEL_DEVICE_HPP_ +#endif // MSCCLPP_MEMORY_CHANNEL_DEVICE_HPP_ diff --git a/include/mscclpp/proxy_channel.hpp b/include/mscclpp/port_channel.hpp similarity index 59% rename from include/mscclpp/proxy_channel.hpp rename to include/mscclpp/port_channel.hpp index 4f2978f75..3d5a62843 100644 --- a/include/mscclpp/proxy_channel.hpp +++ b/include/mscclpp/port_channel.hpp @@ -1,18 +1,18 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCLPP_PROXY_CHANNEL_HPP_ -#define MSCCLPP_PROXY_CHANNEL_HPP_ +#ifndef MSCCLPP_PORT_CHANNEL_HPP_ +#define MSCCLPP_PORT_CHANNEL_HPP_ #include "core.hpp" +#include "port_channel_device.hpp" #include "proxy.hpp" -#include "proxy_channel_device.hpp" #include "semaphore.hpp" namespace mscclpp { -struct BaseProxyChannel; -struct ProxyChannel; +struct BasePortChannel; +struct PortChannel; /// Base class for proxy services. Proxy services are used to proxy data between devices. class BaseProxyService { @@ -49,17 +49,17 @@ class ProxyService : public BaseProxyService { /// @return The semaphore. std::shared_ptr semaphore(SemaphoreId id) const; - /// Get a base proxy channel by semaphore ID. + /// Get a base port channel by semaphore ID. /// @param id The ID of the semaphore. - /// @return The base proxy channel. - BaseProxyChannel baseProxyChannel(SemaphoreId id); + /// @return The base port channel. + BasePortChannel basePortChannel(SemaphoreId id); - /// Get a proxy channel by semaphore ID and memory regions. + /// Get a port channel by semaphore ID and memory regions. /// @param id The ID of the semaphore. /// @param dst The destination memory region. /// @param src The source memory region. - /// @return The proxy channel. - ProxyChannel proxyChannel(SemaphoreId id, MemoryId dst, MemoryId src); + /// @return The port channel. + PortChannel portChannel(SemaphoreId id, MemoryId dst, MemoryId src); /// Start the proxy service. 
void startProxy(); @@ -79,8 +79,8 @@ class ProxyService : public BaseProxyService { ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw); }; -/// Proxy channel. -struct BaseProxyChannel { +/// Port channel without specifying source/destination memory regions. +struct BasePortChannel { protected: SemaphoreId semaphoreId_; @@ -89,34 +89,34 @@ struct BaseProxyChannel { std::shared_ptr proxy_; public: - BaseProxyChannel() = default; + BasePortChannel() = default; - BaseProxyChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, - std::shared_ptr proxy); + BasePortChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, + std::shared_ptr proxy); - BaseProxyChannel(const BaseProxyChannel& other) = default; + BasePortChannel(const BasePortChannel& other) = default; - BaseProxyChannel& operator=(BaseProxyChannel& other) = default; + BasePortChannel& operator=(BasePortChannel& other) = default; - /// Device-side handle for @ref BaseProxyChannel. - using DeviceHandle = BaseProxyChannelDeviceHandle; + /// Device-side handle for @ref BasePortChannel. + using DeviceHandle = BasePortChannelDeviceHandle; /// Returns the device-side handle. /// - /// User should make sure the BaseProxyChannel is not released when using the returned handle. + /// User should make sure the BasePortChannel is not released when using the returned handle. /// DeviceHandle deviceHandle() const; }; -/// A common form of proxy channel with a single destination and source memory region. -struct ProxyChannel : public BaseProxyChannel { +/// Port channel. +struct PortChannel : public BasePortChannel { private: MemoryId dst_; MemoryId src_; public: /// Default constructor. - ProxyChannel() = default; + PortChannel() = default; /// Constructor. /// @param semaphoreId The ID of the semaphore. @@ -124,25 +124,31 @@ struct ProxyChannel : public BaseProxyChannel { /// @param proxy The proxy. /// @param dst The destination memory region. /// @param src The source memory region. - ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, std::shared_ptr proxy, - MemoryId dst, MemoryId src); + PortChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, std::shared_ptr proxy, + MemoryId dst, MemoryId src); /// Copy constructor. - ProxyChannel(const ProxyChannel& other) = default; + PortChannel(const PortChannel& other) = default; /// Assignment operator. - ProxyChannel& operator=(ProxyChannel& other) = default; + PortChannel& operator=(PortChannel& other) = default; - /// Device-side handle for @ref ProxyChannel. - using DeviceHandle = ProxyChannelDeviceHandle; + /// Device-side handle for @ref PortChannel. + using DeviceHandle = PortChannelDeviceHandle; /// Returns the device-side handle. /// - /// User should make sure the ProxyChannel is not released when using the returned handle. + /// User should make sure the PortChannel is not released when using the returned handle. /// DeviceHandle deviceHandle() const; }; +/// @deprecated Use @ref BasePortChannel instead. +[[deprecated("Use BasePortChannel instead.")]] typedef BasePortChannel BaseProxyChannel; + +/// @deprecated Use @ref PortChannel instead. 
+[[deprecated("Use PortChannel instead.")]] typedef PortChannel ProxyChannel; + } // namespace mscclpp -#endif // MSCCLPP_PROXY_CHANNEL_HPP_ +#endif // MSCCLPP_PORT_CHANNEL_HPP_ diff --git a/include/mscclpp/proxy_channel_device.hpp b/include/mscclpp/port_channel_device.hpp similarity index 87% rename from include/mscclpp/proxy_channel_device.hpp rename to include/mscclpp/port_channel_device.hpp index 38237978a..bd9cefe78 100644 --- a/include/mscclpp/proxy_channel_device.hpp +++ b/include/mscclpp/port_channel_device.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCLPP_PROXY_CHANNEL_DEVICE_HPP_ -#define MSCCLPP_PROXY_CHANNEL_DEVICE_HPP_ +#ifndef MSCCLPP_PORT_CHANNEL_DEVICE_HPP_ +#define MSCCLPP_PORT_CHANNEL_DEVICE_HPP_ #include "fifo_device.hpp" #include "semaphore_device.hpp" @@ -83,7 +83,7 @@ union ChannelTrigger { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; -struct BaseProxyChannelDeviceHandle { +struct BasePortChannelDeviceHandle { SemaphoreId semaphoreId_; Host2DeviceSemaphoreDeviceHandle semaphore_; @@ -92,11 +92,11 @@ struct BaseProxyChannelDeviceHandle { // can produce for and the sole proxy thread consumes it. FifoDeviceHandle fifo_; - MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle() {} + MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle() {} - MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, - Host2DeviceSemaphoreDeviceHandle semaphore, - FifoDeviceHandle fifo) + MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, + FifoDeviceHandle fifo) : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {} #if defined(MSCCLPP_DEVICE_COMPILE) @@ -171,27 +171,27 @@ struct BaseProxyChannelDeviceHandle { fifo_.sync(curFifoHead); } - /// Check if the proxy channel has been signaled. - /// @return true if the proxy channel has been signaled. + /// Check if the port channel has been signaled. + /// @return true if the port channel has been signaled. MSCCLPP_DEVICE_INLINE bool poll() { return semaphore_.poll(); } - /// Wait for the proxy channel to be signaled. + /// Wait for the port channel to be signaled. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. MSCCLPP_DEVICE_INLINE void wait(int64_t maxSpinCount = 10000000) { semaphore_.wait(maxSpinCount); } #endif // defined(MSCCLPP_DEVICE_COMPILE) }; -struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { +struct PortChannelDeviceHandle : public BasePortChannelDeviceHandle { MemoryId dst_; MemoryId src_; - MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(){}; + MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(){}; - MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(SemaphoreId semaphoreId, - Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, - MemoryId dst, MemoryId src) - : BaseProxyChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} + MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, + MemoryId dst, MemoryId src) + : BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} #if defined(MSCCLPP_DEVICE_COMPILE) /// Push a @ref TriggerData to the FIFO. @@ -199,7 +199,7 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { /// @param srcOffset The offset into the source memory region. 
/// @param size The size of the transfer. MSCCLPP_DEVICE_INLINE void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - BaseProxyChannelDeviceHandle::put(dst_, dstOffset, src_, srcOffset, size); + BasePortChannelDeviceHandle::put(dst_, dstOffset, src_, srcOffset, size); } /// Push a @ref TriggerData to the FIFO. @@ -212,7 +212,7 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { /// @param srcOffset The offset into the source memory region. /// @param size The size of the transfer. MSCCLPP_DEVICE_INLINE void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - BaseProxyChannelDeviceHandle::putWithSignal(dst_, dstOffset, src_, srcOffset, size); + BasePortChannelDeviceHandle::putWithSignal(dst_, dstOffset, src_, srcOffset, size); } /// Push a @ref TriggerData and a @ref TriggerFlag at the same time to the FIFO. @@ -225,7 +225,7 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { /// @param srcOffset The offset into the source memory region. /// @param size The size of the transfer. MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - BaseProxyChannelDeviceHandle::putWithSignalAndFlush(dst_, dstOffset, src_, srcOffset, size); + BasePortChannelDeviceHandle::putWithSignalAndFlush(dst_, dstOffset, src_, srcOffset, size); } /// Push a @ref TriggerData, a @ref TriggerFlag, and a @ref TriggerSync at the same time to the FIFO. @@ -239,4 +239,4 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { } // namespace mscclpp -#endif // MSCCLPP_PROXY_CHANNEL_DEVICE_HPP_ +#endif // MSCCLPP_PORT_CHANNEL_DEVICE_HPP_ diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp index b28373bdc..55dbbe740 100644 --- a/include/mscclpp/semaphore.hpp +++ b/include/mscclpp/semaphore.hpp @@ -116,19 +116,19 @@ class Host2HostSemaphore : public BaseSemaphore connection_; }; -/// A semaphore for sending signals from the local device to a peer device via SM. -class SmDevice2DeviceSemaphore : public BaseSemaphore { +/// A semaphore for sending signals from the local device to a peer device via a GPU thread. +class MemoryDevice2DeviceSemaphore : public BaseSemaphore { public: /// Constructor. /// @param communicator The communicator. /// @param connection The connection associated with this semaphore. - SmDevice2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection); + MemoryDevice2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection); /// Constructor. - SmDevice2DeviceSemaphore() = delete; + MemoryDevice2DeviceSemaphore() = delete; - /// Device-side handle for @ref SmDevice2DeviceSemaphore. - using DeviceHandle = SmDevice2DeviceSemaphoreDeviceHandle; + /// Device-side handle for @ref MemoryDevice2DeviceSemaphore. + using DeviceHandle = MemoryDevice2DeviceSemaphoreDeviceHandle; /// Returns the device-side handle. DeviceHandle deviceHandle() const; @@ -136,6 +136,10 @@ class SmDevice2DeviceSemaphore : public BaseSemaphore - -#include "core.hpp" -#include "semaphore.hpp" -#include "sm_channel_device.hpp" - -namespace mscclpp { - -/// Channel for accessing peer memory directly from SM. -struct SmChannel { - private: - std::shared_ptr semaphore_; - RegisteredMemory dst_; - void* src_; - void* getPacketBuffer_; - - public: - /// Constructor. - SmChannel() = default; - - /// Constructor. - /// @param semaphore The semaphore used to synchronize the communication. - /// @param dst Registered memory of the destination. 
- /// @param src The source memory address. - /// @param getPacketBuffer The optional buffer used for @ref getPackets(). - SmChannel(std::shared_ptr semaphore, RegisteredMemory dst, void* src, - void* getPacketBuffer = nullptr); - - /// Device-side handle for @ref SmChannel. - using DeviceHandle = SmChannelDeviceHandle; - - /// Returns the device-side handle. - /// - /// User should make sure the SmChannel is not released when using the returned handle. - /// - DeviceHandle deviceHandle() const; -}; - -} // namespace mscclpp - -#endif // MSCCLPP_SM_CHANNEL_HPP_ diff --git a/python/examples/allgather_barrier.py b/python/examples/allgather_barrier.py index acc0c2a2f..d6f358045 100644 --- a/python/examples/allgather_barrier.py +++ b/python/examples/allgather_barrier.py @@ -28,7 +28,7 @@ def allgather_test(gpus, instances): c = chunk(n, Buffer.input, 0, 1) for peer in range(gpus): if n != peer: - c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.sm) + c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) else: c.copy(n, Buffer.output, n, sendtb=peer) # explicit barrier @@ -36,13 +36,13 @@ def allgather_test(gpus, instances): r.barrier(tb_list=list(range(gpus))) for peer in range(gpus): if n != peer: - c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.sm) + c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) for n in range(gpus): for peer in range(gpus): c = chunk(n, Buffer.output, peer, 1) if n != peer: - c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.sm) + c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.memory) Json() Check() diff --git a/python/examples/send_recv_packet.py b/python/examples/send_recv_packet.py index f0272344e..4ecb58ddb 100644 --- a/python/examples/send_recv_packet.py +++ b/python/examples/send_recv_packet.py @@ -10,9 +10,9 @@ def send_recv(instances): """ - Send and receive data between two ranks using proxy channels, with LL protocol and double scratch buffer. + Send and receive data between two ranks using port channels, with LL protocol and double scratch buffer. Steps: - 1. Each rank sends a chunk to every other rank's scratch buffer with packet format via proxy channel. + 1. Each rank sends a chunk to every other rank's scratch buffer with packet format via port channel. 2. Wait for the data to be received, then copy it to the output buffer. """ size = 2 @@ -36,7 +36,7 @@ def send_recv(instances): "scratch", 1, sendtb=0, - chan_type=ChannelType.proxy, + chan_type=ChannelType.port, temp_buffer="scratch", temp_buffer_index=0, ) diff --git a/python/examples/send_recv_proxy.py b/python/examples/send_recv_proxy.py index ec6baee99..f9ed2f309 100644 --- a/python/examples/send_recv_proxy.py +++ b/python/examples/send_recv_proxy.py @@ -10,7 +10,7 @@ def send_recv(instances): """ - Send and receive data between two ranks using proxy channels. + Send and receive data between two ranks using port channels. steps: 1. Each rank sends a chunk to the other rank's scratch buffer and signals the other rank that the data has been sent. 2. Wait for the data to be received then copy it to the output buffer. 
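Since the hunks below show only the renamed lines of `send_recv_proxy.py`, here is a hedged reconstruction of the loop bodies that the two docstring steps describe, pieced together from the fragments visible in this patch. `size`, `chunk`, `Buffer`, and `ChannelType` are the names the example itself uses; the exact source chunk indices are assumptions.

```python
# Reconstructed sketch of send_recv()'s body; indices follow the visible
# fragments in this patch and may not match the file exactly.
for r in range(size):
    nghr = 1 - r  # the peer rank in this two-rank example (assumed)
    c = chunk(r, Buffer.input, 0, 1)
    # Step 1: put into the peer's scratch buffer via the port channel,
    # then signal the peer and flush the proxy's FIFO.
    c.put(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port)
    c.signal(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port)
    c.flush(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port)

for r in range(size):
    # Step 2: wait for the peer's data to land in scratch, then copy it out.
    c = chunk(r, "scratch", 1)
    c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.port)
    c.copy(r, Buffer.output, 0, sendtb=0)
```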
@@ -34,14 +34,14 @@ def send_recv(instances): "scratch", 1, sendtb=0, - chan_type=ChannelType.proxy, + chan_type=ChannelType.port, ) - c.signal(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy) - c.flush(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy) + c.signal(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port) + c.flush(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port) for r in range(size): c = chunk(r, "scratch", 1) - c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.proxy) + c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.port) c.copy(r, Buffer.output, 0, sendtb=0) Json() diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 678379ac2..839b921e7 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -1,7 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import os as _os +import os +import warnings +from functools import wraps from ._mscclpp import ( Env, @@ -22,9 +24,9 @@ numa, ProxyService, RegisteredMemory, - ProxyChannel, - SmChannel, - SmDevice2DeviceSemaphore, + PortChannel, + MemoryChannel, + MemoryDevice2DeviceSemaphore, TcpBootstrap, Transport, TransportFlags, @@ -39,17 +41,82 @@ npkit, ) -__version__ = version() -if _os.environ.get("MSCCLPP_HOME", None) is None: - _os.environ["MSCCLPP_HOME"] = _os.path.abspath(_os.path.dirname(__file__)) +__all__ = [ + "Communicator", + "Connection", + "connect_nvls_collective", + "EndpointConfig", + "Fifo", + "Host2DeviceSemaphore", + "Host2HostSemaphore", + "numa", + "ProxyService", + "RegisteredMemory", + "PortChannel", + "MemoryChannel", + "MemoryDevice2DeviceSemaphore", + "TcpBootstrap", + "Transport", + "TransportFlags", + "DataType", + "Executor", + "ExecutionPlan", + "PacketType", + "version", + "is_nvls_supported", + "alloc_shared_physical_cuda", + "npkit", + "__version__", + "get_include", + "get_lib", + ### Deprecated ### + "ProxyChannel", + "SmChannel", + "SmDevice2DeviceSemaphore", +] +__version__: str = str(version()) -def get_include(): +if os.environ.get("MSCCLPP_HOME", None) is None: + os.environ["MSCCLPP_HOME"] = os.path.abspath(os.path.dirname(__file__)) + + +def get_include() -> str: """Return the directory that contains the MSCCL++ headers.""" - return _os.path.join(_os.path.dirname(__file__), "include") + return os.path.join(os.path.dirname(__file__), "include") -def get_lib(): +def get_lib() -> str: """Return the directory that contains the MSCCL++ headers.""" - return _os.path.join(_os.path.dirname(__file__), "lib") + return os.path.join(os.path.dirname(__file__), "lib") + + +def deprecated(new_cls): + def decorator(old_cls): + @wraps(old_cls) + def wrapper(*args, **kwargs): + warnings.warn( + f"{old_cls.__name__} is deprecated, use {new_cls.__name__} instead.", + DeprecationWarning, + ) + return new_cls(*args, **kwargs) + + return wrapper + + return decorator + + +@deprecated(PortChannel) +class ProxyChannel(PortChannel): + pass + + +@deprecated(MemoryChannel) +class SmChannel(MemoryChannel): + pass + + +@deprecated(MemoryDevice2DeviceSemaphore) +class SmDevice2DeviceSemaphore(MemoryDevice2DeviceSemaphore): + pass diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index c2726826f..8d2e0f481 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -14,9 +14,9 @@ Host2HostSemaphore, ProxyService, RegisteredMemory, - ProxyChannel, - SmChannel, - SmDevice2DeviceSemaphore, + PortChannel, + MemoryChannel, + MemoryDevice2DeviceSemaphore, TcpBootstrap, Transport, 
TransportFlags, @@ -135,7 +135,7 @@ def register_tensor_with_connections( def make_semaphore( self, connections: dict[int, Connection], - semaphore_type: Type[Host2HostSemaphore] or Type[Host2DeviceSemaphore] or Type[SmDevice2DeviceSemaphore], + semaphore_type: Type[Host2HostSemaphore] or Type[Host2DeviceSemaphore] or Type[MemoryDevice2DeviceSemaphore], ) -> dict[int, Host2HostSemaphore]: semaphores = {} for rank in connections: @@ -143,33 +143,35 @@ def make_semaphore( self.communicator.setup() return semaphores - def make_sm_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, SmChannel]: - semaphores = self.make_semaphore(connections, SmDevice2DeviceSemaphore) + def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]: + semaphores = self.make_semaphore(connections, MemoryDevice2DeviceSemaphore) registered_memories = self.register_tensor_with_connections(tensor, connections) channels = {} tensor_data_ptr = tensor.data_ptr() if is_torch_tensor(tensor) else tensor.data.ptr for rank in connections: - channels[rank] = SmChannel(semaphores[rank], registered_memories[rank], tensor_data_ptr) + channels[rank] = MemoryChannel(semaphores[rank], registered_memories[rank], tensor_data_ptr) return channels - def make_sm_channels_with_scratch( + def make_memory_channels_with_scratch( self, tensor: cp.ndarray, scratchTensor: cp.ndarray, connections: dict[int, Connection], - ) -> dict[int, SmChannel]: - semaphores = self.make_semaphore(connections, SmDevice2DeviceSemaphore) + ) -> dict[int, MemoryChannel]: + semaphores = self.make_semaphore(connections, MemoryDevice2DeviceSemaphore) registered_memories = self.register_tensor_with_connections(scratchTensor, connections) channels = {} tensor_data_ptr = tensor.data_ptr() if is_torch_tensor(tensor) else tensor.data.ptr scratch_data_ptr = scratchTensor.data_ptr() if is_torch_tensor(scratchTensor) else scratchTensor.data.ptr for rank in connections: - channels[rank] = SmChannel(semaphores[rank], registered_memories[rank], tensor_data_ptr, scratch_data_ptr) + channels[rank] = MemoryChannel( + semaphores[rank], registered_memories[rank], tensor_data_ptr, scratch_data_ptr + ) return channels - def make_proxy_channels( + def make_port_channels( self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] - ) -> dict[int, SmChannel]: + ) -> dict[int, PortChannel]: semaphores = self.make_semaphore(connections, Host2DeviceSemaphore) registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -180,18 +182,16 @@ def make_proxy_channels( semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) channels = {} for rank in semaphores: - channels[rank] = proxy_service.proxy_channel( - semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank] - ) + channels[rank] = proxy_service.port_channel(semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank]) return channels - def make_proxy_channels_with_scratch( + def make_port_channels_with_scratch( self, proxy_service: ProxyService, tensor: cp.ndarray, scratchTensor: cp.ndarray, connections: dict[int, Connection], - ) -> dict[int, SmChannel]: + ) -> dict[int, PortChannel]: transport_flags = TransportFlags() for rank in connections: transport_flags |= connections[rank].transport() @@ -218,21 +218,19 @@ def make_proxy_channels_with_scratch( semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) channels = {} for rank in semaphores: - channels[rank] = proxy_service.proxy_channel( - semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank] - ) + channels[rank] = proxy_service.port_channel(semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank]) return channels def register_semaphore_with_proxy( self, proxy_service: ProxyService, connections: dict[int, Connection] - ) -> dict[int, SmChannel]: + ) -> dict[int, BasePortChannel]: semaphores = self.make_semaphore(connections, Host2DeviceSemaphore) semaphore_ids = {} for rank in semaphores: semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) channels = {} for rank in semaphores: - channels[rank] = proxy_service.base_proxy_channel(semaphore_ids[rank]) + channels[rank] = proxy_service.base_port_channel(semaphore_ids[rank]) return channels def register_memory_with_proxy( diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 90ee22860..48bd57ab1 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -15,8 +15,8 @@ using namespace mscclpp; extern void register_env(nb::module_& m); extern void register_error(nb::module_& m); -extern void register_proxy_channel(nb::module_& m); -extern void register_sm_channel(nb::module_& m); +extern void register_port_channel(nb::module_& m); +extern void register_memory_channel(nb::module_& m); extern void register_fifo(nb::module_& m); extern void register_semaphore(nb::module_& m); extern void register_utils(nb::module_& m); @@ -187,8 +187,8 @@ void register_core(nb::module_& m) { NB_MODULE(_mscclpp, m) { register_env(m); register_error(m); - register_proxy_channel(m); - register_sm_channel(m); + register_port_channel(m); + register_memory_channel(m); register_fifo(m); register_semaphore(m); register_utils(m); diff --git a/python/mscclpp/language/collectives.py b/python/mscclpp/language/collectives.py index 67b735ba9..55fe51880 100644 --- a/python/mscclpp/language/collectives.py +++ b/python/mscclpp/language/collectives.py @@ -6,7 +6,6 @@ class Collective: - def __init__(self, num_ranks, chunk_factor, inplace, num_ranks_per_node=-1, **kwargs): self.num_ranks = num_ranks self.chunk_factor = chunk_factor @@ -36,7 +35,6 @@ def get_buffer_index(self, rank, buffer, index): class AllToAll(Collective): - def __init__(self, num_ranks, chunk_factor, inplace): Collective.__init__(self, num_ranks, chunk_factor, inplace) self.name = "alltoall" @@ -137,7 +135,6 @@ def get_buffer_index(self, rank, buffer, index): class AllReduce(Collective): - def __init__(self, num_ranks, chunk_factor, inplace, num_ranks_per_node=-1, **kwargs): num_chunk_groups = kwargs.get("num_chunk_groups", num_ranks) Collective.__init__( diff --git a/python/mscclpp/language/dag/instruction_dag.py b/python/mscclpp/language/dag/instruction_dag.py index dcc1189ca..6f137a90a 100644 --- a/python/mscclpp/language/dag/instruction_dag.py +++ b/python/mscclpp/language/dag/instruction_dag.py @@ -221,7 +221,7 @@ def add_flush(self, rank, send_ref, recv_ref, tb): next=set(), prev=set(), tb=tb, - channel_type=ChannelType.proxy, + channel_type=ChannelType.port, step=tb_step, ) buffer = send_ref.buffer diff --git a/python/mscclpp/language/dag/optimizer.py b/python/mscclpp/language/dag/optimizer.py index 62fc0f5e8..4cfa638db 100644 --- a/python/mscclpp/language/dag/optimizer.py +++ b/python/mscclpp/language/dag/optimizer.py @@ -19,7 +19,6 @@ class _InstructionOptimizer: - def try_merge_same_instructions( self, op: Op, @@ -128,8 +127,8 @@ def try_fuse_with_put(self, op: Op, next_op: Op, tb: Threadblock, queue: list) - and same_tb(op,
next_op) and same_count(op, next_op) and buf_dst_src_match(op, next_op) - and next_op.channel_type == ChannelType.sm - and (op.channel_type == ChannelType.none or op.channel_type == ChannelType.sm) + and next_op.channel_type == ChannelType.memory + and (op.channel_type == ChannelType.none or op.channel_type == ChannelType.memory) and not circular_dep_after_merge(op, next_op) and all_prevs_visited_after_merge(op, next_op) ): @@ -140,10 +139,10 @@ def try_fuse_with_put(self, op: Op, next_op: Op, tb: Threadblock, queue: list) - op.inst = Instruction.read_reduce_copy_send elif op.inst == Instruction.reduce: op.inst = Instruction.reduce_send - op.channel_type = ChannelType.sm + op.channel_type = ChannelType.memory elif op.inst == Instruction.reduce_packet: op.inst = Instruction.reduce_send_packet - op.channel_type = ChannelType.sm + op.channel_type = ChannelType.memory # Append the destination chunk from next_op op.dsts.append( ( @@ -158,11 +157,11 @@ def try_fuse_with_put(self, op: Op, next_op: Op, tb: Threadblock, queue: list) - return True return False - def try_fuse_instructions_using_proxy_channel( + def try_fuse_instructions_using_port_channel( self, op: Op, next_op: Op, tb: Threadblock, queue: list, expected_next_inst: Instruction ) -> bool: """ - Attempts to fuse operations which using proxy channel. + Attempts to fuse operations which using port channel. :param op: The current operation. :param next_op: The next operation to potentially merge with. :param tb: The thread block containing the operations. @@ -177,7 +176,7 @@ def try_fuse_instructions_using_proxy_channel( and same_buf_dst(op, next_op) and same_buf_src(op, next_op) and same_chan_type(op, next_op) - and op.channel_type == ChannelType.proxy + and op.channel_type == ChannelType.port and not circular_dep_after_merge(op, next_op) and all_prevs_visited_after_merge(op, next_op) ): @@ -229,7 +228,6 @@ def try_remove_op(self, pending_remove_op: Op, condition: bool) -> bool: class DagOptimizer: - def __init__(self, instruction_dag: InstructionDAG): self.optimizer = _InstructionOptimizer() self.dag = instruction_dag @@ -257,7 +255,7 @@ def remove_redundant_signal_wait(self): queue = queue[1:] def fuse_instructions(self): - self._fuse_instructions_using_proxy_channel() + self._fuse_instructions_using_port_channel() self._fuse_same_instructions() self._optimize_rrcs_rs() self._optimize_group_ops() @@ -267,7 +265,7 @@ def fuse_instructions(self): # -> putWithSignal(src, sbuf, si, dst, dbuf, di) # put(src, sbuf, si, dst, dbuf, di) signal(src, sbuf, si, dst, dbuf, di) flush(src, sbuf, si, dst, dbuf, di) # -> putWithSignalAndFlush(src, sbuf, si, dst, dbuf, di) - def _fuse_instructions_using_proxy_channel(self): + def _fuse_instructions_using_port_channel(self): inst_followup_map = { Instruction.put: Instruction.signal, Instruction.put_with_signal: Instruction.flush, @@ -280,7 +278,7 @@ def _fuse_instructions_using_proxy_channel(self): fused = False if op.inst in inst_followup_map: for next_op in op.next: - fused = self.optimizer.try_fuse_instructions_using_proxy_channel( + fused = self.optimizer.try_fuse_instructions_using_port_channel( op, next_op, tb, queue, inst_followup_map[op.inst] ) if fused: diff --git a/python/mscclpp/language/ir.py b/python/mscclpp/language/ir.py index 3b84b5298..4cb12e6da 100644 --- a/python/mscclpp/language/ir.py +++ b/python/mscclpp/language/ir.py @@ -286,7 +286,7 @@ def to_json(self, op: Op, tb_channel_dict: dict) -> _JsonInstruction: class _ReduceSendConverter(_OpConverter): def to_json(self, op: Op, 
tb_channel_dict: dict) -> _JsonInstruction: dst_channel_ids = self.get_channel_ids( - op.dsts, tb_channel_dict, op.dst.buffer, op.dsts[0].buffer, ChannelType.sm + op.dsts, tb_channel_dict, op.dst.buffer, op.dsts[0].buffer, ChannelType.memory ) o_buff = {"src": op.dst.buffer.value, "dst": op.dsts[0].buffer.value} srcs = list(map(lambda x: {"buff": x.buffer.value, "off": x.index}, op.srcs)) diff --git a/python/mscclpp/language/program.py b/python/mscclpp/language/program.py index 6cf0d15b1..0657f1fcf 100644 --- a/python/mscclpp/language/program.py +++ b/python/mscclpp/language/program.py @@ -222,7 +222,7 @@ def _get_buffer_index(self, remote_rank, buffer, index): return buffer, self.prog.buffers[remote_rank][buffer].instance_size() return buffer, index - def _put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm, use_packet=False): + def _put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.memory, use_packet=False): self.prog.check_buffer_exists(dst, buffer) assert self.rank != dst, "Cannot put to the same rank" buffer, index = self._get_buffer_index(dst, buffer, index) @@ -237,7 +237,7 @@ def _put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm, self.prog.instr_dag.add_put(self.rank, self, dst_chunkref, sendtb, chan_type) return dst_chunkref - def put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm): + def put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.memory): return self._put(dst, buffer, index, sendtb, chan_type) def put_packet( self, dst, buffer=None, index=-1, sendtb=-1, - chan_type=ChannelType.sm, + chan_type=ChannelType.memory, temp_buffer=None, temp_buffer_index=-1, ): chunk_ref = self - if chan_type == ChannelType.proxy: - assert temp_buffer is not None, "Need to specify a temporary buffer for proxy channels" + if chan_type == ChannelType.port: + assert temp_buffer is not None, "Need to specify a temporary buffer for port channels" chunk_ref = self._copy( self.rank, temp_buffer, temp_buffer_index, sendtb, trans_from_packet=False, trans_to_packet=True ) return chunk_ref._put(dst, buffer, index, sendtb, chan_type, True) - def get(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.sm): + def get(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.memory): self.prog.check_buffer_exists(src, buffer) sender = src receiver = self.rank @@ -273,7 +273,7 @@ def get(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.sm): # for signal and wait, currently we assume the pair will use the same tb index. In future we need # to infer the tb index from the instruction DAG. A channel is defined as (send_tb, src_buffer, recv_tb, dst_buffer, type). # Then we can use DAG info to reduce the number of channels.
- def signal(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm): + def signal(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.memory): sender = self.rank receiver = dst assert sender != receiver, "Cannot signal to the same rank" @@ -282,9 +282,9 @@ def signal(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) self.prog.instr_dag.add_signal(sender, self, dst_chunkref, sendtb, chan_type) - # only proxy channel need to use this function - def flush(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.proxy): - assert chan_type == ChannelType.proxy, "Only proxy channel can use flush" + # only port channel need to use this function + def flush(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.port): + assert chan_type == ChannelType.port, "Only port channel can use flush" sender = self.rank receiver = dst assert sender != receiver, "Cannot flush to the same rank" @@ -293,7 +293,7 @@ def flush(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.pro dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) self.prog.instr_dag.add_flush(sender, self, dst_chunkref, sendtb) - def wait(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.sm): + def wait(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.memory): sender = src receiver = self.rank assert sender != receiver, "Cannot wait on the same rank" @@ -324,7 +324,7 @@ def copy(self, dst, buffer=None, index=-1, sendtb=-1): def copy_packet(self, dst, buffer=None, index=-1, sendtb=-1): return self._copy(dst, buffer, index, sendtb, trans_from_packet=True, trans_to_packet=False) - def _reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm, use_packet=False): + def _reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.memory, use_packet=False): dst = self.rank src = other_chunkref.rank @@ -342,7 +342,7 @@ def _reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm, use_pa return self # Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref - def reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm): + def reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.memory): return self._reduce(other_chunkref, recvtb, channel_type) # Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref diff --git a/python/mscclpp/language/types.py b/python/mscclpp/language/types.py index f6202ccfe..a819bc034 100644 --- a/python/mscclpp/language/types.py +++ b/python/mscclpp/language/types.py @@ -114,11 +114,15 @@ def __hash__(self): class ChannelType(Enum): - proxy = "proxy" - sm = "sm" + port = "port" + memory = "memory" none = "none" nvls = "nvls" + # Deprecated + proxy = "port" + sm = "memory" + def __str__(self): return self.value diff --git a/python/mscclpp/memory_channel_py.cpp b/python/mscclpp/memory_channel_py.cpp new file mode 100644 index 000000000..d4e1c5442 --- /dev/null +++ b/python/mscclpp/memory_channel_py.cpp @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include +#include +#include + +#include + +namespace nb = nanobind; +using namespace mscclpp; + +void register_memory_channel(nb::module_& m) { + nb::class_ memoryChannel(m, "MemoryChannel"); + memoryChannel + .def("__init__", + [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, + RegisteredMemory dst, uintptr_t src) { new (memoryChannel) MemoryChannel(semaphore, dst, (void*)src); }) + .def("__init__", + [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, + RegisteredMemory dst, uintptr_t src, uintptr_t get_packet_buffer) { + new (memoryChannel) MemoryChannel(semaphore, dst, (void*)src, (void*)get_packet_buffer); + }) + .def("device_handle", &MemoryChannel::deviceHandle); + + nb::class_(m, "MemoryChannelDeviceHandle") + .def(nb::init<>()) + .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) + .def_rw("src_", &MemoryChannel::DeviceHandle::src_) + .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) + .def_rw("getPacketBuffer_", &MemoryChannel::DeviceHandle::getPacketBuffer_) + .def_prop_ro("raw", [](const MemoryChannel::DeviceHandle& self) -> nb::bytes { + return nb::bytes(reinterpret_cast(&self), sizeof(self)); + }); +}; diff --git a/python/mscclpp/proxy_channel_py.cpp b/python/mscclpp/port_channel_py.cpp similarity index 54% rename from python/mscclpp/proxy_channel_py.cpp rename to python/mscclpp/port_channel_py.cpp index dfe882228..dd33724e0 100644 --- a/python/mscclpp/proxy_channel_py.cpp +++ b/python/mscclpp/port_channel_py.cpp @@ -5,12 +5,12 @@ #include #include -#include +#include namespace nb = nanobind; using namespace mscclpp; -void register_proxy_channel(nb::module_& m) { +void register_port_channel(nb::module_& m) { nb::class_(m, "BaseProxyService") .def("start_proxy", &BaseProxyService::startProxy) .def("stop_proxy", &BaseProxyService::stopProxy); @@ -23,36 +23,36 @@ void register_proxy_channel(nb::module_& m) { .def("add_semaphore", &ProxyService::addSemaphore, nb::arg("semaphore")) .def("add_memory", &ProxyService::addMemory, nb::arg("memory")) .def("semaphore", &ProxyService::semaphore, nb::arg("id")) - .def("base_proxy_channel", &ProxyService::baseProxyChannel, nb::arg("id")) - .def("proxy_channel", &ProxyService::proxyChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); + .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) + .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); - nb::class_(m, "BaseProxyChannel") + nb::class_(m, "BasePortChannel") .def(nb::init, std::shared_ptr>(), nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy")) - .def("device_handle", &BaseProxyChannel::deviceHandle); + .def("device_handle", &BasePortChannel::deviceHandle); - nb::class_(m, "BaseProxyChannelDeviceHandle") + nb::class_(m, "BasePortChannelDeviceHandle") .def(nb::init<>()) - .def_rw("semaphoreId_", &BaseProxyChannel::DeviceHandle::semaphoreId_) - .def_rw("semaphore_", &BaseProxyChannel::DeviceHandle::semaphore_) - .def_rw("fifo_", &BaseProxyChannel::DeviceHandle::fifo_) - .def_prop_ro("raw", [](const BaseProxyChannel::DeviceHandle& self) -> nb::bytes { + .def_rw("semaphoreId_", &BasePortChannel::DeviceHandle::semaphoreId_) + .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) + .def_rw("fifo_", &BasePortChannel::DeviceHandle::fifo_) + .def_prop_ro("raw", [](const BasePortChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "ProxyChannel") + nb::class_(m, "PortChannel") .def(nb::init, 
std::shared_ptr, MemoryId, MemoryId>(), nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src")) - .def("device_handle", &ProxyChannel::deviceHandle); + .def("device_handle", &PortChannel::deviceHandle); - nb::class_(m, "ProxyChannelDeviceHandle") + nb::class_(m, "PortChannelDeviceHandle") .def(nb::init<>()) - .def_rw("semaphoreId_", &ProxyChannel::DeviceHandle::semaphoreId_) - .def_rw("semaphore_", &ProxyChannel::DeviceHandle::semaphore_) - .def_rw("fifo_", &ProxyChannel::DeviceHandle::fifo_) - .def_rw("src_", &ProxyChannel::DeviceHandle::src_) - .def_rw("dst_", &ProxyChannel::DeviceHandle::dst_) - .def_prop_ro("raw", [](const ProxyChannel::DeviceHandle& self) -> nb::bytes { + .def_rw("semaphoreId_", &PortChannel::DeviceHandle::semaphoreId_) + .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) + .def_rw("fifo_", &PortChannel::DeviceHandle::fifo_) + .def_rw("src_", &PortChannel::DeviceHandle::src_) + .def_rw("dst_", &PortChannel::DeviceHandle::dst_) + .def_prop_ro("raw", [](const PortChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); }; diff --git a/python/mscclpp/semaphore_py.cpp b/python/mscclpp/semaphore_py.cpp index a616a89da..daadeb03b 100644 --- a/python/mscclpp/semaphore_py.cpp +++ b/python/mscclpp/semaphore_py.cpp @@ -33,18 +33,18 @@ void register_semaphore(nb::module_& m) { .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), nb::arg("max_spin_count") = 10000000); - nb::class_ smDevice2DeviceSemaphore(m, "SmDevice2DeviceSemaphore"); - smDevice2DeviceSemaphore + nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); + memoryDevice2DeviceSemaphore .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) - .def("device_handle", &SmDevice2DeviceSemaphore::deviceHandle); + .def("device_handle", &MemoryDevice2DeviceSemaphore::deviceHandle); - nb::class_(smDevice2DeviceSemaphore, "DeviceHandle") + nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") .def(nb::init<>()) - .def_rw("inboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) - .def_rw("outboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::outboundSemaphoreId) - .def_rw("remoteInboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::remoteInboundSemaphoreId) - .def_rw("expectedInboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) - .def_prop_ro("raw", [](const SmDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { + .def_rw("inboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) + .def_rw("outboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundSemaphoreId) + .def_rw("remoteInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundSemaphoreId) + .def_rw("expectedInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) + .def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); } diff --git a/python/mscclpp/sm_channel_py.cpp b/python/mscclpp/sm_channel_py.cpp deleted file mode 100644 index 04a51eb8b..000000000 --- a/python/mscclpp/sm_channel_py.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include - -namespace nb = nanobind; -using namespace mscclpp; - -void register_sm_channel(nb::module_& m) { - nb::class_ smChannel(m, "SmChannel"); - smChannel - .def("__init__", - [](SmChannel* smChannel, std::shared_ptr semaphore, RegisteredMemory dst, - uintptr_t src) { new (smChannel) SmChannel(semaphore, dst, (void*)src); }) - .def("__init__", - [](SmChannel* smChannel, std::shared_ptr semaphore, RegisteredMemory dst, - uintptr_t src, uintptr_t get_packet_buffer) { - new (smChannel) SmChannel(semaphore, dst, (void*)src, (void*)get_packet_buffer); - }) - .def("device_handle", &SmChannel::deviceHandle); - - nb::class_(m, "SmChannelDeviceHandle") - .def(nb::init<>()) - .def_rw("semaphore_", &SmChannel::DeviceHandle::semaphore_) - .def_rw("src_", &SmChannel::DeviceHandle::src_) - .def_rw("dst_", &SmChannel::DeviceHandle::dst_) - .def_rw("getPacketBuffer_", &SmChannel::DeviceHandle::getPacketBuffer_) - .def_prop_ro("raw", [](const SmChannel::DeviceHandle& self) -> nb::bytes { - return nb::bytes(reinterpret_cast(&self), sizeof(self)); - }); -}; diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index 4c9851b9a..dbe376a3a 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -8,9 +8,9 @@ #endif #include +#include #include -#include -#include +#include __device__ mscclpp::DeviceSyncer deviceSyncer; __device__ mscclpp::DeviceSyncer allGatherDeviceSyncer; @@ -124,7 +124,7 @@ __forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem) { // ------------------------------------------- template -__device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, +__device__ void allreduce1_helper(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, int rank, int nranks, size_t nelems) { const size_t chunkSize = nelems / nranks; if (nranks == 1) return; @@ -140,10 +140,10 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* } __syncthreads(); if (tid < nPeer) { - smChans[tid].relaxedSignal(); + memChans[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - smChans[tid - nPeer].wait(); + memChans[tid - nPeer].wait(); } deviceSyncer.sync(gridDim.x); @@ -155,14 +155,14 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* int4 val; int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - val = smChans[peerIdx].read(indexOffset4 + idx); + val = memChans[peerIdx].read(indexOffset4 + idx); tmp = add_vectors(tmp, val); } if (READ_ONLY == 0) { for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - smChans[peerIdx].write(indexOffset4 + idx, tmp); + memChans[peerIdx].write(indexOffset4 + idx, tmp); } } buff4[indexOffset4 + idx] = tmp; @@ -178,14 +178,14 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - TYPE val = smChans[peerIdx].read(idx); + TYPE val = memChans[peerIdx].read(idx); tmp += val; } if (READ_ONLY == 0) { for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - smChans[peerIdx].write(idx, tmp); + memChans[peerIdx].write(idx, tmp); } } buff[idx] = tmp; @@ -198,10 +198,10 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* } 
__syncthreads(); if (tid < nPeer) { - smChans[tid].relaxedSignal(); + memChans[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - smChans[tid - nPeer].wait(); + memChans[tid - nPeer].wait(); } if (READ_ONLY) { @@ -211,17 +211,18 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* if (peerIdx >= nPeer) peerIdx -= nPeer; const int remoteRank = (peerIdx < rank ? peerIdx : peerIdx + 1); size_t offset = chunkSize * remoteRank * sizeof(TYPE); - smChans[peerIdx].get(offset, chunkSize * sizeof(TYPE), tid, blockDim.x * gridDim.x); + memChans[peerIdx].get(offset, chunkSize * sizeof(TYPE), tid, blockDim.x * gridDim.x); } } } -extern "C" __global__ void __launch_bounds__(1024, 1) allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, - int rank, int nranks, size_t nelems, int read_only) { +extern "C" __global__ void __launch_bounds__(1024, 1) + allreduce1(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, int rank, int nranks, size_t nelems, + int read_only) { if (read_only) - allreduce1_helper<1>(smChans, buff, rank, nranks, nelems); + allreduce1_helper<1>(memChans, buff, rank, nranks, nelems); else - allreduce1_helper<0>(smChans, buff, rank, nranks, nelems); + allreduce1_helper<0>(memChans, buff, rank, nranks, nelems); } // ------------------------------------------- @@ -231,7 +232,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) allreduce1(mscclpp::SmChan __device__ uint64_t globalFlag = 1; extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce2(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, TYPE* scratch, void* resultBuff, int rank, + allreduce2(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, TYPE* scratch, void* resultBuff, int rank, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); // This version of allreduce only works for single nodes @@ -246,7 +247,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; + mscclpp::MemoryChannelDeviceHandle memChan = memChans[peerIdx]; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering size_t scratchBaseOffset = (flag & 1) ? 
0 : nPkts * sizeof(mscclpp::LLPacket); @@ -259,7 +260,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + memChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = make_uint2(0, 0); @@ -279,7 +280,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) packet.flag2 = flag; size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank); for (int index = 0; index < nPeers; index++) { - smChans[index].write(offset, packet); + memChans[index].write(offset, packet); } } // step 3: get data result from scratch buffer @@ -301,7 +302,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // ------------------------------------------- extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce3(mscclpp::ProxyChannelDeviceHandle* fstRoundChans, mscclpp::ProxyChannelDeviceHandle* sndRoundChans, + allreduce3(mscclpp::PortChannelDeviceHandle* fstRoundChans, mscclpp::PortChannelDeviceHandle* sndRoundChans, TYPE* buff, TYPE* scratch, int rank, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); @@ -311,10 +312,10 @@ extern "C" __global__ void __launch_bounds__(1024, 1) int peerSendId = (remoteSendRank < rank) ? remoteSendRank : remoteSendRank - 1; int peerRecvId = (remoteRecvRank < rank) ? remoteRecvRank : remoteRecvRank - 1; - mscclpp::ProxyChannelDeviceHandle& devFstSendChan = fstRoundChans[peerSendId]; - mscclpp::ProxyChannelDeviceHandle& devFstRecvChan = fstRoundChans[peerRecvId]; - mscclpp::ProxyChannelDeviceHandle& devSndSendChan = sndRoundChans[peerSendId]; - mscclpp::ProxyChannelDeviceHandle& devSndRecvChan = sndRoundChans[peerRecvId]; + mscclpp::PortChannelDeviceHandle& devFstSendChan = fstRoundChans[peerSendId]; + mscclpp::PortChannelDeviceHandle& devFstRecvChan = fstRoundChans[peerRecvId]; + mscclpp::PortChannelDeviceHandle& devSndSendChan = sndRoundChans[peerSendId]; + mscclpp::PortChannelDeviceHandle& devSndRecvChan = sndRoundChans[peerRecvId]; // Step 1 size_t chunkIndex = (rank + worldSize - 1) % worldSize; @@ -419,9 +420,9 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // AllReduce4 // 2-node // ------------------------------------------- -__device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nRanksPerNode, - int startChunkIndex, size_t offsetInChunk, size_t chunkSize, size_t nelems, - int nBlocks) { +__device__ void localReduceScatterMem(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, int rank, + int nRanksPerNode, int startChunkIndex, size_t offsetInChunk, size_t chunkSize, + size_t nelems, int nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; const int nPeer = nRanksPerNode - 1; @@ -433,10 +434,10 @@ __device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TY int4* buff4 = (int4*)buff; for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) { - smChans[peerIdx].relaxedSignal(); + memChans[peerIdx].relaxedSignal(); } for (int peerIdx = threadIdx.x 
+ blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) { - smChans[peerIdx].wait(); + memChans[peerIdx].wait(); } reduceScatterDeviceSyncer.sync(nBlocks); @@ -447,7 +448,7 @@ __device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TY int4 val; int peerIdx = index + localRankIndexInNode; if (peerIdx >= nPeer) peerIdx -= nPeer; - val = smChans[peerIdx].read(indexOffset4 + idx); + val = memChans[peerIdx].read(indexOffset4 + idx); tmp = add_vectors(tmp, val); } buff4[indexOffset4 + idx] = tmp; @@ -457,9 +458,9 @@ __device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TY } // This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1). -__device__ void localAllGatherSm(mscclpp::SmChannelDeviceHandle* smChans, int rank, int nRanksPerNode, - int startRankChunkIndex, uint64_t offsetInRankChunk, uint64_t rankChunkSize, - uint64_t size, size_t nBlocks) { +__device__ void localAllGatherMem(mscclpp::MemoryChannelDeviceHandle* memChans, int rank, int nRanksPerNode, + int startRankChunkIndex, uint64_t offsetInRankChunk, uint64_t rankChunkSize, + uint64_t size, size_t nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; const size_t nPeer = nRanksPerNode - 1; @@ -495,16 +496,16 @@ __device__ void localAllGatherSm(mscclpp::SmChannelDeviceHandle* smChans, int ra sizeForThisBlock += lastChunkSize; } if (threadIdx.x == 0 && peerLocalBlockIdx == 0) { - smChans[peerIdx].relaxedSignal(); - smChans[peerIdx].wait(); + memChans[peerIdx].relaxedSignal(); + memChans[peerIdx].wait(); } allGatherDeviceSyncer.sync(nBlocks); size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk; - smChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); + memChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); } -__device__ void localAllGatherAllPairsSm(mscclpp::SmChannelDeviceHandle* smChans, int rank, int nRanksPerNode, - uint64_t size, size_t nBlocks) { +__device__ void localAllGatherAllPairsMem(mscclpp::MemoryChannelDeviceHandle* memChans, int rank, int nRanksPerNode, + uint64_t size, size_t nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; @@ -512,24 +513,24 @@ __device__ void localAllGatherAllPairsSm(mscclpp::SmChannelDeviceHandle* smChans const int nPeer = nRanksPerNode - 1; if (tid < nPeer) { - smChans[tid].signal(); + memChans[tid].signal(); } int waitStart = nBlocks * blockDim.x - nPeer; if (tid >= waitStart && tid < nBlocks * blockDim.x) { - smChans[tid - waitStart].wait(); + memChans[tid - waitStart].wait(); } allGatherDeviceSyncer.sync(nBlocks); for (int i = 0; i < nPeer; ++i) { int peerIdx = (i + rank) % nPeer; const int remoteRankLocalIndex = (peerIdx < rank ? 
peerIdx : peerIdx + 1); size_t offset = size * remoteRankLocalIndex; - smChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks); + memChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks); } } // This is an allgather4 equivalent -__device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* proxyChans, - int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU, int pipelineDepth) { +__device__ void allGatherMem(mscclpp::MemoryChannelDeviceHandle* memChans, mscclpp::PortChannelDeviceHandle* portChans, + int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU, int pipelineDepth) { // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: // Step 1: each node does a local allgather and concurrently, @@ -544,14 +545,14 @@ __device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::Pr int peerRank = (rank + nRanksPerNode) % worldSize; int peerNodeId = peerRank / nRanksPerNode; int peer = (peerRank < rank) ? peerRank : peerRank - 1; - mscclpp::ProxyChannelDeviceHandle proxyChan = proxyChans[peer]; + mscclpp::PortChannelDeviceHandle portChan = portChans[peer]; const size_t nBlocksForLocalAllGather = gridDim.x / (nRanksPerNode - 1) * (nRanksPerNode - 1); const size_t rankChunkSize = nelemsPerGPU * sizeof(int); const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode; const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode; if (peerNodeId == rank / nRanksPerNode) { - localAllGatherSm(smChans, rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x); + localAllGatherMem(memChans, rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x); return; } @@ -562,36 +563,37 @@ __device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::Pr // Step 1 if (threadIdx.x == 0 && blockIdx.x == 0 && step1Bytes > 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); } - localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, - nBlocksForLocalAllGather); + localAllGatherMem(memChans, rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0 && step1Bytes > 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(gridDim.x); // Step 2 if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); } if (step1Bytes > 0) - localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(memChans, rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(gridDim.x); // Step 3 - localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(memChans, rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, + nBlocksForLocalAllGather); } -__device__ void 
reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* proxyChans, - TYPE* buff, TYPE* scratch, int rank, int nRanksPerNode, int worldSize, - size_t nelems, // must be divisible by 3 - int pipelineDepth) { +__device__ void reduceScatterMem(mscclpp::MemoryChannelDeviceHandle* memChans, + mscclpp::PortChannelDeviceHandle* portChans, TYPE* buff, TYPE* scratch, int rank, + int nRanksPerNode, int worldSize, + size_t nelems, // must be divisible by 3 + int pipelineDepth) { // this reduce-scatter algorithm works as follows: // Step 1: each node does a local reduce-scatter on peer node data chunks with 1/pipeline portion of chunk data. For // example, 2 nodes and each node has 2 ranks. rank 0 and rank 1 perform reduce-scatter on chunk 2 and chunk 3, with @@ -612,29 +614,29 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp int isComm = (threadIdx.x == 0) && (blockIdx.x == nBlocksForReduceScatter); int peer = (peerRank < rank) ? peerRank : peerRank - 1; int nBlocksRemain = gridDim.x - nBlocksForReduceScatter; - mscclpp::ProxyChannelDeviceHandle proxyChan = proxyChans[peer]; + mscclpp::PortChannelDeviceHandle portChan = portChans[peer]; if (peerNodeId == rank / nRanksPerNode) { - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); return; } // step 1: local reduce int startChunkIndex = peerNodeId * nRanksPerNode; - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, - nBlocksForReduceScatter); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, + nBlocksForReduceScatter); deviceSyncer.sync(gridDim.x); // step 2: local reduce and exchange data with neighbor if (isComm) { size_t offset = (peerRank * chunkSize) * sizeof(int); // opposite side - proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int))); + portChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int))); } if (pipelineSize > 1) - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, - (pipelineSize - 1) * chunkSize / pipelineSize, nBlocksForReduceScatter); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, + (pipelineSize - 1) * chunkSize / pipelineSize, nBlocksForReduceScatter); if (isComm) { - proxyChan.wait(); + portChan.wait(); } if (blockIdx.x >= nBlocksForReduceScatter) { ibDeviceSyncer.sync(nBlocksRemain); @@ -645,7 +647,7 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp vectorSum((TYPE*)dst, (TYPE*)src, chunkSize / pipelineSize, blockIdx.x - nBlocksForReduceScatter, nBlocksRemain); } if (isComm) { - proxyChan.flush(); + portChan.flush(); } deviceSyncer.sync(gridDim.x); @@ -653,12 +655,12 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode; if (isComm && pipelineSize > 1) { size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int); - proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int)); + portChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int)); } - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, 0, 
chunkSize, chunkSize, - nBlocksForReduceScatter); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, + nBlocksForReduceScatter); if (isComm && pipelineSize > 1) { - proxyChan.wait(); + portChan.wait(); } deviceSyncer.sync(gridDim.x); // reduce to related rank, can not overlap since localReduceScatter also calculate the sum @@ -667,24 +669,24 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp int* src = (int*)((char*)scratch + offset); if (pipelineSize > 1) vectorSum((TYPE*)dst, (TYPE*)src, (pipelineSize - 1) * chunkSize / pipelineSize); if (isComm) { - proxyChan.flush(); + portChan.flush(); } } extern "C" __global__ void __launch_bounds__(1024, 1) __global__ - allreduce4(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* reduceScatterProxyChans, - mscclpp::ProxyChannelDeviceHandle* allGatherProxyChans, TYPE* buff, TYPE* scratch, int rank, + allreduce4(mscclpp::MemoryChannelDeviceHandle* memChans, mscclpp::PortChannelDeviceHandle* reduceScatterPortChans, + mscclpp::PortChannelDeviceHandle* allGatherPortChans, TYPE* buff, TYPE* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems, int pipelineDepth) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); - reduceScatterSm(smChans, reduceScatterProxyChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, - pipelineDepth); + reduceScatterMem(memChans, reduceScatterPortChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, + pipelineDepth); deviceSyncer.sync(gridDim.x); - allGatherSm(smChans, allGatherProxyChans, rank, worldSize, nRanksPerNode, nelems / worldSize, pipelineDepth); + allGatherMem(memChans, allGatherPortChans, rank, worldSize, nRanksPerNode, nelems / worldSize, pipelineDepth); } // allreduce 5 for 2-nodes extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce5(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* proxyChans, TYPE* buff, + allreduce5(mscclpp::MemoryChannelDeviceHandle* memChans, mscclpp::PortChannelDeviceHandle* portChans, TYPE* buff, TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); @@ -701,8 +703,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRankIdx = peerIdx < localRankId ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; - mscclpp::ProxyChannelDeviceHandle proxyChan = proxyChans[localRankId]; + mscclpp::MemoryChannelDeviceHandle memChan = memChans[peerIdx]; + mscclpp::PortChannelDeviceHandle portChan = portChans[localRankId]; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket); @@ -717,8 +719,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // step 1: write to scratch buffer if (nRanksPerNode > 1) { - smChan.putPackets(scratchOffset, srcOffset, nelemsPerLocalRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, - flag); + memChan.putPackets(scratchOffset, srcOffset, nelemsPerLocalRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, + flag); } // step 2: get data from scratch buffer, do local reduce-scatter in each node. 
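  // Note on the LL (low-latency) packet path used here: an mscclpp::LLPacket packs two 32-bit payload
  // words together with two flag words, and a reader spins until both flags match the current
  // iteration's flag, so packet arrival doubles as synchronization and this path needs no separate
  // signal()/wait() pair. The (flag & 1) tests above implement the double buffering: odd and even
  // iterations land in disjoint halves of the scratch and put buffers.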
mscclpp::LLPacket* putPkt = (mscclpp::LLPacket*)((char*)putBuff + putBaseOffset); @@ -737,9 +739,9 @@ extern "C" __global__ void __launch_bounds__(1024, 1) deviceSyncer.sync(gridDim.x); // step 3. send local reduced data to remote node. if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.put(scratchOffset, putBaseOffset, nPktsPerLocalRank * sizeof(mscclpp::LLPacket)); + portChan.put(scratchOffset, putBaseOffset, nPktsPerLocalRank * sizeof(mscclpp::LLPacket)); if ((flag & 63) == 0) { - proxyChan.flush(); + portChan.flush(); } } // step 4. try to read the data from scratch buffer and write to local peers @@ -756,7 +758,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) packet.flag2 = flag; size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + localRankId * nPktsPerLocalRank); for (int index = 0; index < nPeersInNode; index++) { - smChans[index].write(offset, packet); + memChans[index].write(offset, packet); } dst[idx] = res; } @@ -787,7 +789,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // Barrier among all devices // Should be called by all threads on all devices // Assumes \p num_threads_per_block >= \p num_ranks -__forceinline__ __device__ void barrier(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int thread_id, +__forceinline__ __device__ void barrier(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int thread_id, int block_id, int num_blocks, int num_ranks) { // wait for every device if (block_id == 0) { @@ -804,7 +806,7 @@ __forceinline__ __device__ void barrier(mscclpp::SmDevice2DeviceSemaphoreDeviceH // Assumes \p kVecSize is 1, 2, 4, or 8 (default 8) template -MSCCLPP_DEVICE_INLINE void allreduce6_helper(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, +MSCCLPP_DEVICE_INLINE void allreduce6_helper(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int num_ranks, size_t num_elements) { DataType* mc_ptr = (DataType*)nvlsPtrs.mcPtr; @@ -863,7 +865,7 @@ MSCCLPP_DEVICE_INLINE void allreduce6_helper(mscclpp::SmDevice2DeviceSemaphoreDe } extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, + allreduce6(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int num_ranks, size_t num_elements, size_t vector_size) { if (vector_size == 8) { diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index c2af7a4fc..a04fcc8c1 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -1,7 +1,7 @@ import os import cupy as cp import ctypes -from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore +from mscclpp import Transport, ProxyService, MemoryDevice2DeviceSemaphore import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, GpuBuffer, pack @@ -48,8 +48,8 @@ def __init__( self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) type_str = type_to_str(memory.dtype) - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels(self.memory, self.connections) + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels(self.memory, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", @@ -60,7 +60,7 @@ def __init__( 
self.device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: - self.device_handles.append(self.sm_channels[rank].device_handle().raw) + self.device_handles.append(self.memory_channels[rank].device_handle().raw) self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) @@ -116,8 +116,8 @@ def __init__( type_str = type_to_str(memory.dtype) self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype) - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, self.connections) + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels_with_scratch(self.memory, self.scratch, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce2", file_dir=file_dir, macro_dict={"TYPE": type_str} @@ -125,7 +125,7 @@ def __init__( self.device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: - self.device_handles.append(self.sm_channels[rank].device_handle().raw) + self.device_handles.append(self.memory_channels[rank].device_handle().raw) self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) @@ -181,11 +181,11 @@ def __init__( self.proxy_service = proxy_service self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype) - # create a sm_channel for each remote neighbor - self.fst_round_proxy_chans = self.group.make_proxy_channels_with_scratch( + # create a port_channel for each remote neighbor + self.fst_round_port_chans = self.group.make_port_channels_with_scratch( self.proxy_service, self.memory, self.scratch, self.connections ) - self.snd_round_proxy_chans = self.group.make_proxy_channels(self.proxy_service, self.memory, self.connections) + self.snd_round_port_chans = self.group.make_port_channels(self.proxy_service, self.memory, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce3", file_dir=file_dir, macro_dict={"TYPE": type_str} @@ -194,8 +194,8 @@ def __init__( self.fst_device_handles = [] self.snd_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: - self.fst_device_handles.append(self.fst_round_proxy_chans[rank].device_handle().raw) - self.snd_device_handles.append(self.snd_round_proxy_chans[rank].device_handle().raw) + self.fst_device_handles.append(self.fst_round_port_chans[rank].device_handle().raw) + self.snd_device_handles.append(self.snd_round_port_chans[rank].device_handle().raw) self.fst_device_handles_cp = cp.asarray(memoryview(b"".join(self.fst_device_handles)), dtype=cp.uint8) self.snd_device_handles_cp = cp.asarray(memoryview(b"".join(self.snd_device_handles)), dtype=cp.uint8) @@ -261,31 +261,29 @@ def __init__( self.proxy_service = proxy_service self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype) same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels(self.memory, same_node_connections) - self.reduce_scatter_proxy_channels = self.group.make_proxy_channels_with_scratch( + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels(self.memory, same_node_connections) + self.reduce_scatter_port_channels =
self.group.make_port_channels_with_scratch( self.proxy_service, self.memory, self.scratch, self.connections ) - self.all_gather_proxy_channels = self.group.make_proxy_channels( - self.proxy_service, self.memory, self.connections - ) + self.all_gather_port_channels = self.group.make_port_channels(self.proxy_service, self.memory, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce4", file_dir=file_dir, macro_dict={"TYPE": type_str} ).get_compiled_kernel() - self.sm_device_handles = [] + self.mem_device_handles = [] self.reduce_sactter_proxy_device_handles = [] self.all_gather_proxy_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank and in_same_node(rank): - self.sm_device_handles.append(self.sm_channels[rank].device_handle().raw) + self.mem_device_handles.append(self.memory_channels[rank].device_handle().raw) if rank != self.group.my_rank: self.reduce_sactter_proxy_device_handles.append( - self.reduce_scatter_proxy_channels[rank].device_handle().raw + self.reduce_scatter_port_channels[rank].device_handle().raw ) - self.all_gather_proxy_device_handles.append(self.all_gather_proxy_channels[rank].device_handle().raw) + self.all_gather_proxy_device_handles.append(self.all_gather_port_channels[rank].device_handle().raw) - self.sm_device_handles_cp = cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8) + self.mem_device_handles_cp = cp.asarray(memoryview(b"".join(self.mem_device_handles)), dtype=cp.uint8) self.reduce_sactter_proxy_device_handles_cp = cp.asarray( memoryview(b"".join(self.reduce_sactter_proxy_device_handles)), dtype=cp.uint8 ) @@ -306,7 +304,7 @@ def set_params(self, nblocks, block_size, pipeline_depth): self.params = b"" self.params += pack( - self.sm_device_handles_cp, + self.mem_device_handles_cp, self.reduce_sactter_proxy_device_handles_cp, self.all_gather_proxy_device_handles_cp, self.memory, @@ -366,24 +364,26 @@ def __init__( self.put_buff = GpuBuffer(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype) same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)} - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, same_node_connections) - self.proxy_channels = self.group.make_proxy_channels_with_scratch( + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels_with_scratch( + self.memory, self.scratch, same_node_connections + ) + self.port_channels = self.group.make_port_channels_with_scratch( self.proxy_service, self.put_buff, self.scratch, across_node_connections ) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce5", file_dir=file_dir, macro_dict={"TYPE": type_str} ).get_compiled_kernel() - self.sm_device_handles = [] + self.mem_device_handles = [] self.proxy_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank and in_same_node(rank): - self.sm_device_handles.append(self.sm_channels[rank].device_handle().raw) + self.mem_device_handles.append(self.memory_channels[rank].device_handle().raw) if rank != self.group.my_rank and not in_same_node(rank): - 
self.proxy_device_handles.append(self.proxy_channels[rank].device_handle().raw) + self.proxy_device_handles.append(self.port_channels[rank].device_handle().raw) - self.sm_device_handles_cp = cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8) + self.mem_device_handles_cp = cp.asarray(memoryview(b"".join(self.mem_device_handles)), dtype=cp.uint8) self.proxy_device_handles_cp = cp.asarray(memoryview(b"".join(self.proxy_device_handles)), dtype=cp.uint8) self.set_params(nblocks, block_size) @@ -398,7 +398,7 @@ def set_params(self, nblocks, block_size): self.params = b"" self.params += pack( - self.sm_device_handles_cp, + self.mem_device_handles_cp, self.proxy_device_handles_cp, self.memory, self.scratch, @@ -446,8 +446,8 @@ def __init__( self.memory.data.ptr, self.memory.data.mem.size ) - # create a sm_channel for each remote neighbor - self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) + # create a semaphore for each remote neighbor + self.semaphores = group.make_semaphore(self.nvlink_connections, MemoryDevice2DeviceSemaphore) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", diff --git a/python/test/d2d_semaphore_test.cu b/python/test/d2d_semaphore_test.cu index 04b945e3d..d6bc3ec5e 100644 --- a/python/test/d2d_semaphore_test.cu +++ b/python/test/d2d_semaphore_test.cu @@ -6,7 +6,7 @@ // be careful about using semaphore[my_rank] as it is an invalid semaphore and it is there just for simplicity of // indexing extern "C" __global__ void __launch_bounds__(1024, 1) - d2d_semaphore(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { + d2d_semaphore(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { int tid = threadIdx.x; if (tid < nranks && tid != my_rank) { semaphores[tid].signal(); diff --git a/python/test/sm_channel_test.cu b/python/test/memory_channel_test.cu similarity index 83% rename from python/test/sm_channel_test.cu rename to python/test/memory_channel_test.cu index 93b5c99aa..48a831493 100644 --- a/python/test/sm_channel_test.cu +++ b/python/test/memory_channel_test.cu @@ -1,11 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license.
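The renamed test that follows exercises the GPU-side `MemoryChannel` API. As a rough orientation (a minimal sketch, not part of this patch; the kernel name, offsets, and sizes are illustrative), a `MemoryChannelDeviceHandle` is driven from device code like this:

```cpp
#include <mscclpp/memory_channel_device.hpp>

// Filled in from the host before launch, typically one handle per remote peer.
__device__ mscclpp::MemoryChannelDeviceHandle chan;

__global__ void copyToPeer(size_t nbytes) {
  // Cooperative copy: every thread of the block moves a disjoint piece.
  chan.put(/*dstOffset*/ 0, /*srcOffset*/ 0, nbytes, threadIdx.x, blockDim.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    chan.signal();  // publish the copied data to the peer
    chan.wait();    // block until the peer's matching signal arrives
  }
}
```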
-#include <mscclpp/sm_channel_device.hpp> +#include <mscclpp/memory_channel_device.hpp> // be careful about using channels[my_rank] as it is invalid and it is there just for simplicity of indexing extern "C" __global__ void __launch_bounds__(1024, 1) - sm_channel(mscclpp::SmChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements, int use_packet) { + memory_channel(mscclpp::MemoryChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements, + int use_packet) { int tid = threadIdx.x; int bid = blockIdx.x; uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 3bc2f52d2..8d391ff59 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -10,7 +10,7 @@ __device__ mscclpp::DeviceSyncer deviceSyncer; extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, - mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { + mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { int nelem = nbytes / sizeof(float); float* dev_ptr = (float*)nvlsPtrs.devicePtr; float* mc_ptr = (float*)nvlsPtrs.mcPtr; diff --git a/python/test/proxy_channel_test.cu b/python/test/port_channel_test.cu similarity index 85% rename from python/test/proxy_channel_test.cu rename to python/test/port_channel_test.cu index d79a97bf6..05b99d1ab 100644 --- a/python/test/proxy_channel_test.cu +++ b/python/test/port_channel_test.cu @@ -2,12 +2,12 @@ // Licensed under the MIT license. #include -#include <mscclpp/proxy_channel_device.hpp> +#include <mscclpp/port_channel_device.hpp> // be careful about using channels[my_rank] as it is invalid and it is there just for simplicity of indexing extern "C" __global__ void __launch_bounds__(1024, 1) - proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks, int* data, int* scratch, - int num_elements, int use_packet) { + port_channel(mscclpp::PortChannelDeviceHandle* channels, int my_rank, int nranks, int* data, int* scratch, + int num_elements, int use_packet) { int tid = threadIdx.x; int nthreads = blockDim.x; uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 1a5f99c42..f0e63daf4 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -22,7 +22,7 @@ Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, - SmDevice2DeviceSemaphore, + MemoryDevice2DeviceSemaphore, TcpBootstrap, Transport, is_nvls_supported, @@ -363,9 +363,9 @@ def __init__( ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks - elif test_name == "sm_channel": + elif test_name == "memory_channel": self._kernel = KernelBuilder( - file="sm_channel_test.cu", kernel_name="sm_channel", file_dir=file_dir + file="memory_channel_test.cu", kernel_name="memory_channel", file_dir=file_dir ).get_compiled_kernel() self.nblocks = nranks self.nthreads = 1024 @@ -381,9 +381,9 @@ def __init__( ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks - elif test_name == "proxy_channel": + elif test_name == "port_channel": self._kernel = KernelBuilder( - file="proxy_channel_test.cu", kernel_name="proxy_channel", file_dir=file_dir + file="port_channel_test.cu", kernel_name="port_channel", file_dir=file_dir ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1024 @@ -411,11 +411,11 @@ def __init__( # keep a reference to the device handles so that they don't get garbage collected self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(device_handles)),
dtype=cp.uint8) - if test_name in ["h2d_semaphore", "d2d_semaphore", "sm_channel", "proxy_channel"]: + if test_name in ["h2d_semaphore", "d2d_semaphore", "memory_channel", "port_channel"]: self.params += pack(self._d_semaphore_or_channels, my_rank, nranks) - if test_name == "sm_channel": + if test_name == "memory_channel": self.params += pack(tensor.size, use_packet) - if test_name == "proxy_channel": + if test_name == "port_channel": self.params += pack(tensor, scratch, tensor.size, use_packet) elif test_name == "fifo": self.params = fifo.device_handle().raw @@ -457,7 +457,7 @@ def signal(semaphores): def test_d2d_semaphores(mpi_group: MpiGroup): group, connections = create_group_and_connection(mpi_group, "NVLink") - semaphores = group.make_semaphore(connections, SmDevice2DeviceSemaphore) + semaphores = group.make_semaphore(connections, MemoryDevice2DeviceSemaphore) group.barrier() kernel = MscclppKernel("d2d_semaphore", group.my_rank, group.nranks, semaphores) kernel() @@ -468,7 +468,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) -def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): +def test_memory_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") memory = GpuBuffer(nelem, dtype=cp.int32) @@ -483,10 +483,10 @@ def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): memory_expected[(nelemPerRank * rank) : (nelemPerRank * (rank + 1))] = rank + 1 if use_packet: - channels = group.make_sm_channels_with_scratch(memory, scratch, connections) + channels = group.make_memory_channels_with_scratch(memory, scratch, connections) else: - channels = group.make_sm_channels(memory, connections) - kernel = MscclppKernel("sm_channel", group.my_rank, group.nranks, channels, memory, use_packet, scratch) + channels = group.make_memory_channels(memory, connections) + kernel = MscclppKernel("memory_channel", group.my_rank, group.nranks, channels, memory, use_packet, scratch) group.barrier() kernel() @@ -565,7 +565,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) -def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): +def test_port_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): group, connections = create_group_and_connection(mpi_group, transport) memory = GpuBuffer(nelem, dtype=cp.int32) @@ -586,10 +586,10 @@ def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_pack memory_to_register = scratch else: memory_to_register = memory - channels = group.make_proxy_channels(proxy_service, memory_to_register, connections) + channels = group.make_port_channels(proxy_service, memory_to_register, connections) kernel = MscclppKernel( - "proxy_channel", + "port_channel", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=channels, @@ -614,7 +614,7 @@ def test_nvls(mpi_group: MpiGroup): mem_handle = nvls_connection.allocate_bind_memory(nbytes) nvlinks_connections = create_connection(group, "NVLink") - semaphores = group.make_semaphore(nvlinks_connections, SmDevice2DeviceSemaphore) + semaphores = group.make_semaphore(nvlinks_connections, 
MemoryDevice2DeviceSemaphore) kernel = MscclppKernel( "nvls", diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 56c881bd0..ed5509052 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -74,10 +74,10 @@ auto convertToBufferType = [](const std::string& str) { }; auto convertToChannelType = [](const std::string& str) { - if (str == "sm") { - return mscclpp::ChannelType::SM; - } else if (str == "proxy") { - return mscclpp::ChannelType::PROXY; + if (str == "memory" || str == "sm") { + return mscclpp::ChannelType::MEMORY; + } else if (str == "port" || str == "proxy") { + return mscclpp::ChannelType::PORT; } else if (str == "none") { return mscclpp::ChannelType::NONE; } else if (str == "nvls") { @@ -304,7 +304,7 @@ void ExecutionPlan::Impl::parseChannels( } } -// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors. +// Construct the channel info. Step 1. Flatten MEMORY and PORT channels into separate vectors. // Step 2. For each threadblock, construct a vector of channel indexes and keys. void ExecutionPlan::Impl::setupChannels(const json& gpus) { using mapKey = std::tuple; @@ -331,7 +331,7 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) { // setup threadblockChannelMap for (const auto& gpu : gpus) { int rank = gpu["id"]; - auto channelTypes = {ChannelType::SM, ChannelType::PROXY, ChannelType::NVLS}; + auto channelTypes = {ChannelType::MEMORY, ChannelType::PORT, ChannelType::NVLS}; std::unordered_map> channelMap; for (auto channelType : channelTypes) { const std::vector channelInfos = this->getChannelInfos(rank, channelType); @@ -352,18 +352,18 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) { } } int nthreadblocks = gpu["threadblocks"].size(); - this->threadblockSMChannelMap[rank].resize(nthreadblocks); - this->threadblockProxyChannelMap[rank].resize(nthreadblocks); + this->threadblockMemoryChannelMap[rank].resize(nthreadblocks); + this->threadblockPortChannelMap[rank].resize(nthreadblocks); this->threadblockNvlsChannelMap[rank].resize(nthreadblocks); for (const auto& threadblock : gpu["threadblocks"]) { for (const auto& channel : threadblock["channels"]) { ChannelType channelType = convertToChannelType(channel["ctype"]); ChannelKey key = {convertToBufferType(channel["src"]), convertToBufferType(channel["dst"]), channelType}; for (int id : channel["cids"]) { - if (channelType == ChannelType::SM) { - this->threadblockSMChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); - } else if (channelType == ChannelType::PROXY) { - this->threadblockProxyChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); + if (channelType == ChannelType::MEMORY) { + this->threadblockMemoryChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); + } else if (channelType == ChannelType::PORT) { + this->threadblockPortChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); } else if (channelType == ChannelType::NVLS) { this->threadblockNvlsChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); } @@ -394,15 +394,15 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse std::unordered_map> channelIndexes; std::vector ops; int threadblockId = threadblock["id"]; - const auto& smChannels = this->threadblockSMChannelMap[rank][threadblockId]; - const auto& proxyChannels = this->threadblockProxyChannelMap[rank][threadblockId]; + const auto& memoryChannels = 
this->threadblockMemoryChannelMap[rank][threadblockId]; + const auto& portChannels = this->threadblockPortChannelMap[rank][threadblockId]; const auto& nvlsChannels = this->threadblockNvlsChannelMap[rank][threadblockId]; - for (size_t i = 0; i < smChannels.size(); i++) { - const auto& [_, key] = smChannels[i]; + for (size_t i = 0; i < memoryChannels.size(); i++) { + const auto& [_, key] = memoryChannels[i]; channelIndexes[key].push_back(i); } - for (size_t i = 0; i < proxyChannels.size(); i++) { - const auto& [_, key] = proxyChannels[i]; + for (size_t i = 0; i < portChannels.size(); i++) { + const auto& [_, key] = portChannels[i]; channelIndexes[key].push_back(i); } for (size_t i = 0; i < nvlsChannels.size(); i++) { @@ -586,8 +586,8 @@ void ExecutionPlan::Impl::reset() { this->operations.clear(); this->channelInfos.clear(); this->nvlsInfos.clear(); - this->threadblockSMChannelMap.clear(); - this->threadblockProxyChannelMap.clear(); + this->threadblockMemoryChannelMap.clear(); + this->threadblockPortChannelMap.clear(); this->threadblockNvlsChannelMap.clear(); this->inputChunks.clear(); this->outputChunks.clear(); diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 944ddb254..25e55bb56 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -2,9 +2,9 @@ // Licensed under the MIT license. #include +#include #include -#include -#include +#include #include #include "execution_kernel.hpp" @@ -113,10 +113,10 @@ struct ExecutionContext { std::unordered_map> connections; std::vector> nvlsConnections; std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; - std::vector> smSemaphores; + std::vector> memorySemaphores; std::vector proxySemaphores; - std::vector smChannels; - std::vector proxyChannels; + std::vector memoryChannels; + std::vector portChannels; std::vector nvlsChannels; std::unordered_map> deviceExecutionPlans; std::unordered_map> deviceExecutionPlansBuffers; @@ -194,9 +194,9 @@ struct Executor::Impl { TransportFlags getTransportFlags(std::vector& infos, int rank) { TransportFlags flags; for (ChannelInfo& info : infos) { - if (info.channelType == ChannelType::SM) { + if (info.channelType == ChannelType::MEMORY) { flags |= Transport::CudaIpc; - } else if (info.channelType == ChannelType::PROXY) { + } else if (info.channelType == ChannelType::PORT) { for (int peer : info.connectedPeers) { if (!inSameNode(rank, peer, this->nranksPerNode)) { flags |= IBs[rank % this->nranksPerNode]; @@ -279,16 +279,16 @@ struct Executor::Impl { void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, int rank, const ExecutionPlan& plan) { - const auto channelTypes = {ChannelType::SM, ChannelType::PROXY}; - std::vector> smSemaphores; + const auto channelTypes = {ChannelType::MEMORY, ChannelType::PORT}; + std::vector> memorySemaphores; std::vector proxySemaphores; auto processChannelInfos = [&](std::vector& channelInfos) { for (ChannelInfo& info : channelInfos) { for (int peer : info.connectedPeers) { - if (info.channelType == ChannelType::SM) { - smSemaphores.push_back( - std::make_shared(*this->comm, context.connections.at(peer))); - } else if (info.channelType == ChannelType::PROXY) { + if (info.channelType == ChannelType::MEMORY) { + memorySemaphores.push_back( + std::make_shared(*this->comm, context.connections.at(peer))); + } else if (info.channelType == ChannelType::PORT) { proxySemaphores.push_back( context.proxyService->buildAndAddSemaphore(*this->comm, context.connections.at(peer))); } @@ 
-307,7 +307,7 @@ struct Executor::Impl { processChannelInfos(channelInfos); } this->comm->setup(); - context.smSemaphores = std::move(smSemaphores); + context.memorySemaphores = std::move(memorySemaphores); context.proxySemaphores = std::move(proxySemaphores); auto getBufferSize = [&](BufferType type) { @@ -332,11 +332,11 @@ struct Executor::Impl { TransportFlags transport = getTransportFlags(channelInfos, rank); RegisteredMemory localMemory = this->comm->registerMemory(src, bufferSize, transport); for (int peer : info.connectedPeers) { - if (channelType == ChannelType::SM) { - context.smChannels.emplace_back(context.smSemaphores[index++], - context.registeredMemories[{info.dstBufferType, peer}], src, nullptr); - } else if (channelType == ChannelType::PROXY) { - context.proxyChannels.emplace_back(context.proxyService->proxyChannel( + if (channelType == ChannelType::MEMORY) { + context.memoryChannels.emplace_back(context.memorySemaphores[index++], + context.registeredMemories[{info.dstBufferType, peer}], src, nullptr); + } else if (channelType == ChannelType::PORT) { + context.portChannels.emplace_back(context.proxyService->portChannel( context.proxySemaphores[index++], context.proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), context.proxyService->addMemory(localMemory))); @@ -366,15 +366,15 @@ struct Executor::Impl { DeviceExecutionPlan deviceExecutionPlan = {}; std::vector ops = plan.impl_->getOperations(rank, threadblock); deviceExecutionPlan.nOperations = ops.size(); - deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size(); - deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size(); + deviceExecutionPlan.nMemoryChannels = plan.impl_->threadblockMemoryChannelMap.at(rank).at(threadblock).size(); + deviceExecutionPlan.nPortChannels = plan.impl_->threadblockPortChannelMap.at(rank).at(threadblock).size(); int chanIndex = 0; - for (const auto& [index, _] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) { - deviceExecutionPlan.channels.smChannels[chanIndex++] = mscclpp::deviceHandle(context.smChannels[index]); + for (const auto& [index, _] : plan.impl_->threadblockMemoryChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.memoryChannels[chanIndex++] = mscclpp::deviceHandle(context.memoryChannels[index]); } chanIndex = 0; - for (const auto& [index, _] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) { - deviceExecutionPlan.channels.proxyChannels[chanIndex++] = mscclpp::deviceHandle(context.proxyChannels[index]); + for (const auto& [index, _] : plan.impl_->threadblockPortChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.portChannels[chanIndex++] = mscclpp::deviceHandle(context.portChannels[index]); } chanIndex = 0; for (const auto& [index, _] : plan.impl_->threadblockNvlsChannelMap.at(rank).at(threadblock)) { diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index f6ed215e1..87b1a69e1 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -4,9 +4,9 @@ #ifndef MSCCLPP_EXECUTION_COMMON_HPP_ #define MSCCLPP_EXECUTION_COMMON_HPP_ +#include #include -#include -#include +#include namespace mscclpp { @@ -23,8 +23,8 @@ enum class BufferType : uint8_t { enum class ChannelType : uint8_t { NONE, - SM, - PROXY, + MEMORY, + PORT, NVLS, }; @@ -53,8 +53,8 @@ enum class OperationType : uint8_t { }; struct Channels { - mscclpp::DeviceHandle 
smChannels[MAX_CHANNEL]; - mscclpp::DeviceHandle<ProxyChannel> proxyChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle<MemoryChannel> memoryChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle<PortChannel> portChannels[MAX_CHANNEL]; mscclpp::DeviceHandle nvlsChannels[MAX_CHANNEL]; }; @@ -97,8 +97,8 @@ struct Operation { // total size = 2304 + 6400 + 4 + 12(padding) = 8720 bytes struct __attribute__((aligned(16))) DeviceExecutionPlan { - uint8_t nSmChannels; // 1 bytes - uint8_t nProxyChannels; // 1 bytes + uint8_t nMemoryChannels; // 1 byte + uint8_t nPortChannels; // 1 byte uint16_t nOperations; // 2 bytes Channels channels; // 2304 bytes Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 98bed37eb..cea5fbf3b 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -9,9 +9,9 @@ #include #endif #include +#include #include -#include -#include +#include #include "execution_common.hpp" @@ -192,68 +192,71 @@ MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType b return nullptr; } -MSCCLPP_DEVICE_INLINE void handleSignal(DeviceHandle<SmChannel>* smChannels, DeviceHandle<ProxyChannel>* proxyChannels, - uint8_t* channelIndex, int nChannels, ChannelType chType) { +MSCCLPP_DEVICE_INLINE void handleSignal(DeviceHandle<MemoryChannel>* memoryChannels, + DeviceHandle<PortChannel>* portChannels, uint8_t* channelIndex, int nChannels, + ChannelType chType) { int tid = threadIdx.x; - if (tid < nChannels && chType == ChannelType::SM) { - smChannels[channelIndex[tid]].signal(); + if (tid < nChannels && chType == ChannelType::MEMORY) { + memoryChannels[channelIndex[tid]].signal(); return; } - if (tid < nChannels && chType == ChannelType::PROXY) { - proxyChannels[channelIndex[threadIdx.x]].signal(); + if (tid < nChannels && chType == ChannelType::PORT) { + portChannels[channelIndex[threadIdx.x]].signal(); } } -MSCCLPP_DEVICE_INLINE void handleWait(DeviceHandle<SmChannel>* smChannels, DeviceHandle<ProxyChannel>* proxyChannels, - uint8_t* channelIndexes, int nChannels, ChannelType chType) { +MSCCLPP_DEVICE_INLINE void handleWait(DeviceHandle<MemoryChannel>* memoryChannels, + DeviceHandle<PortChannel>* portChannels, uint8_t* channelIndexes, int nChannels, + ChannelType chType) { int tid = threadIdx.x; - if (tid < nChannels && chType == ChannelType::SM) { - smChannels[channelIndexes[tid]].wait(); + if (tid < nChannels && chType == ChannelType::MEMORY) { + memoryChannels[channelIndexes[tid]].wait(); return; } - if (tid < nChannels && chType == ChannelType::PROXY) { - proxyChannels[channelIndexes[tid]].wait(); + if (tid < nChannels && chType == ChannelType::PORT) { + portChannels[channelIndexes[tid]].wait(); } } -MSCCLPP_DEVICE_INLINE void handleFlush(DeviceHandle<ProxyChannel>* proxyChannels, uint8_t* channelIndexes, +MSCCLPP_DEVICE_INLINE void handleFlush(DeviceHandle<PortChannel>* portChannels, uint8_t* channelIndexes, int nChannels) { int tid = threadIdx.x; if (tid < nChannels) { - proxyChannels[channelIndexes[tid]].flush(); + portChannels[channelIndexes[tid]].flush(); } } -MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle<SmChannel>* smChannel, uint8_t* srcChannelIndexes, +MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle<MemoryChannel>* memoryChannel, uint8_t* srcChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int count, uint32_t size) { for (int i = 0; i < count; i++) { uint32_t dstOffset = dstOffsets[i]; uint32_t srcOffset = srcOffsets[i]; - smChannel[srcChannelIndexes[i]].get(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); + memoryChannel[srcChannelIndexes[i]].get(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); } } template
-MSCCLPP_DEVICE_INLINE void handlePut(DeviceHandle* smChannel, DeviceHandle* proxyChannels, - uint8_t* dstChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int count, - uint32_t size, ChannelType chType) { - if (chType == ChannelType::SM) { +MSCCLPP_DEVICE_INLINE void handlePut(DeviceHandle* memoryChannel, + DeviceHandle* portChannels, uint8_t* dstChannelIndexes, + uint32_t* dstOffsets, uint32_t* srcOffsets, int count, uint32_t size, + ChannelType chType) { + if (chType == ChannelType::MEMORY) { for (int i = 0; i < count; i++) { uint32_t dstOffset = dstOffsets[i]; uint32_t srcOffset = srcOffsets[i]; - smChannel[dstChannelIndexes[i]].put(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); + memoryChannel[dstChannelIndexes[i]].put(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); } return; } - if (chType == ChannelType::PROXY) { + if (chType == ChannelType::PORT) { int tid = threadIdx.x; if (tid < count) { if constexpr (PutWithSignal) { - proxyChannels[dstChannelIndexes[tid]].putWithSignal(dstOffsets[tid], srcOffsets[tid], size); + portChannels[dstChannelIndexes[tid]].putWithSignal(dstOffsets[tid], srcOffsets[tid], size); } else if constexpr (PutWithSignalAndFlush) { - proxyChannels[dstChannelIndexes[tid]].putWithSignalAndFlush(dstOffsets[tid], srcOffsets[tid], size); + portChannels[dstChannelIndexes[tid]].putWithSignalAndFlush(dstOffsets[tid], srcOffsets[tid], size); } else { - proxyChannels[dstChannelIndexes[tid]].put(dstOffsets[tid], srcOffsets[tid], size); + portChannels[dstChannelIndexes[tid]].put(dstOffsets[tid], srcOffsets[tid], size); } } } @@ -261,7 +264,8 @@ MSCCLPP_DEVICE_INLINE void handlePut(DeviceHandle* smChannel, DeviceH template MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOffsetByBytes, T* input, - uint32_t inputOffsetByBytes, DeviceHandle* smChannels, + uint32_t inputOffsetByBytes, + DeviceHandle* memoryChannels, uint8_t* dstChannelIndexes, uint8_t* srcChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int nDstChannels, int nSrcChannels, uint32_t size, bool sendToRemote = true) { @@ -275,14 +279,14 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOf for (int index = 0; index < nSrcChannels; ++index) { int4 val; size_t srcOffset = srcOffsets[index] / sizeof(int4); - val = smChannels[srcChannelIndexes[index]].read(srcOffset + idx); + val = memoryChannels[srcChannelIndexes[index]].read(srcOffset + idx); tmp = add_vectors(tmp, val); } output4[outputOffset4 + idx] = tmp; if (sendToRemote) { for (int index = 0; index < nDstChannels; ++index) { size_t dstOffset = dstOffsets[index] / sizeof(int4); - smChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); + memoryChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); } } } @@ -294,48 +298,48 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOf T tmp = input[idx]; for (int index = 0; index < nSrcChannels; ++index) { size_t srcOffset = srcOffsets[index] / sizeof(T); - tmp = add_elements(tmp, smChannels[srcChannelIndexes[index]].read(srcOffset + idx)); + tmp = add_elements(tmp, memoryChannels[srcChannelIndexes[index]].read(srcOffset + idx)); } output[idx] = tmp; if (sendToRemote) { for (int index = 0; index < nDstChannels; ++index) { size_t dstOffset = dstOffsets[index] / sizeof(T); - smChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); + memoryChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); } } } } template -MSCCLPP_DEVICE_INLINE void handlePutPacket(size_t 
scratchSize, DeviceHandle* smChannels, - DeviceHandle* proxyChannels, uint8_t* dstChannelIndexes, +MSCCLPP_DEVICE_INLINE void handlePutPacket(size_t scratchSize, DeviceHandle* memoryChannels, + DeviceHandle* portChannels, uint8_t* dstChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int nDstChannels, uint32_t size, ChannelType chType, uint32_t flag) { const size_t scratchBaseOffset = flag & 0x1 ? 0 : scratchSize >> 1; - if (chType == ChannelType::SM) { + if (chType == ChannelType::MEMORY) { for (int index = 0; index < nDstChannels; ++index) { - smChannels[dstChannelIndexes[index]].putPackets( + memoryChannels[dstChannelIndexes[index]].putPackets( scratchBaseOffset + dstOffsets[index] * 2, srcOffsets[index], size, threadIdx.x, blockDim.x, flag); } } - if (chType == ChannelType::PROXY) { + if (chType == ChannelType::PORT) { int tid = threadIdx.x; if (tid >= nDstChannels) { return; } - // For proxy channel, we assume src and dst are in packet format + // For port channel, we assume src and dst are in packet format uint32_t dstOffset = (dstOffsets[tid] << 1) + scratchBaseOffset; uint32_t srcOffset = (srcOffsets[tid] << 1) + scratchBaseOffset; - proxyChannels[dstChannelIndexes[tid]].put(dstOffset, srcOffset, size << 1); + portChannels[dstChannelIndexes[tid]].put(dstOffset, srcOffset, size << 1); } } template MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes, T* inputBuff, size_t inputBuffSize, uint32_t* inputOffsets, int nSrcs, - DeviceHandle* smChannels, uint8_t* outputChannelIndexes, - uint32_t* outputOffsets, int nDstChannels, size_t size, - uint32_t flag) { + DeviceHandle* memoryChannels, + uint8_t* outputChannelIndexes, uint32_t* outputOffsets, + int nDstChannels, size_t size, uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); const size_t intputBaseOffset = flag & 0x1 ? 
0 : inputBuffSize >> 1; const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketPayload); @@ -356,7 +360,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy PacketType pkt(data, flag); for (int index = 0; index < nDstChannels; ++index) { size_t offset = (intputBaseOffset + outputOffsets[index] * 2) / sizeof(PacketType); - smChannels[outputChannelIndexes[index]].write(offset + idx, pkt); + memoryChannels[outputChannelIndexes[index]].write(offset + idx, pkt); } } } @@ -385,9 +389,9 @@ MSCCLPP_DEVICE_INLINE void handleTransformToPacket(void* dst, void* src, size_t template MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes, - T* input, uint32_t* inputOffsets, DeviceHandle* smChannels, - uint8_t* outputChannelIndexes, uint32_t* outputOffsets, int nOutChannels, - uint32_t size) { + T* input, uint32_t* inputOffsets, + DeviceHandle* memoryChannels, uint8_t* outputChannelIndexes, + uint32_t* outputOffsets, int nOutChannels, uint32_t size) { const size_t nInt4 = size / sizeof(int4); const size_t srcOffset4 = srcOffsetByBytes / sizeof(int4); const size_t dstOffset4 = dstOffsetByBytes / sizeof(int4); @@ -404,7 +408,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T dst4[dstOffset4 + idx] = tmp; for (int index = 0; index < nOutChannels; ++index) { size_t offset = outputOffsets[index] / sizeof(int4); - smChannels[outputChannelIndexes[index]].write(offset + idx, tmp); + memoryChannels[outputChannelIndexes[index]].write(offset + idx, tmp); } } // handle rest of data @@ -420,7 +424,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T dst[idx] = tmp; for (int index = 0; index < nOutChannels; ++index) { size_t offset = outputOffsets[index] / sizeof(T); - smChannels[outputChannelIndexes[index]].write(offset + idx, tmp); + memoryChannels[outputChannelIndexes[index]].write(offset + idx, tmp); } } } @@ -492,8 +496,8 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu localPlan = (DeviceExecutionPlan*)sharedMem; int nOperations = localPlan->nOperations; Operation* operations = localPlan->operations; - DeviceHandle* smChannels = localPlan->channels.smChannels; - DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; + DeviceHandle* memoryChannels = localPlan->channels.memoryChannels; + DeviceHandle* portChannels = localPlan->channels.portChannels; [[maybe_unused]] DeviceHandle* nvlsChannels = localPlan->channels.nvlsChannels; @@ -534,22 +538,22 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu int syncStateIndex = op.deviceSyncerIndex; deviceSyncers[syncStateIndex].sync(nThreadBlocks); } else if (op.type == OperationType::SIGNAL) { - handleSignal(smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); + handleSignal(memoryChannels, portChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); } else if (op.type == OperationType::WAIT) { - handleWait(smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType); + handleWait(memoryChannels, portChannels, op.inputChannelIndexes, op.nInputs, op.channelType); } else if (op.type == OperationType::FLUSH) { - handleFlush(proxyChannels, op.outputChannelIndexes, op.nOutputs); + handleFlush(portChannels, op.outputChannelIndexes, op.nOutputs); } else if (op.type == OperationType::PUT) { - handlePut(smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, 
op.nOutputs, + handlePut(memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType); } else if (op.type == OperationType::PUT_WITH_SIGNAL) { - handlePut(smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, + handlePut(memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType); } else if (op.type == OperationType::PUT_WITH_SIGNAL_AND_FLUSH) { - handlePut(smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, + handlePut(memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType); } else if (op.type == OperationType::GET) { - handleGet(smChannels, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nInputs, op.size); + handleGet(memoryChannels, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nInputs, op.size); } else if (op.type == OperationType::COPY) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); @@ -557,30 +561,30 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu } else if (op.type == OperationType::READ_REDUCE_COPY_SEND) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, memoryChannels, op.outputChannelIndexes, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs, op.size); } else if (op.type == OperationType::READ_REDUCE_COPY) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, memoryChannels, op.outputChannelIndexes, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs, op.size, false); } else if (op.type == OperationType::PUT_PACKET) { - handlePutPacket(scratchSize, smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, + handlePutPacket(scratchSize, memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType, flag); } else if (op.type == OperationType::REDUCE_SEND_PACKET) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, op.inputOffsets, - op.nInputs, smChannels, op.outputChannelIndexes, op.outputOffsets, + op.nInputs, memoryChannels, op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size, flag); } else if (op.type == OperationType::REDUCE_PACKET) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, - op.inputOffsets, op.nInputs, smChannels, op.outputChannelIndexes, + op.inputOffsets, op.nInputs, memoryChannels, op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size, flag); } else if (op.type == OperationType::COPY_PACKET) { T* dst = 
getBuffer(input, output, scratch, op.dstBufferType); @@ -594,8 +598,8 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); T* tmp = getBuffer(input, output, scratch, op.inputBufferType); - handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, smChannels, op.outputChannelIndexes, - op.outputOffsets, op.nOutputs, op.size); + handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, memoryChannels, + op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size); } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 else if (op.type == OperationType::MULTI_LOAD_REDUCE_STORE) { diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 080a76883..66ed464d4 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -98,8 +98,8 @@ struct ExecutionPlan::Impl { // for nvls channels std::unordered_map> nvlsInfos; // threadblockChannelMap[rank][threadblock] = [channelIndex, channelKey] - std::unordered_map>>> threadblockSMChannelMap; - std::unordered_map>>> threadblockProxyChannelMap; + std::unordered_map>>> threadblockMemoryChannelMap; + std::unordered_map>>> threadblockPortChannelMap; std::unordered_map>>> threadblockNvlsChannelMap; std::unordered_map inputChunks; std::unordered_map outputChunks; diff --git a/src/sm_channel.cc b/src/memory_channel.cc similarity index 54% rename from src/sm_channel.cc rename to src/memory_channel.cc index a148595bf..3adce71fe 100644 --- a/src/sm_channel.cc +++ b/src/memory_channel.cc @@ -1,22 +1,22 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include +#include #include "api.h" #include "debug.h" namespace mscclpp { -MSCCLPP_API_CPP SmChannel::SmChannel(std::shared_ptr semaphore, RegisteredMemory dst, - void* src, void* getPacketBuffer) +MSCCLPP_API_CPP MemoryChannel::MemoryChannel(std::shared_ptr semaphore, + RegisteredMemory dst, void* src, void* getPacketBuffer) : semaphore_(semaphore), dst_(dst), src_(src), getPacketBuffer_(getPacketBuffer) { if (!dst.transports().has(Transport::CudaIpc)) { - throw Error("SmChannel: dst must be registered with CudaIpc", ErrorCode::InvalidUsage); + throw Error("MemoryChannel: dst must be registered with CudaIpc", ErrorCode::InvalidUsage); } } -MSCCLPP_API_CPP SmChannel::DeviceHandle SmChannel::deviceHandle() const { +MSCCLPP_API_CPP MemoryChannel::DeviceHandle MemoryChannel::deviceHandle() const { return DeviceHandle{.semaphore_ = semaphore_->deviceHandle(), .src_ = src_, .dst_ = dst_.data(), diff --git a/src/proxy_channel.cc b/src/port_channel.cc similarity index 68% rename from src/proxy_channel.cc rename to src/port_channel.cc index f2ca00674..e574af9fc 100644 --- a/src/proxy_channel.cc +++ b/src/port_channel.cc @@ -2,21 +2,21 @@ // Licensed under the MIT license. 
#include -#include +#include #include "api.h" #include "debug.h" namespace mscclpp { -MSCCLPP_API_CPP BaseProxyChannel::BaseProxyChannel(SemaphoreId semaphoreId, - std::shared_ptr semaphore, - std::shared_ptr proxy) +MSCCLPP_API_CPP BasePortChannel::BasePortChannel(SemaphoreId semaphoreId, + std::shared_ptr semaphore, + std::shared_ptr proxy) : semaphoreId_(semaphoreId), semaphore_(semaphore), proxy_(proxy) {} -MSCCLPP_API_CPP ProxyChannel::ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, - std::shared_ptr proxy, MemoryId dst, MemoryId src) - : BaseProxyChannel(semaphoreId, semaphore, proxy), dst_(dst), src_(src) {} +MSCCLPP_API_CPP PortChannel::PortChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, + std::shared_ptr proxy, MemoryId dst, MemoryId src) + : BasePortChannel(semaphoreId, semaphore, proxy), dst_(dst), src_(src) {} MSCCLPP_API_CPP ProxyService::ProxyService(size_t fifoSize) : proxy_(std::make_shared([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, @@ -46,12 +46,12 @@ MSCCLPP_API_CPP std::shared_ptr ProxyService::semaphore(Se return semaphores_[id]; } -MSCCLPP_API_CPP BaseProxyChannel ProxyService::baseProxyChannel(SemaphoreId id) { - return BaseProxyChannel(id, semaphores_[id], proxy_); +MSCCLPP_API_CPP BasePortChannel ProxyService::basePortChannel(SemaphoreId id) { + return BasePortChannel(id, semaphores_[id], proxy_); } -MSCCLPP_API_CPP ProxyChannel ProxyService::proxyChannel(SemaphoreId id, MemoryId dst, MemoryId src) { - return ProxyChannel(id, semaphores_[id], proxy_, dst, src); +MSCCLPP_API_CPP PortChannel ProxyService::portChannel(SemaphoreId id, MemoryId dst, MemoryId src) { + return PortChannel(id, semaphores_[id], proxy_, dst, src); } MSCCLPP_API_CPP void ProxyService::startProxy() { proxy_->start(); } @@ -95,13 +95,12 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger triggerRaw) { return result; } -MSCCLPP_API_CPP BaseProxyChannel::DeviceHandle BaseProxyChannel::deviceHandle() const { - return BaseProxyChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle()); +MSCCLPP_API_CPP BasePortChannel::DeviceHandle BasePortChannel::deviceHandle() const { + return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle()); } -MSCCLPP_API_CPP ProxyChannel::DeviceHandle ProxyChannel::deviceHandle() const { - return ProxyChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle(), dst_, - src_); +MSCCLPP_API_CPP PortChannel::DeviceHandle PortChannel::deviceHandle() const { + return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle(), dst_, src_); } } // namespace mscclpp diff --git a/src/semaphore.cc b/src/semaphore.cc index c6238b532..b03ff6736 100644 --- a/src/semaphore.cc +++ b/src/semaphore.cc @@ -91,8 +91,8 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { } } -MSCCLPP_API_CPP SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& communicator, - std::shared_ptr connection) +MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(Communicator& communicator, + std::shared_ptr connection) : BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), createGpuSemaphoreId()) { INFO(MSCCLPP_INIT, "Creating a Device2Device semaphore for %s transport from %d to %d", connection->getTransportName().c_str(), communicator.bootstrap()->getRank(), @@ -107,8 +107,8 @@ MSCCLPP_API_CPP 
SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& } } -MSCCLPP_API_CPP SmDevice2DeviceSemaphore::DeviceHandle SmDevice2DeviceSemaphore::deviceHandle() const { - SmDevice2DeviceSemaphore::DeviceHandle device; +MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSemaphore::deviceHandle() const { + MemoryDevice2DeviceSemaphore::DeviceHandle device; device.remoteInboundSemaphoreId = isRemoteInboundSemaphoreIdSet_ ? reinterpret_cast(remoteInboundSemaphoreIdsRegMem_.get().data()) : nullptr; diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu index 0f5d37759..836ee3e64 100644 --- a/test/allgather_test_cpp.cu +++ b/test/allgather_test_cpp.cu @@ -2,7 +2,7 @@ // Licensed under the MIT license. #include -#include +#include #ifdef MSCCLPP_USE_MPI_FOR_TESTS #include "mpi.h" @@ -40,25 +40,25 @@ static double getTime(void) { template using DeviceHandle = mscclpp::DeviceHandle; -__constant__ DeviceHandle constProxyChans[16]; +__constant__ DeviceHandle constPortChans[16]; -__device__ void allgather0(DeviceHandle proxyChan, int rank, size_t nelemsPerGPU) { +__device__ void allgather0(DeviceHandle portChan, int rank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall // this thread's role is a sender role // put your data asynchronously - if ((threadIdx.x % 32) == 0) proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + if ((threadIdx.x % 32) == 0) portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); // make sure everyone is put their data before some thread randomly blocks everyone else in signal __syncthreads(); // push with flag and sync to make sure the data is received - if ((threadIdx.x % 32) == 0) proxyChan.flush(); + if ((threadIdx.x % 32) == 0) portChan.flush(); // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + if ((threadIdx.x % 32) == 0) portChan.wait(); } -__device__ void localAllGather(DeviceHandle proxyChan, int rank, int nranksPerNode, - int remoteRank, uint64_t offset, uint64_t size) { +__device__ void localAllGather(DeviceHandle portChan, int rank, int nranksPerNode, int remoteRank, + uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode // and waits for data from GPU rank (i-1) % nranksPerNode @@ -68,11 +68,11 @@ __device__ void localAllGather(DeviceHandle proxyChan, in for (int i = 1; i < nranksPerNode; i++) { if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) { // put your data to GPU (rank+i) % nranksPerNode and signal in one call - if ((threadIdx.x % 32) == 0) proxyChan.putWithSignal(offset, size); + if ((threadIdx.x % 32) == 0) portChan.putWithSignal(offset, size); } // wait for the data from GPU (rank-i) % nranksPerNode to arrive if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) { - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + if ((threadIdx.x % 32) == 0) portChan.wait(); } #if defined(__HIP_PLATFORM_AMD__) // NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct. 
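For context on the renamed host-side path used throughout these test hunks: a kernel only ever sees a `DeviceHandle<mscclpp::PortChannel>`; the host builds the channel through `ProxyService` and copies the handle to the GPU. Below is a minimal sketch of that flow, assuming a `Communicator`, an established `Connection`, and registered local/remote memories already exist; the helper name `makePortChannelHandle` is illustrative, not part of the API.

```cpp
#include <memory>

#include <mscclpp/core.hpp>
#include <mscclpp/port_channel.hpp>

// Illustrative helper mirroring setupMscclppConnections() below: build one
// PortChannel over an existing connection and hand back its device handle.
mscclpp::DeviceHandle<mscclpp::PortChannel> makePortChannelHandle(
    mscclpp::Communicator& comm, mscclpp::ProxyService& proxyService,
    std::shared_ptr<mscclpp::Connection> conn, mscclpp::RegisteredMemory remoteMem,
    mscclpp::RegisteredMemory localMem) {
  // Host-side semaphore that the proxy signals/polls on behalf of the GPU.
  mscclpp::SemaphoreId sid = proxyService.buildAndAddSemaphore(comm, conn);
  // Register destination (remote) and source (local) memories with the proxy.
  mscclpp::MemoryId dst = proxyService.addMemory(remoteMem);
  mscclpp::MemoryId src = proxyService.addMemory(localMem);
  // The renamed API: ProxyService::portChannel() replaces proxyChannel().
  return proxyService.portChannel(sid, dst, src).deviceHandle();
}
```

The caller then copies the handle into device memory (e.g., `cudaMemcpyToSymbol` into a `__constant__` array, as these tests do) and calls `proxyService.startProxy()` before launching any kernel that uses it.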
@@ -83,15 +83,15 @@ __device__ void localAllGather(DeviceHandle proxyChan, in } } -__device__ void allgather1(DeviceHandle proxyChan, int rank, int nranksPerNode, int remoteRank, +__device__ void allgather1(DeviceHandle portChan, int rank, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); if (remoteRank / nranksPerNode == rank / nranksPerNode) - if ((threadIdx.x % 32) == 0) proxyChan.flush(); + if ((threadIdx.x % 32) == 0) portChan.flush(); } -__device__ void allgather2(DeviceHandle proxyChan, int rank, int world_size, int nranksPerNode, +__device__ void allgather2(DeviceHandle portChan, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: @@ -108,16 +108,16 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra // Step 1 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } // cross-node exchange if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), - (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + if ((threadIdx.x % 32) == 0) portChan.wait(); } __syncthreads(); @@ -126,7 +126,7 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra // local allgather int otherNghr = (rank + nranksPerNode) % world_size; if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); } @@ -134,9 +134,9 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - proxyChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), - nelemsPerGPU / pipelineSize * sizeof(int)); - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); + if ((threadIdx.x % 32) == 0) portChan.wait(); } __syncthreads(); @@ -144,29 +144,29 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra // Step 3 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, + localAllGather(portChan, rank, nranksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } if (remoteRank / nranksPerNode == rank / nranksPerNode || remoteRank % nranksPerNode == rank % nranksPerNode) { 
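// flush() returns only after the host proxy has completed every put
// previously issued on this channel, so the kernel cannot exit with
// transfers still in flight.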
- if ((threadIdx.x % 32) == 0) proxyChan.flush(); + if ((threadIdx.x % 32) == 0) portChan.flush(); } } __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelemsPerGPU, int kernel) { - // find the mapping between remoteRank and proxyChans + // find the mapping between remoteRank and portChans int warpId = threadIdx.x / 32; int remoteRank = (warpId < rank) ? warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; if (kernel == 0) - allgather0(proxyChan, rank, nelemsPerGPU); + allgather0(portChan, rank, nelemsPerGPU); else if (kernel == 1) - allgather1(proxyChan, rank, nranksPerNode, remoteRank, nelemsPerGPU); + allgather1(portChan, rank, nranksPerNode, remoteRank, nelemsPerGPU); else if (kernel == 2) - allgather2(proxyChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); + allgather2(portChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); } int rankToLocalRank(int rank) { return rank % nranksPerNode; } @@ -234,17 +234,17 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co comm.setup(); - std::vector> proxyChannels; + std::vector> portChannels; for (size_t i = 0; i < semaphoreIds.size(); ++i) { - proxyChannels.push_back(mscclpp::deviceHandle(proxyService.proxyChannel( + portChannels.push_back(mscclpp::deviceHandle(proxyService.portChannel( semaphoreIds[i], proxyService.addMemory(remoteMemories[i].get()), proxyService.addMemory(localMemories[i])))); } - if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle)) { + if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle)) { std::runtime_error("unexpected error"); } - CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(), - sizeof(DeviceHandle) * proxyChannels.size())); + CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(), + sizeof(DeviceHandle) * portChannels.size())); } void printUsage(const char* prog, bool isMpi) { diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt index 8e37d2405..007e3e6dd 100644 --- a/test/mp_unit/CMakeLists.txt +++ b/test/mp_unit/CMakeLists.txt @@ -6,7 +6,7 @@ target_sources(mp_unit_tests PRIVATE bootstrap_tests.cc ib_tests.cu communicator_tests.cu - proxy_channel_tests.cu - sm_channel_tests.cu + port_channel_tests.cu + memory_channel_tests.cu executor_tests.cc ) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu similarity index 64% rename from test/mp_unit/sm_channel_tests.cu rename to test/mp_unit/memory_channel_tests.cu index af4aa2985..daa0423d6 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/memory_channel_tests.cu @@ -5,7 +5,7 @@ #include "mp_unit_tests.hpp" -void SmChannelOneToOneTest::SetUp() { +void MemoryChannelOneToOneTest::SetUp() { // Need at least two ranks within a node if (gEnv->nRanksPerNode < 2) { GTEST_SKIP(); @@ -15,10 +15,11 @@ void SmChannelOneToOneTest::SetUp() { CommunicatorTestBase::SetUp(); } -void SmChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } +void MemoryChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } -void SmChannelOneToOneTest::setupMeshConnections(std::vector& smChannels, void* inputBuff, - size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes) { +void MemoryChannelOneToOneTest::setupMeshConnections(std::vector& memoryChannels, + void* inputBuff, size_t inputBuffBytes, void* outputBuff, + size_t 
outputBuffBytes) { const int rank = communicator->bootstrap()->getRank(); const int worldSize = communicator->bootstrap()->getNranks(); const bool isInPlace = (outputBuff == nullptr); @@ -59,34 +60,35 @@ void SmChannelOneToOneTest::setupMeshConnections(std::vector } connections[r] = connectionFutures[r].get(); - smSemaphores[r] = std::make_shared(*communicator, connections[r]); + memorySemaphores[r] = std::make_shared(*communicator, connections[r]); - smChannels.emplace_back(smSemaphores[r], remoteMemFutures[r].get(), inputBufRegMem.data(), - (isInPlace ? nullptr : outputBufRegMem.data())); + memoryChannels.emplace_back(memorySemaphores[r], remoteMemFutures[r].get(), inputBufRegMem.data(), + (isInPlace ? nullptr : outputBufRegMem.data())); } communicator->setup(); } -__constant__ DeviceHandle gChannelOneToOneTestConstSmChans; +__constant__ DeviceHandle gChannelOneToOneTestConstMemChans; -void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, PacketPingPongKernelWrapper kernelWrapper) { +void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName, + PacketPingPongKernelWrapper kernelWrapper) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; const int defaultNTries = 1000; - std::vector smChannels; + std::vector memoryChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); std::shared_ptr intermBuff = mscclpp::GpuBuffer(nElem * 2).memory(); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + setupMeshConnections(memoryChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); + std::vector> deviceHandles(memoryChannels.size()); + std::transform(memoryChannels.begin(), memoryChannels.end(), deviceHandles.begin(), + [](const mscclpp::MemoryChannel& memChan) { return mscclpp::deviceHandle(memChan); }); - ASSERT_EQ(smChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(memoryChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstMemChans, deviceHandles.data(), + sizeof(DeviceHandle))); std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); @@ -125,15 +127,15 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe } } -__global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; +__global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) { + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* sendBuff = (volatile int*)buff; int nTries = 1000; int rank1Offset = 10000000; for (int i = 0; i < nTries; i++) { if (rank == 0) { if (i > 0) { - if (threadIdx.x == 0) smChan.wait(); + if (threadIdx.x == 0) memChan.wait(); __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (sendBuff[j] != rank1Offset + i - 1 + j) { @@ -147,11 +149,11 @@ __global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { sendBuff[j] = i + j; } __syncthreads(); - smChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); - if (threadIdx.x == 0) smChan.signal(); + memChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); + if 
(threadIdx.x == 0) memChan.signal(); } if (rank == 1) { - if (threadIdx.x == 0) smChan.wait(); + if (threadIdx.x == 0) memChan.wait(); __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (sendBuff[j] != i + j) { @@ -165,59 +167,59 @@ __global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { sendBuff[j] = rank1Offset + i + j; } __syncthreads(); - smChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); - if (threadIdx.x == 0) smChan.signal(); + memChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); + if (threadIdx.x == 0) memChan.signal(); } } } } -TEST_F(SmChannelOneToOneTest, PutPingPong) { +TEST_F(MemoryChannelOneToOneTest, PutPingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector smChannels; + std::vector memoryChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + setupMeshConnections(memoryChannels, buff.get(), nElem * sizeof(int)); + std::vector> deviceHandles(memoryChannels.size()); + std::transform(memoryChannels.begin(), memoryChannels.end(), deviceHandles.begin(), + [](const mscclpp::MemoryChannel& memChan) { return mscclpp::deviceHandle(memChan); }); - ASSERT_EQ(smChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(memoryChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstMemChans, deviceHandles.data(), + sizeof(DeviceHandle))); std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); } -__global__ void kernelSmGetPingPong(int* buff, int rank, int nElem, int* ret) { +__global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) { if (rank > 1) return; - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* buffPtr = (volatile int*)buff; int offset0 = (rank == 0) ? 0 : 10000000; int offset1 = (rank == 0) ? 
10000000 : 0; @@ -231,14 +233,14 @@ __global__ void kernelSmGetPingPong(int* buff, int rank, int nElem, int* ret) { buffPtr[j] = offset0 + i + j; } if (threadIdx.x == 0) { - smChan.signal(); + memChan.signal(); } } else { if (threadIdx.x == 0) { - smChan.wait(); + memChan.wait(); } __syncthreads(); - smChan.get(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); + memChan.get(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (buffPtr[j] != offset1 + i + j) { @@ -251,52 +253,52 @@ __global__ void kernelSmGetPingPong(int* buff, int rank, int nElem, int* ret) { } } -TEST_F(SmChannelOneToOneTest, GetPingPong) { +TEST_F(MemoryChannelOneToOneTest, GetPingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector smChannels; + std::vector memoryChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + setupMeshConnections(memoryChannels, buff.get(), nElem * sizeof(int)); + std::vector> deviceHandles(memoryChannels.size()); + std::transform(memoryChannels.begin(), memoryChannels.end(), deviceHandles.begin(), + [](const mscclpp::MemoryChannel& memChan) { return mscclpp::deviceHandle(memChan); }); ASSERT_EQ(deviceHandles.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstMemChans, deviceHandles.data(), + sizeof(DeviceHandle))); std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); } -__global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { +__global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* sendBuff = (volatile int*)buff; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 
10000000 : 0; @@ -312,9 +314,9 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { @@ -331,10 +333,10 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r } } -__global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { +__global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* sendBuff = (volatile int*)buff; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 10000000 : 0; @@ -349,9 +351,9 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). 
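// (Each LL packet pairs its payload with a flag word; getPackets() spins on
// that flag per packet, and a fresh flag value per round keeps stale packets
// from the previous iteration from reading as valid.)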
// __syncthreads(); for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { @@ -374,16 +376,16 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* } } -TEST_F(SmChannelOneToOneTest, LL8PacketPingPong) { - auto kernelSmLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { - kernelSmLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); +TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) { + auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; - packetPingPongTest("smLL8PacketPingPong", kernelSmLL8PacketPingPongWrapper); + packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper); } -TEST_F(SmChannelOneToOneTest, LL16PacketPingPong) { - auto kernelSmLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { - kernelSmLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); +TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) { + auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; - packetPingPongTest("smLL16PacketPingPong", kernelSmLL16PacketPingPongWrapper); + packetPingPongTest("memoryLL16PacketPingPong", kernelMemLL16PacketPingPongWrapper); } diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index c00ecb6b6..a2d8ac74f 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -8,9 +8,9 @@ #include #include +#include #include -#include -#include +#include #include #include "ib.hpp" @@ -128,7 +128,7 @@ class CommunicatorTest : public CommunicatorTestBase { template using DeviceHandle = mscclpp::DeviceHandle; -class ProxyChannelOneToOneTest : public CommunicatorTestBase { +class PortChannelOneToOneTest : public CommunicatorTestBase { protected: struct PingPongTestParams { bool useIPC; @@ -140,9 +140,8 @@ class ProxyChannelOneToOneTest : public CommunicatorTestBase { void SetUp() override; void TearDown() override; - void setupMeshConnections(std::vector& proxyChannels, bool useIPC, bool useIb, - bool useEthernet, void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, - size_t recvBuffBytes = 0); + void setupMeshConnections(std::vector& portChannels, bool useIPC, bool useIb, bool useEthernet, + void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, size_t recvBuffBytes = 0); void testPingPong(PingPongTestParams params); void testPingPongPerf(PingPongTestParams params); void testPacketPingPong(bool useIbOnly); @@ -151,17 +150,17 @@ class ProxyChannelOneToOneTest : public CommunicatorTestBase { std::shared_ptr proxyService; }; -class SmChannelOneToOneTest : public CommunicatorTestBase { +class MemoryChannelOneToOneTest : public CommunicatorTestBase { protected: void SetUp() override; void TearDown() override; - void setupMeshConnections(std::vector& smChannels, void* inputBuff, size_t inputBuffBytes, + void setupMeshConnections(std::vector& memoryChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff = nullptr, size_t outputBuffBytes = 0); using PacketPingPongKernelWrapper = std::function; void packetPingPongTest(const std::string testName, PacketPingPongKernelWrapper kernelWrapper); - std::unordered_map> smSemaphores; + std::unordered_map> memorySemaphores; }; class ExecutorTest : public MultiProcessTest { diff --git 
a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/port_channel_tests.cu similarity index 74% rename from test/mp_unit/proxy_channel_tests.cu rename to test/mp_unit/port_channel_tests.cu index 192985b47..f49c23306 100644 --- a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -6,18 +6,18 @@ #include "mp_unit_tests.hpp" -void ProxyChannelOneToOneTest::SetUp() { +void PortChannelOneToOneTest::SetUp() { // Use only two ranks setNumRanksToUse(2); CommunicatorTestBase::SetUp(); proxyService = std::make_shared(); } -void ProxyChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } +void PortChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } -void ProxyChannelOneToOneTest::setupMeshConnections(std::vector& proxyChannels, bool useIPC, - bool useIb, bool useEthernet, void* sendBuff, size_t sendBuffBytes, - void* recvBuff, size_t recvBuffBytes) { +void PortChannelOneToOneTest::setupMeshConnections(std::vector& portChannels, bool useIPC, + bool useIb, bool useEthernet, void* sendBuff, size_t sendBuffBytes, + void* recvBuff, size_t recvBuffBytes) { const int rank = communicator->bootstrap()->getRank(); const int worldSize = communicator->bootstrap()->getNranks(); const bool isInPlace = (recvBuff == nullptr); @@ -64,17 +64,17 @@ void ProxyChannelOneToOneTest::setupMeshConnections(std::vectorbuildAndAddSemaphore(*communicator, connectionFutures[r].get()); - proxyChannels.emplace_back(proxyService->proxyChannel(cid, proxyService->addMemory(remoteMemFutures[r].get()), - proxyService->addMemory(sendBufRegMem))); + portChannels.emplace_back(proxyService->portChannel(cid, proxyService->addMemory(remoteMemFutures[r].get()), + proxyService->addMemory(sendBufRegMem))); } communicator->setup(); } -__constant__ DeviceHandle gChannelOneToOneTestConstProxyChans; +__constant__ DeviceHandle gChannelOneToOneTestConstPortChans; __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWithPoll, int nTries, int* ret) { - DeviceHandle& proxyChan = gChannelOneToOneTestConstProxyChans; + DeviceHandle& portChan = gChannelOneToOneTestConstPortChans; volatile int* sendBuff = (volatile int*)buff; int flusher = 0; int rank1Offset = 10000000; @@ -84,7 +84,7 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit if (threadIdx.x == 0) { if (waitWithPoll) { int spin = 1000000; - while (!proxyChan.poll() && spin > 0) { + while (!portChan.poll() && spin > 0) { spin--; } if (spin == 0) { @@ -92,7 +92,7 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit *ret = 1; } } else { - proxyChan.wait(); + portChan.wait(); } } __syncthreads(); @@ -109,13 +109,13 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit } __syncthreads(); // __threadfence_system(); // not necessary if we make sendBuff volatile - if (threadIdx.x == 0) proxyChan.putWithSignal(0, nElem * sizeof(int)); + if (threadIdx.x == 0) portChan.putWithSignal(0, nElem * sizeof(int)); } if (rank == 1) { if (threadIdx.x == 0) { if (waitWithPoll) { int spin = 1000000; - while (!proxyChan.poll() && spin > 0) { + while (!portChan.poll() && spin > 0) { spin--; } if (spin == 0) { @@ -123,7 +123,7 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit *ret = 1; } } else { - proxyChan.wait(); + portChan.wait(); } } __syncthreads(); @@ -140,32 +140,32 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit } __syncthreads(); // __threadfence_system(); // not 
necessary if we make sendBuff volatile - if (threadIdx.x == 0) proxyChan.putWithSignal(0, nElem * sizeof(int)); + if (threadIdx.x == 0) portChan.putWithSignal(0, nElem * sizeof(int)); } } flusher++; if (flusher == 1) { - if (threadIdx.x == 0) proxyChan.flush(); + if (threadIdx.x == 0) portChan.flush(); flusher = 0; } } } -void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { +void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); - std::vector> proxyChannelHandles; - for (auto& ch : proxyChannels) proxyChannelHandles.push_back(ch.deviceHandle()); + std::vector> portChannelHandles; + for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); - ASSERT_EQ(proxyChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(portChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); proxyService->startProxy(); @@ -196,21 +196,21 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { proxyService->stopProxy(); } -void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { +void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); - std::vector> proxyChannelHandles; - for (auto& ch : proxyChannels) proxyChannelHandles.push_back(ch.deviceHandle()); + std::vector> portChannelHandles; + for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); - ASSERT_EQ(proxyChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(portChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); proxyService->startProxy(); @@ -240,46 +240,46 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { proxyService->stopProxy(); } -TEST_F(ProxyChannelOneToOneTest, PingPong) { +TEST_F(PortChannelOneToOneTest, PingPong) { testPingPong(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongIb) { +TEST_F(PortChannelOneToOneTest, PingPongIb) { testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongEthernet) { +TEST_F(PortChannelOneToOneTest, PingPongEthernet) { testPingPong(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = 
false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongWithPoll) { +TEST_F(PortChannelOneToOneTest, PingPongWithPoll) { testPingPong(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = true}); } -TEST_F(ProxyChannelOneToOneTest, PingPongIbWithPoll) { +TEST_F(PortChannelOneToOneTest, PingPongIbWithPoll) { testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true}); } -TEST_F(ProxyChannelOneToOneTest, PingPongPerf) { +TEST_F(PortChannelOneToOneTest, PingPongPerf) { testPingPongPerf(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongPerfIb) { +TEST_F(PortChannelOneToOneTest, PingPongPerfIb) { testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongPerfEthernet) { +TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) { testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false}); } -__device__ mscclpp::DeviceSyncer gChannelOneToOneTestProxyChansSyncer; +__device__ mscclpp::DeviceSyncer gChannelOneToOneTestPortChansSyncer; template __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, mscclpp::LLPacket* getPktBuf, int rank, int nElem, int nTries, int* ret) { if (rank > 1) return; - DeviceHandle& proxyChan = gChannelOneToOneTestConstProxyChans; + DeviceHandle& portChan = gChannelOneToOneTestConstPortChans; volatile int* buffPtr = (volatile int*)buff; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 10000000 : 0; @@ -302,14 +302,14 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m // __syncthreads(); } mscclpp::putPackets(putPktBuf, 0, buff, 0, nElem * sizeof(int), threadId, numThreads, flag); - gChannelOneToOneTestProxyChansSyncer.sync(gridDim.x); + gChannelOneToOneTestPortChansSyncer.sync(gridDim.x); if (threadId == 0) { // Send data from the local putPacketBuffer to the remote getPacketBuffer - proxyChan.put(0, nPkt * sizeof(mscclpp::LLPacket)); + portChan.put(0, nPkt * sizeof(mscclpp::LLPacket)); } flusher++; if (flusher == 64) { - if (threadId == 0) proxyChan.flush(); + if (threadId == 0) portChan.flush(); flusher = 0; } } else { @@ -333,38 +333,38 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m } } // Make sure all threads are done in this iteration - gChannelOneToOneTestProxyChansSyncer.sync(gridDim.x); + gChannelOneToOneTestPortChansSyncer.sync(gridDim.x); } } } -void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { +void PortChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); auto putPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); - setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), + setupMeshConnections(portChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); - ASSERT_EQ(proxyChannels.size(), 1); + ASSERT_EQ(portChannels.size(), 1); - std::vector> 
proxyChannelHandles; - for (auto& proxyChannel : proxyChannels) { - proxyChannelHandles.push_back(proxyChannel.deviceHandle()); + std::vector> portChannelHandles; + for (auto& portChannel : portChannels) { + portChannelHandles.push_back(portChannel.deviceHandle()); } - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); mscclpp::DeviceSyncer syncer = {}; - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestProxyChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestPortChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); proxyService->startProxy(); @@ -405,33 +405,33 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { proxyService->stopProxy(); } -void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { +void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); auto putPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); - setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), + setupMeshConnections(portChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); - ASSERT_EQ(proxyChannels.size(), 1); + ASSERT_EQ(portChannels.size(), 1); - std::vector> proxyChannelHandles; - for (auto& proxyChannel : proxyChannels) { - proxyChannelHandles.push_back(proxyChannel.deviceHandle()); + std::vector> portChannelHandles; + for (auto& portChannel : portChannels) { + portChannelHandles.push_back(portChannel.deviceHandle()); } - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); mscclpp::DeviceSyncer syncer = {}; - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestProxyChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestPortChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); proxyService->startProxy(); @@ -461,10 +461,10 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { proxyService->stopProxy(); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); } +TEST_F(PortChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); } +TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); } diff --git a/test/mscclpp-test/allgather_test.cu 
b/test/mscclpp-test/allgather_test.cu index 27506f340..17319bc9e 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -21,37 +21,37 @@ constexpr uint64_t MAGIC = 0xdeadbeef; template using DeviceHandle = mscclpp::DeviceHandle; -__constant__ DeviceHandle constProxyChans[16]; -__constant__ DeviceHandle constRawProxyChan[16]; +__constant__ DeviceHandle constPortChans[16]; +__constant__ DeviceHandle constRawPortChan[16]; -__constant__ DeviceHandle constSmChans[512]; -__constant__ DeviceHandle constSmOutOfPlaceChans[16]; +__constant__ DeviceHandle constMemChans[512]; +__constant__ DeviceHandle constMemOutOfPlaceChans[16]; __device__ uint64_t globalFlag; __global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) { int warpId = threadIdx.x / WARP_SIZE; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; // this allgather is really simple and implemented as an alltoall // this thread's role is a sender role // put your data asynchronously if (threadIdx.x % WARP_SIZE == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } // make sure everyone is put their data before some thread randomly blocks everyone else in signal __syncthreads(); // push with flag and sync to make sure the data is received - if (threadIdx.x % WARP_SIZE == 0) proxyChan.flush(); + if (threadIdx.x % WARP_SIZE == 0) portChan.flush(); // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready - if (threadIdx.x % WARP_SIZE == 0) proxyChan.wait(); + if (threadIdx.x % WARP_SIZE == 0) portChan.wait(); } -__device__ void localAllGather(DeviceHandle proxyChan, int rank, int nRanksPerNode, - int remoteRank, uint64_t offset, uint64_t size, bool flushAfterSignal = true) { +__device__ void localAllGather(DeviceHandle portChan, int rank, int nRanksPerNode, int remoteRank, + uint64_t offset, uint64_t size, bool flushAfterSignal = true) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nRanksPerNode // and waits for data from GPU rank (i-1) % nRanksPerNode @@ -61,12 +61,12 @@ __device__ void localAllGather(DeviceHandle proxyChan, in for (int i = 1; i < nRanksPerNode; i++) { if ((remoteRank % nRanksPerNode) == ((rank + i) % nRanksPerNode)) { // put your data to GPU (rank+i) % nRanksPerNode and signal in one call - if (flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) proxyChan.putWithSignalAndFlush(offset, size); - if (!flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) proxyChan.putWithSignal(offset, size); + if (flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) portChan.putWithSignalAndFlush(offset, size); + if (!flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) portChan.putWithSignal(offset, size); } // wait for the data from GPU (rank-i) % nRanksPerNode to arrive if ((remoteRank % nRanksPerNode) == ((rank - i + nRanksPerNode) % nRanksPerNode)) { - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.wait(); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait(); } #if defined(__HIP_PLATFORM_AMD__) // NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct. 
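The putWithSignal/wait/flush handshake that allgather0 through allgather2 build on can be read in isolation. Here is a minimal sketch, assuming `constPortChans` has been populated from the host as in the setup code for these tests; `WARP_SIZE` is assumed to be 32 and the kernel name `pingPeer` is illustrative.

```cpp
#include <mscclpp/port_channel.hpp>

#define WARP_SIZE 32

// Assumed populated via cudaMemcpyToSymbol from the host, as in the setup
// code for these tests.
__constant__ mscclpp::DeviceHandle<mscclpp::PortChannel> constPortChans[16];

// Illustrative kernel: one warp per peer, and a single lane per warp issues
// the port-channel calls, mirroring allgather0() above.
__global__ void pingPeer(int rank, size_t bytesPerRank) {
  mscclpp::DeviceHandle<mscclpp::PortChannel>& portChan = constPortChans[threadIdx.x / WARP_SIZE];
  if (threadIdx.x % WARP_SIZE == 0) {
    // Ask the proxy to copy this rank's chunk to the peer and raise the
    // peer's semaphore once the copy completes.
    portChan.putWithSignal(rank * bytesPerRank, bytesPerRank);
    // Block until the proxy has drained the request, keeping outstanding
    // work bounded.
    portChan.flush();
    // Wait on the local semaphore for the peer's symmetric putWithSignal().
    portChan.wait();
  }
}
```

Under the hood each of these calls is a push onto the proxy FIFO; allgather3() below makes that explicit by driving `portChan.fifo_` directly.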
@@ -80,8 +80,8 @@ __device__ void localAllGather(DeviceHandle proxyChan, in __device__ mscclpp::DeviceSyncer deviceSyncer; // This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1). -__device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk, - uint64_t rankChunkSize, uint64_t size, size_t nBlocks) { +__device__ void localAllGatherMem(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk, + uint64_t rankChunkSize, uint64_t size, size_t nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; const size_t nPeer = nRanksPerNode - 1; @@ -117,12 +117,12 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk sizeForThisBlock += lastChunkSize; } if (threadIdx.x == 0 && peerLocalBlockIdx == 0) { - constSmChans[peerIdx].signal(); - constSmChans[peerIdx].wait(); + constMemChans[peerIdx].signal(); + constMemChans[peerIdx].wait(); } deviceSyncer.sync(nBlocks); size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk; - constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); + constMemChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); } __global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) { @@ -130,9 +130,9 @@ __global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, int remoteRank = (warpId < rank) ? warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } @@ -141,7 +141,7 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int int remoteRank = (warpId < rank) ? 
warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: @@ -158,16 +158,16 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int // Step 1 // local allgather if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int), false); } // cross-node exchange if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { // opposite side if ((threadIdx.x % WARP_SIZE) == 0) - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), - (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.wait(); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait(); } // sync here to make sure IB flush dose not block the CUDA IPC traffic @@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int // need to flush ib channel here to avoid cq overflow. since we won't change send suffer after send, we don't need // to flush for IPC channel. if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.flush(); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush(); } __syncthreads(); @@ -183,7 +183,7 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int // local allgather int otherNghr = (rank + nRanksPerNode) % worldSize; if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int), false); } @@ -191,21 +191,21 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { // opposite side if ((threadIdx.x % WARP_SIZE) == 0) - proxyChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), - nelemsPerGPU / pipelineSize * sizeof(int)); - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.wait(); + portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait(); } __syncthreads(); if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.flush(); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush(); } __syncthreads(); // Step 3 // local allgather if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, + localAllGather(portChan, rank, nRanksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } @@ -215,7 +215,7 @@ __global__ void __launch_bounds__(1024) allgather3() { int warpId = threadIdx.x / WARP_SIZE; // 
Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constRawProxyChan[warpId]; + DeviceHandle portChan = constRawPortChan[warpId]; int tid = threadIdx.x; __syncthreads(); @@ -224,12 +224,12 @@ __global__ void __launch_bounds__(1024) allgather3() { trigger.fst = MAGIC; trigger.snd = 0; // offload all the work to the proxy - uint64_t currentFifoHead = proxyChan.fifo_.push(trigger); + uint64_t currentFifoHead = portChan.fifo_.push(trigger); // wait for the work to be done in cpu side - proxyChan.fifo_.sync(currentFifoHead); + portChan.fifo_.sync(currentFifoHead); } if (tid % WARP_SIZE == 0) { - proxyChan.wait(); + portChan.wait(); } } @@ -248,14 +248,14 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int int peerRank = (rank + nRanksPerNode) % worldSize; int peerNodeId = peerRank / nRanksPerNode; int peer = (peerRank < rank) ? peerRank : peerRank - 1; - DeviceHandle& proxyChan = constProxyChans[peer]; + DeviceHandle& portChan = constPortChans[peer]; const size_t nBlocksForLocalAllGather = gridDim.x; const size_t rankChunkSize = nelemsPerGPU * sizeof(int); const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode; const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode; if (peerNodeId == rank / nRanksPerNode) { - localAllGatherSm(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather); return; } @@ -266,29 +266,29 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int // Step 1 if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); } - localAllGatherSm(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, - nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(nBlocksForLocalAllGather); // Step 2 if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); } - localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(nBlocksForLocalAllGather); // Step 3 - localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, + nBlocksForLocalAllGather); } __global__ void __launch_bounds__(1024, 1) @@ -304,11 +304,11 @@ __global__ void __launch_bounds__(1024, 1) const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = constSmChans + chanOffset; + auto memChans = constMemChans + chanOffset; if (wid < nPeer && 
@@ -248,14 +248,14 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int
   int peerRank = (rank + nRanksPerNode) % worldSize;
   int peerNodeId = peerRank / nRanksPerNode;
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constProxyChans[peer];
+  DeviceHandle& portChan = constPortChans[peer];
   const size_t nBlocksForLocalAllGather = gridDim.x;
   const size_t rankChunkSize = nelemsPerGPU * sizeof(int);
   const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode;
   const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode;
   if (peerNodeId == rank / nRanksPerNode) {
-    localAllGatherSm(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather);
+    localAllGatherMem(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather);
     return;
   }
@@ -266,29 +266,29 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int

   // Step 1
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(nBlocksForLocalAllGather);
   // Step 2
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(nBlocksForLocalAllGather);
   // Step 3
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
+                    nBlocksForLocalAllGather);
 }

 __global__ void __launch_bounds__(1024, 1)
@@ -304,11 +304,11 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
-  auto smChans = constSmChans + chanOffset;
+  auto memChans = constMemChans + chanOffset;

   if (wid < nPeer && lid == 0) {
-    smChans[wid].relaxedSignal();
-    smChans[wid].wait();
+    memChans[wid].relaxedSignal();
+    memChans[wid].wait();
   }
   __syncthreads();
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -328,7 +328,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = wid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   for (size_t i = 1; i < nLoop; ++i) {
@@ -336,7 +336,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = gWid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   if (bytes % unitBytes > 0) {
@@ -349,7 +349,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE);
+      memChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE);
     }
   }
 }
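The get-based kernel above distributes work by assigning each warp a (peer, tile) pair. A condensed sketch of that indexing follows; the names `memChans` and `bytesPerGPU` are assumed for illustration, and the template arguments on `get<>()` select 16-byte vectorized copies with or without the remainder path, as in the hunks above.

```cpp
// Sketch: global warp w copies tile (w / nPeer) from peer (w % nPeer).
__device__ void warpTiledGet(mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memChans,
                             size_t nPeer, size_t bytesPerGPU, size_t rank) {
  const size_t lid = threadIdx.x % WARP_SIZE;  // lane within the warp
  const size_t wid = (threadIdx.x + blockIdx.x * blockDim.x) / WARP_SIZE;  // global warp id
  const size_t unitBytesPerWarp = 16 * WARP_SIZE;  // one 16-byte element per lane
  const size_t peerIdx = wid % nPeer;
  // Peers below my rank keep their index; peers above it are shifted by one.
  const size_t remoteRankLocalIndex = (peerIdx < rank) ? peerIdx : peerIdx + 1;
  const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp;
  // 16-byte vectorized copy from the peer's memory into the local buffer.
  memChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
}
```

Striding `wid` by the total warp count (the `gWid = wid + i * nWarp` loops) then tiles the whole buffer without any warp touching two peers in the same iteration.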
@@ -367,11 +367,11 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
-  auto smChans = constSmChans + chanOffset;
+  auto memChans = constMemChans + chanOffset;

   if (wid < nPeer && lid == 0) {
-    smChans[wid].relaxedSignal();
-    smChans[wid].wait();
+    memChans[wid].relaxedSignal();
+    memChans[wid].wait();
   }
   __syncthreads();
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -390,14 +390,14 @@ __global__ void __launch_bounds__(1024, 1)
     // First loop unrolling
     const size_t peerIdx = wid % nPeer;
     const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   for (size_t i = 1; i < nLoop; ++i) {
     const size_t gWid = wid + i * nWarp;
     const size_t peerIdx = gWid % nPeer;
     const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   if (bytes % unitBytes > 0) {
@@ -409,7 +409,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE);
+      memChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE);
     }
   }
 }
@@ -426,7 +426,7 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t nThread = blockDim.x * nBlock;
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
-  auto smChans = constSmOutOfPlaceChans;
+  auto memChans = constMemOutOfPlaceChans;
   const uint32_t flag = (uint32_t)globalFlag;

   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -443,7 +443,7 @@ __global__ void __launch_bounds__(1024, 1)
     // First loop unrolling
     const size_t peerIdx = wid % nPeer;
     const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   if (nLoop > 0) {
@@ -451,14 +451,14 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = wid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   for (size_t i = 1; i < nLoop; ++i) {
     const size_t gWid = wid + i * nWarp;
     const size_t peerIdx = gWid % nPeer;
     const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   for (size_t i = 1; i < nLoop; ++i) {
@@ -466,7 +466,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = gWid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   if (bytes % unitBytes > 0) {
@@ -478,7 +478,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
+      memChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
     }
   }
   if (bytes % unitBytes > 0) {
@@ -491,7 +491,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
+      memChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
     }
   }
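The `putPackets()`/`getPackets()` calls above move data as LL (low-latency) packets: each 8 bytes of payload travels inside a 16-byte `mscclpp::LLPacket` holding two value/flag pairs, which is why the scratch offset is `offset * 2`. A minimal sketch of the packet primitive itself, assuming the `write`/`read` device methods of `mscclpp::LLPacket` and illustrative values:

```cpp
// A reader spins until the stored flags match the expected value, so the
// packet itself carries the synchronization; no separate signal/wait needed.
__device__ void writeAndReadOnePacket(mscclpp::LLPacket* scratch, uint32_t v0, uint32_t v1,
                                      uint32_t flag) {
  scratch->write(v0, v1, flag);      // publishes data + flag atomically (16 bytes)
  uint2 data = scratch->read(flag);  // spins until the flag matches, then returns data
  (void)data;
}
```

Flipping `flag` (via `globalFlag`) on every invocation means stale packets from the previous round can never satisfy the current read, so the scratch buffer never has to be cleared between launches.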
@@ -513,10 +513,10 @@ class AllGatherProxyService : public mscclpp::BaseProxyService {
     semaphores_.push_back(std::make_shared(communicator, connection));
     return semaphores_.size() - 1;
   }
-  std::vector> proxyChannels() {
-    std::vector> result;
+  std::vector> portChannels() {
+    std::vector> result;
     for (auto& semaphore : semaphores_) {
-      result.push_back(mscclpp::deviceHandle(mscclpp::BaseProxyChannel(0, semaphore, proxy_)));
+      result.push_back(mscclpp::deviceHandle(mscclpp::BasePortChannel(0, semaphore, proxy_)));
     }
     return result;
   }
@@ -705,8 +705,8 @@ class AllGatherTestEngine : public BaseTestEngine {
   std::shared_ptr sendBuff_;
   std::shared_ptr expectedBuff_;
   std::shared_ptr scratchPacketBuff_;
-  std::vector smChannels_;
-  std::vector smOutOfPlaceChannels_;
+  std::vector memoryChannels_;
+  std::vector memoryOutOfPlaceChannels_;
 };

 AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {}
@@ -723,42 +723,46 @@ void AllGatherTestEngine::allocateBuffer() {
 }

 void AllGatherTestEngine::setupConnections() {
-  std::vector> devProxyChannels;
+  std::vector> devPortChannels;
   if (!isUsingHostOffload(args_.kernelNum)) {
-    setupMeshConnections(devProxyChannels, sendBuff_.get(), args_.maxBytes);
-    if (devProxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(devPortChannels, sendBuff_.get(), args_.maxBytes);
+    if (devPortChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle)) {
       std::runtime_error("unexpected error");
     }
-    CUDATHROW(cudaMemcpyToSymbol(constProxyChans, devProxyChannels.data(),
-                                 sizeof(DeviceHandle) * devProxyChannels.size()));
+    CUDATHROW(cudaMemcpyToSymbol(constPortChans, devPortChannels.data(),
+                                 sizeof(DeviceHandle) * devPortChannels.size()));

-    setupMeshConnections(smChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 64);
-    std::vector> smChannelHandles(smChannels_.size());
-    if (smChannels_.size() > sizeof(constSmChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(memoryChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 64);
+    std::vector> memoryChannelHandles(memoryChannels_.size());
+    if (memoryChannels_.size() > sizeof(constMemChans) / sizeof(DeviceHandle)) {
       std::runtime_error("unexpected error");
     }
-    std::transform(smChannels_.begin(), smChannels_.end(), smChannelHandles.begin(),
-                   [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
-    CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelHandles.size()));
+    std::transform(memoryChannels_.begin(), memoryChannels_.end(), memoryChannelHandles.begin(),
+                   [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); });
+    CUDATHROW(cudaMemcpyToSymbol(constMemChans, memoryChannelHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelHandles.size()));

     if (args_.kernelNum == 7) {
       const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
       const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket);
-      setupMeshConnections(smOutOfPlaceChannels_, sendBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
+      setupMeshConnections(memoryOutOfPlaceChannels_, sendBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
                            scratchPacketBuffBytes);
-      std::vector> smOutOfPlaceChannelHandles(smOutOfPlaceChannels_.size());
-      if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) {
+      std::vector> memoryOutOfPlaceChannelHandles(
+          memoryOutOfPlaceChannels_.size());
+      if (memoryOutOfPlaceChannels_.size() >
+          sizeof(constMemOutOfPlaceChans) / sizeof(DeviceHandle)) {
         std::runtime_error("unexpected error");
       }
-      std::transform(smOutOfPlaceChannels_.begin(), smOutOfPlaceChannels_.end(), smOutOfPlaceChannelHandles.begin(),
-                     [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
-      CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smOutOfPlaceChannelHandles.data(),
-                                   sizeof(DeviceHandle) * smOutOfPlaceChannelHandles.size()));
+      std::transform(memoryOutOfPlaceChannels_.begin(), memoryOutOfPlaceChannels_.end(),
+                     memoryOutOfPlaceChannelHandles.begin(),
+                     [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); });
+      CUDATHROW(
+          cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryOutOfPlaceChannelHandles.data(),
                              sizeof(DeviceHandle) * memoryOutOfPlaceChannelHandles.size()));
     }
   } else {
     auto service = std::dynamic_pointer_cast(chanService_);
-    setupMeshConnections(devProxyChannels, sendBuff_.get(), args_.maxBytes, nullptr, 0,
+    setupMeshConnections(devPortChannels, sendBuff_.get(), args_.maxBytes, nullptr, 0,
                          [&](std::vector> conns,
                              std::vector>& remoteMemories,
                              const mscclpp::RegisteredMemory& localMemory) {
@@ -770,12 +774,12 @@ void AllGatherTestEngine::setupConnections() {
                            service->setLocalMemory(localMemory);
                            comm_->setup();
                          });
-    auto proxyChannels = service->proxyChannels();
-    if (proxyChannels.size() > sizeof(constRawProxyChan) / sizeof(DeviceHandle)) {
+    auto portChannels = service->portChannels();
+    if (portChannels.size() > sizeof(constRawPortChan) / sizeof(DeviceHandle)) {
       std::runtime_error("unexpected error");
     }
-    CUDATHROW(cudaMemcpyToSymbol(constRawProxyChan, proxyChannels.data(),
-                                 sizeof(DeviceHandle) * proxyChannels.size()));
+    CUDATHROW(cudaMemcpyToSymbol(constRawPortChan, portChannels.data(),
+                                 sizeof(DeviceHandle) * portChannels.size()));
   }
 }
diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu
index b7632a83d..34f1430db 100644
--- a/test/mscclpp-test/allreduce_test.cu
+++ b/test/mscclpp-test/allreduce_test.cu
@@ -13,12 +13,12 @@
 template
 using DeviceHandle = mscclpp::DeviceHandle;

-__constant__ DeviceHandle constDevFstRoundChans[16];
-__constant__ DeviceHandle constDevSndRoundChans[16];
+__constant__ DeviceHandle constDevFstRoundChans[16];
+__constant__ DeviceHandle constDevSndRoundChans[16];

-__constant__ DeviceHandle constSmInPlaceChans[8];
-__constant__ DeviceHandle constSmOutOfPlaceChans[8];
-__constant__ DeviceHandle constSmOutOfPlaceGetChans[8];
+__constant__ DeviceHandle constMemInPlaceChans[8];
+__constant__ DeviceHandle constMemOutOfPlaceChans[8];
+__constant__ DeviceHandle constMemOutOfPlaceGetChans[8];

 __device__ uint64_t globalFlag;

 // TODO(chhwang): need an interface for this.
@@ -94,8 +94,8 @@ __device__ void localReduceScatter(int* buff, int* scratch, int rank, int nRanks
   int peerSendId = (remoteSendToRank < rank) ? remoteSendToRank : remoteSendToRank - 1;
   int peerRecvId = (remoteRecvFromRank < rank) ? remoteRecvFromRank : remoteRecvFromRank - 1;
-  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
-  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
+  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
+  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
   size_t srcOffset =
       (((rankIndexInNode + i) % nRanksPerNode + startChunkIndex) * chunkSize + offsetInChunk) * sizeof(int);
   size_t dstOffset = rank * chunkSize * sizeof(int);
@@ -110,7 +110,7 @@ __device__ void localReduceScatter(int* buff, int* scratch, int rank, int nRanks
     int prePeerRecvId = (preRemoteRecvFromRank < rank) ? preRemoteRecvFromRank : preRemoteRecvFromRank - 1;
     // overlap communication and computation
-    DeviceHandle& preDevFstRecvChan = constDevFstRoundChans[prePeerRecvId];
+    DeviceHandle& preDevFstRecvChan = constDevFstRoundChans[prePeerRecvId];
     if (isComm) {
       preDevFstRecvChan.wait();
       devFstSendChan.putWithSignal(dstOffset, srcOffset, nelems * sizeof(int));
@@ -157,7 +157,7 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   int peerNodeId = peerRank / nRanksPerNode;
   int isComm = (threadIdx.x == 0) && (blockIdx.x == 0);
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constDevFstRoundChans[peer];
+  DeviceHandle& portChan = constDevFstRoundChans[peer];
   if (peerNodeId == rank / nRanksPerNode) {
     localReduceScatter(buff, scratch, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize);
     return;
@@ -172,12 +172,12 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   if (isComm) {
     size_t offset = (peerRank * chunkSize) * sizeof(int);
     // opposite side
-    proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
+    portChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
   }
   localReduceScatter(buff, scratch, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
                      2 * chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   deviceSyncer.sync(gridDim.x);
   // reduce data received from peer to related rank
@@ -186,7 +186,7 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   int* src = (int*)((char*)scratch + offset);
   vectorSum(dst, src, chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);

@@ -194,11 +194,11 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode;
   if (isComm) {
     size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int);
-    proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
+    portChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
   }
   localReduceScatter(buff, scratch, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   deviceSyncer.sync(gridDim.x);
   // reduce to related rank
@@ -207,7 +207,7 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   src = (int*)((char*)scratch + offset);
   vectorSum(dst, src, 2 * chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
 }
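The `putWithSignal`/`wait`/`flush` triple used throughout `reduceScatter` is the core PortChannel handshake. A minimal sketch of its semantics, with the channel handle passed in and `deviceSyncer` assumed from the surrounding test file:

```cpp
// putWithSignal() asks the proxy to copy `bytes` at `offset` and then signal
// the peer; wait() blocks until the *peer's* signal arrives; flush() blocks
// until my own outstanding requests are on the wire, so the source buffer can
// be safely reused. Names are illustrative.
__device__ void exchangeChunk(mscclpp::DeviceHandle<mscclpp::PortChannel>& chan,
                              size_t offset, size_t bytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    chan.putWithSignal(offset, bytes);  // send my chunk and signal the peer
    chan.wait();                        // the peer's symmetric chunk has arrived
    chan.flush();                       // my put has completed locally
  }
  deviceSyncer.sync(gridDim.x);         // publish the arrival to all blocks
}
```

Because only one thread issues the channel calls, the kernels above gate them on an `isComm` predicate and let the remaining threads do the local reduction in parallel.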
@@ -228,8 +228,8 @@ __device__ void localAllGather(int rank, int nRanksPerNode, uint64_t offset, uin
   int peerSendId = (remoteSendToRank < rank) ? remoteSendToRank : remoteSendToRank - 1;
   int peerRecvId = (remoteRecvFromRank < rank) ? remoteRecvFromRank : remoteRecvFromRank - 1;
-  DeviceHandle& devSendChan = constDevSndRoundChans[peerSendId];
-  DeviceHandle& devRecvChan = constDevSndRoundChans[peerRecvId];
+  DeviceHandle& devSendChan = constDevSndRoundChans[peerSendId];
+  DeviceHandle& devRecvChan = constDevSndRoundChans[peerRecvId];
   // wait for the data from GPU (rank-i) % nranksPerNode to arrive
   devSendChan.putWithSignal(offset, size);
   devRecvChan.wait();
@@ -252,7 +252,7 @@ __device__ void allGather(int rank, int worldSize, int nRanksPerNode, size_t nel
   int peerRank = (rank + nRanksPerNode) % worldSize;
   int peerNodeId = peerRank / nRanksPerNode;
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constDevSndRoundChans[peer];
+  DeviceHandle& portChan = constDevSndRoundChans[peer];

   if (peerNodeId == rank / nRanksPerNode) {
     localAllGather(rank, nRanksPerNode, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
@@ -260,30 +260,30 @@ __device__ void allGather(int rank, int worldSize, int nRanksPerNode, size_t nel
   }

   // Step 1
-  proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int),
-                          (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
+  portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int),
+                         (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
   localAllGather(rank, nRanksPerNode, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
-  proxyChan.wait();
-  proxyChan.flush();
+  portChan.wait();
+  portChan.flush();
   // Step 2
-  proxyChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
-                          nelemsPerGPU / pipelineSize * sizeof(int));
+  portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
+                         nelemsPerGPU / pipelineSize * sizeof(int));
   localAllGather(rank, nRanksPerNode, peerRank * nelemsPerGPU * sizeof(int),
                  (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
-  proxyChan.wait();
-  proxyChan.flush();
+  portChan.wait();
+  portChan.flush();
   // Step 3
   localAllGather(rank, nRanksPerNode,
                  (peerRank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
                  nelemsPerGPU / pipelineSize * sizeof(int));
 }

-__device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int startChunkIndex, size_t offsetInChunk,
-                                     size_t chunkSize, size_t nelems, int nBlocks) {
+__device__ void localReduceScatterMem(int* buff, int rank, int nRanksPerNode, int startChunkIndex, size_t offsetInChunk,
+                                      size_t chunkSize, size_t nelems, int nBlocks) {
   if (nRanksPerNode == 1) return;
   if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
-  DeviceHandle* smChans = constSmOutOfPlaceGetChans;
+  DeviceHandle* memChans = constMemOutOfPlaceGetChans;

   const size_t localRankIndexInNode = rank % nRanksPerNode;
   const size_t indexOffset = ((localRankIndexInNode + startChunkIndex) * chunkSize + offsetInChunk);
@@ -292,10 +292,10 @@ __device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int
   int4* buff4 = (int4*)buff;

   for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) {
-    smChans[peerIdx].signal();
+    memChans[peerIdx].signal();
   }
   for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) {
-    smChans[peerIdx].wait();
+    memChans[peerIdx].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);

@@ -304,7 +304,7 @@ __device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int
     int4 sum = make_int4(0, 0, 0, 0);

     for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-      int4 val = smChans[peerIdx].read(indexOffset4 + idx);
+      int4 val = memChans[peerIdx].read(indexOffset4 + idx);
       sum.w += val.w;
       sum.x += val.x;
       sum.y += val.y;
@@ -320,19 +320,19 @@ __device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int
   for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
     int sum = 0;
     for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-      int val = smChans[peerIdx].read(indexOffset + nInt4 * 4 + idx);
+      int val = memChans[peerIdx].read(indexOffset + nInt4 * 4 + idx);
       sum += val;
     }
     buff[indexOffset + nInt4 * 4 + idx] += sum;
   }
 }

-__device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
-                                      int nBlocks) {
+__device__ void localReduceScatterMem2(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
+                                       int nBlocks) {
   if (nRanksPerNode == 1) return;
   if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
-  DeviceHandle* smChans = constSmOutOfPlaceGetChans;
+  DeviceHandle* memChans = constMemOutOfPlaceGetChans;

   const size_t localRankIndexInNode = rank % nRanksPerNode;
   const size_t indexOffset = localRankIndexInNode * chunkSize;
@@ -342,11 +342,11 @@ __device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, si

   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < nPeer) {
-    smChans[tid].signal();
+    memChans[tid].signal();
   }
   const int waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
-    smChans[tid - waitStart].wait();
+    memChans[tid - waitStart].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);

@@ -355,7 +355,7 @@ __device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, si
     int4 val;
     int peerIdx = (index + localRankIndexInNode) % nPeer;
     for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
-      val = smChans[peerIdx].read(indexOffset4 + idx);
+      val = memChans[peerIdx].read(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
       buff4[indexOffset4 + idx].y += val.y;
@@ -366,18 +366,18 @@ __device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, si
   const size_t nLastInts = nelems % 4;
   for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
     for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
-      int val = smChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
+      int val = memChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
       buff[indexOffset + nInt4 * 4 + idx] += val;
     }
   }
 }
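The memory-channel reduce-scatter variants above all share one idiom: a lightweight signal/wait barrier, then each thread reads the peer's copy of an element and accumulates it locally. A condensed sketch, assuming the templated `read<T>()` device method (the template argument is elided in the diff text) and illustrative parameter names:

```cpp
// After the signal/wait handshake guarantees all peers' data is published,
// each thread pulls one int4 per peer and folds it into the local buffer.
__device__ void readReduceInt4(mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memChans,
                               int nPeer, int4* buff4, size_t indexOffset4, size_t nInt4) {
  for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4;
       idx += blockDim.x * gridDim.x) {
    int4 sum = buff4[indexOffset4 + idx];
    for (int p = 0; p < nPeer; ++p) {
      int4 val = memChans[p].read<int4>(indexOffset4 + idx);  // element index, not bytes
      sum.x += val.x; sum.y += val.y; sum.z += val.z; sum.w += val.w;
    }
    buff4[indexOffset4 + idx] = sum;
  }
}
```

Offsetting the starting peer by `localRankIndexInNode`, as `localReduceScatterMem2` does, staggers which remote GPU each rank reads first and avoids all ranks hammering the same peer simultaneously.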
-__device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
-                                      int nBlocks) {
+__device__ void localReduceScatterMem3(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
+                                       int nBlocks) {
   if (nRanksPerNode == 1) return;
   if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
-  DeviceHandle* smChans = constSmOutOfPlaceGetChans;
+  DeviceHandle* memChans = constMemOutOfPlaceGetChans;

   const size_t localRankIndexInNode = rank % nRanksPerNode;
   const size_t indexOffset = localRankIndexInNode * chunkSize;
@@ -387,11 +387,11 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si

   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < nPeer) {
-    smChans[tid].signal();
+    memChans[tid].signal();
   }
   const int waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
-    smChans[tid - waitStart].wait();
+    memChans[tid - waitStart].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);

@@ -405,7 +405,7 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si
     int peerIdx = (index + localRankIndexInNode) % nPeer;
     for (size_t idx = base + threadIdx.x + blockIdx.x * blockDim.x; idx < base + unitNInt4;
          idx += blockDim.x * nBlocks) {
-      val = smChans[peerIdx].read(indexOffset4 + idx);
+      val = memChans[peerIdx].read(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
       buff4[indexOffset4 + idx].y += val.y;
@@ -417,7 +417,7 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si
     int4 val;
     int peerIdx = (index + localRankIndexInNode) % nPeer;
     for (size_t idx = base + threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
-      val = smChans[peerIdx].read(indexOffset4 + idx);
+      val = memChans[peerIdx].read(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
       buff4[indexOffset4 + idx].y += val.y;
@@ -428,14 +428,14 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si
   const size_t nLastInts = nelems % 4;
   for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
     for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
-      int val = smChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
+      int val = memChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
       buff[indexOffset + nInt4 * 4 + idx] += val;
     }
   }
 }

-__device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize,
-                                size_t nelems  // must be divisible by 3
+__device__ void reduceScatterMem(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize,
+                                 size_t nelems  // must be divisible by 3
 ) {
   // this reduce-scatter algorithm works as follows:
   // Step 1: each node does a local reduce-scatter on peer node data chunks with 1/pipeline portion of chunk data. For
@@ -457,28 +457,28 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   int isComm = (threadIdx.x == 0) && ((int)blockIdx.x == nBlocksForReduceScatter);
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
   int nBlocksRemain = gridDim.x - nBlocksForReduceScatter;
-  DeviceHandle& proxyChan = constDevFstRoundChans[peer];
+  DeviceHandle& portChan = constDevFstRoundChans[peer];
   if (peerNodeId == rank / nRanksPerNode) {
-    localReduceScatterSm(buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x);
+    localReduceScatterMem(buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x);
     return;
   }

   // step 1: local reduce
   int startChunkIndex = peerNodeId * nRanksPerNode;
-  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize,
-                       nBlocksForReduceScatter);
+  localReduceScatterMem(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize,
+                        nBlocksForReduceScatter);
   deviceSyncer.sync(gridDim.x);

   // step 2: local reduce and exchange data with neighbor
   if (isComm) {
     size_t offset = (peerRank * chunkSize) * sizeof(int);
     // opposite side
-    proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
+    portChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
   }
-  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
-                       2 * chunkSize / pipelineSize, nBlocksForReduceScatter);
+  localReduceScatterMem(buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
+                        2 * chunkSize / pipelineSize, nBlocksForReduceScatter);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   if ((int)blockIdx.x >= nBlocksForReduceScatter) {
     ibDeviceSyncer.sync(nBlocksRemain);
@@ -489,7 +489,7 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
     vectorSum(dst, src, chunkSize / pipelineSize, blockIdx.x - nBlocksForReduceScatter, nBlocksRemain);
   }
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);

@@ -497,11 +497,11 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode;
   if (isComm) {
     size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int);
-    proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
+    portChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
   }
-  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, nBlocksForReduceScatter);
+  localReduceScatterMem(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, nBlocksForReduceScatter);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   deviceSyncer.sync(gridDim.x);
   // reduce to related rank, cannot overlap since localReduceScatter also calculates the sum
@@ -510,13 +510,13 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   int* src = (int*)((char*)scratch + offset);
   vectorSum(dst, src, 2 * chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
 }
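To make the pipelined split in `reduceScatterMem` concrete, here is a worked example of the chunk arithmetic under assumed sizes (worldSize, nelems, and pipelineSize are illustrative; the divisibility requirement comes from the `// must be divisible by 3` comment above):

```cpp
// Step 2 sends 1/3 of the peer chunk while the local reduce runs; step 3
// sends the remaining 2/3. The static_assert checks the split is exact.
constexpr size_t nelems = 48 * 1024 * 1024;
constexpr size_t worldSize = 16;
constexpr size_t pipelineSize = 3;
constexpr size_t chunkSize = nelems / worldSize;            // one rank's share
constexpr size_t firstSend = chunkSize / pipelineSize;      // overlapped with local reduce
constexpr size_t secondSend = (pipelineSize - 1) * chunkSize / pipelineSize;
static_assert(firstSend + secondSend == chunkSize, "the two sends cover the peer chunk");
```

This is the same 1/3 + 2/3 split that appears in the two `putWithSignal` offsets above, just evaluated at compile time.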
 // This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1).
-__device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk,
-                                 uint64_t rankChunkSize, uint64_t size, size_t nBlocks) {
+__device__ void localAllGatherMem(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk,
+                                  uint64_t rankChunkSize, uint64_t size, size_t nBlocks) {
   if (nRanksPerNode == 1) return;
   if (blockIdx.x >= nBlocks) return;
   const size_t nPeer = nRanksPerNode - 1;
@@ -552,15 +552,15 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
     sizeForThisBlock += lastChunkSize;
   }
   if (threadIdx.x == 0 && peerLocalBlockIdx == 0) {
-    constSmInPlaceChans[peerIdx].signal();
-    constSmInPlaceChans[peerIdx].wait();
+    constMemInPlaceChans[peerIdx].signal();
+    constMemInPlaceChans[peerIdx].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
   size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk;
-  constSmInPlaceChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
+  constMemInPlaceChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
 }

-__device__ void localRingAllGatherSm(int rank, int nRanksPerNode, uint64_t size, size_t nBlocks) {
+__device__ void localRingAllGatherMem(int rank, int nRanksPerNode, uint64_t size, size_t nBlocks) {
   if (nRanksPerNode == 1) return;
   if (blockIdx.x >= nBlocks) return;

@@ -568,22 +568,22 @@ __device__ void localRingAllGatherSm(int rank, int nRanksPerNode, uint64_t size,
   const int nPeer = nRanksPerNode - 1;

   if (tid < nPeer) {
-    constSmInPlaceChans[tid].signal();
+    constMemInPlaceChans[tid].signal();
   }
   int waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
-    constSmInPlaceChans[tid - waitStart].wait();
+    constMemInPlaceChans[tid - waitStart].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
   for (int i = 0; i < nPeer; ++i) {
     int peerIdx = (i + rank) % nPeer;
     const int remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     size_t offset = size * remoteRankLocalIndex;
-    constSmInPlaceChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks);
+    constMemInPlaceChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks);
   }
 }

-__device__ void localRingAllGatherSm2(size_t rank, size_t nRanksPerNode, size_t size, size_t nBlocks) {
+__device__ void localRingAllGatherMem2(size_t rank, size_t nRanksPerNode, size_t size, size_t nBlocks) {
   if (nRanksPerNode == 1) return;
   if (blockIdx.x >= nBlocks) return;

@@ -591,11 +591,11 @@ __device__ void localRingAllGatherSm2(size_t rank, size_t nRanksPerNode, size_t
   const size_t nPeer = nRanksPerNode - 1;

   if (tid < nPeer) {
-    constSmInPlaceChans[tid].signal();
+    constMemInPlaceChans[tid].signal();
   }
   size_t waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < nBlocks * blockDim.x) {
-    constSmInPlaceChans[tid - waitStart].wait();
+    constMemInPlaceChans[tid - waitStart].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
   const size_t unitSize = 16 * blockDim.x * nBlocks;
@@ -605,19 +605,19 @@ __device__ void localRingAllGatherSm2(size_t rank, size_t nRanksPerNode, size_t
       size_t peerIdx = (i + rank) % nPeer;
       const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
       size_t offset = size * remoteRankLocalIndex + base;
-      constSmInPlaceChans[peerIdx].get(offset, unitSize, tid, blockDim.x * nBlocks);
+      constMemInPlaceChans[peerIdx].get(offset, unitSize, tid, blockDim.x * nBlocks);
     }
   }
   for (size_t i = 0; i < nPeer; ++i) {
     size_t peerIdx = (i + rank) % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     size_t offset = size * remoteRankLocalIndex + base;
-    constSmInPlaceChans[peerIdx].get(offset, size - base, tid, blockDim.x * nBlocks);
+    constMemInPlaceChans[peerIdx].get(offset, size - base, tid, blockDim.x * nBlocks);
   }
 }

 // This is an allgather4 equivalent
-__device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__device__ void allGatherMem(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   // this allgather is a pipelined and hierarchical one and only works for two nodes
   // it is implemented as follows:
   // Step 1: each node does a local allgather and concurrently,
@@ -632,14 +632,14 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
   int peerRank = (rank + nRanksPerNode) % worldSize;
   int peerNodeId = peerRank / nRanksPerNode;
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constDevSndRoundChans[peer];
+  DeviceHandle& portChan = constDevSndRoundChans[peer];
   const size_t nBlocksForLocalAllGather = gridDim.x / (nRanksPerNode - 1) * (nRanksPerNode - 1);
   const size_t rankChunkSize = nelemsPerGPU * sizeof(int);
   const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode;
   const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode;

   if (peerNodeId == rank / nRanksPerNode) {
-    localAllGatherSm(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x);
+    localAllGatherMem(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x);
     return;
   }
@@ -650,29 +650,29 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n

   // Step 1
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);
   // Step 2
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);
   // Step 3
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
+                    nBlocksForLocalAllGather);
 }

 __global__ void __launch_bounds__(1024)
@@ -682,7 +682,7 @@ __global__ void __launch_bounds__(1024)
   int remoteRank = (peerId < rank) ? peerId : peerId + 1;

   // 1st communication phase: send data to the scratch buffer of the peer associated with this block
-  DeviceHandle& devFstRoundChan = constDevFstRoundChans[peerId];
+  DeviceHandle& devFstRoundChan = constDevFstRoundChans[peerId];
   Chunk toPeerChunk = getChunk(nelems, worldSize, remoteRank);
   // Now we need to figure out the offset of this chunk in the scratch buffer of the destination.
   // The destination will have allocated a scratch buffer of size numPeers() * toPeerChunk.size and
@@ -700,7 +700,7 @@ __global__ void __launch_bounds__(1024)
   deviceSyncer.sync(gridDim.x);

   // Local reduction: every block reduces a slice of each chunk in the scratch buffer into the user buffer
-  DeviceHandle& devSndRoundChan = constDevSndRoundChans[peerId];
+  DeviceHandle& devSndRoundChan = constDevSndRoundChans[peerId];
   Chunk rankChunk = getChunk(nelems, worldSize, rank);
   int* chunk = buff + rankChunk.offset;
   int numPeers = gridDim.x / BLOCKS_PER_PEER;
@@ -734,10 +734,10 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
   int peerSendId = (remoteSendRank < rank) ? remoteSendRank : remoteSendRank - 1;
   int peerRecvId = (remoteRecvRank < rank) ? remoteRecvRank : remoteRecvRank - 1;

-  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
-  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
-  DeviceHandle& devSndSendChan = constDevSndRoundChans[peerSendId];
-  DeviceHandle& devSndRecvChan = constDevSndRoundChans[peerRecvId];
+  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
+  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
+  DeviceHandle& devSndSendChan = constDevSndRoundChans[peerSendId];
+  DeviceHandle& devSndRecvChan = constDevSndRoundChans[peerRecvId];

   // Step 1
   size_t chunkIndex = (rank + worldSize - 1) % worldSize;
@@ -846,12 +846,12 @@ __global__ void __launch_bounds__(1024)
   size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);

   // Channel to a local peer
-  int smChanIdx = blockIdx.x / BLOCKS_PER_PEER;
-  DeviceHandle smChan = constSmOutOfPlaceChans[smChanIdx];
+  int memChanIdx = blockIdx.x / BLOCKS_PER_PEER;
+  DeviceHandle memChan = constMemOutOfPlaceChans[memChanIdx];

   // Channel to a remote peer that has the same local rank as me
   int localRank = rank % nRanksPerNode;
-  DeviceHandle proxyChan = constDevFstRoundChans[localRank];
+  DeviceHandle portChan = constDevFstRoundChans[localRank];

   // Flag for packets. Initially 1
   uint32_t flag = (uint32_t)globalFlag;
@@ -876,11 +876,11 @@ __global__ void __launch_bounds__(1024)
   size_t srcOffset =
       ((blockIdx.x % BLOCKS_PER_PEER) * nelems * sizeof(int) / BLOCKS_PER_PEER);  // offset for this block
   // Offset of the peer's scratch buffer (scratch) to write on
-  size_t dstOffset = (scratchOffset) +                                                   // double buffering
-                     ((smChanIdx < localRank ? localRank - 1 : localRank) * pktBytes) +  // offset for this rank
+  size_t dstOffset = (scratchOffset) +                                                    // double buffering
+                     ((memChanIdx < localRank ? localRank - 1 : localRank) * pktBytes) +  // offset for this rank
                      (srcOffset * 2);  // offset for this block: twice of srcOffset because 2 elems per packet
   // Write data to the peer's scratch
-  smChan.putPackets(dstOffset, srcOffset, nelems / BLOCKS_PER_PEER * sizeof(int), threadIdx.x, blockDim.x, flag);
+  memChan.putPackets(dstOffset, srcOffset, nelems / BLOCKS_PER_PEER * sizeof(int), threadIdx.x, blockDim.x, flag);
   // Read data from my scratch, reduce data with my buff, and write the result to my putPktBuf or to result
   const bool isSingleNode = (worldSize == nRanksPerNode);
   for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPkts; idx += blockDim.x * gridDim.x) {
@@ -924,9 +924,9 @@ __global__ void __launch_bounds__(1024)

   // Write my putPktBuf to the remote peer's getPktBuf
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.put(pktBufOffset, pktBytes);
+    portChan.put(pktBufOffset, pktBytes);
     if ((flag & 63) == 0) {
-      proxyChan.flush();
+      portChan.flush();
     }
   }
@@ -954,21 +954,21 @@ __global__ void __launch_bounds__(1024)

 __global__ void __launch_bounds__(1024) allreduce4(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize,
                                                    size_t nelems) {
-  reduceScatterSm(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
+  reduceScatterMem(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
   deviceSyncer.sync(gridDim.x);
-  allGatherSm(rank, worldSize, nRanksPerNode, nelems / worldSize);
+  allGatherMem(rank, worldSize, nRanksPerNode, nelems / worldSize);
 }

 __global__ void __launch_bounds__(1024)
     allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
 #if defined(__HIP_PLATFORM_AMD__)
-  localReduceScatterSm3(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
+  localReduceScatterMem3(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
   deviceSyncer.sync(gridDim.x);
-  localRingAllGatherSm2(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
+  localRingAllGatherMem2(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
 #else
-  localReduceScatterSm2(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
+  localReduceScatterMem2(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
   deviceSyncer.sync(gridDim.x);
-  localRingAllGatherSm(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
+  localRingAllGatherMem(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
 #endif
 }
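The `// double buffering` comment in `allreduce2` refers to alternating scratch halves between rounds, keyed off `globalFlag`. One simple way to derive the scratch half from the flag, written as an assumption for illustration rather than the exact expression the tests use:

```cpp
// Odd rounds target the upper half of the packet scratch, even rounds the
// lower half. Stale packets from the previous round carry the old flag and
// sit in the other half, so they can never satisfy the current read.
// `scratchHalfBytes` is an assumed name for half the scratch size.
__device__ size_t packetScratchOffset(uint32_t flag, size_t scratchHalfBytes) {
  return (flag & 1) ? scratchHalfBytes : 0;
}
```

The periodic `flush()` on `(flag & 63) == 0` bounds how many proxy puts can be outstanding without paying a flush on every round.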
@@ -998,8 +998,8 @@ __global__ void __launch_bounds__(1024)
   uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));

   // step 1: write to scratch buffer
-  constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
-                                             blockDim.x * nBlocksPerPeer, flag);
+  constMemOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
+                                              blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     uint2 data = make_uint2(0, 0);
@@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(1024)
     packet.flag2 = flag;
     size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
-      constSmOutOfPlaceChans[index].write(offset, packet);
+      constMemOutOfPlaceChans[index].write(offset, packet);
     }
   }
   // step 3: get data result from scratch buffer
@@ -1064,8 +1064,8 @@ __global__ void __launch_bounds__(1024)
   uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));

   // step 1: write to scratch buffer
-  constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int),
-                                             tid, blockDim.x * nBlocksPerPeer, flag);
+  constMemOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int),
+                                              tid, blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     uint32_t data = 0;
@@ -1083,7 +1083,7 @@ __global__ void __launch_bounds__(1024)
     packet.flag = flag;
     size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
-      constSmOutOfPlaceChans[index].write(offset, packet);
+      constMemOutOfPlaceChans[index].write(offset, packet);
     }
   }
   // step 3: get data result from scratch buffer
@@ -1255,9 +1255,9 @@ class AllReduceTestEngine : public BaseTestEngine {
   std::shared_ptr putPacketBuff_;
   std::shared_ptr getPacketBuff_;
   std::shared_ptr expectedBuff_;
-  std::vector smOutOfPlaceChannels_;
-  std::vector smInPlaceChannels_;
-  std::vector smOutOfPlaceGetChannels_;
+  std::vector memoryOutOfPlaceChannels_;
+  std::vector memoryInPlaceChannels_;
+  std::vector memoryOutOfPlaceGetChannels_;
 };

 AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args, "allreduce") {
@@ -1304,94 +1304,98 @@ void AllReduceTestEngine::allocateBuffer() {
 }

 void AllReduceTestEngine::setupConnections() {
-  auto getChannelDeviceHandle = [](const std::vector& in,
-                                   std::vector>& out) {
-    return std::transform(in.begin(), in.end(), out.begin(),
-                          [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
+  auto getChannelDeviceHandle = [](const std::vector& in,
+                                   std::vector>& out) {
+    return std::transform(in.begin(), in.end(), out.begin(), [](const mscclpp::MemoryChannel& memoryChannel) {
+      return mscclpp::deviceHandle(memoryChannel);
+    });
   };
   if (isUsePacket()) {
-    std::vector> proxyChannels;
+    std::vector> portChannels;

     const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
     if (args_.kernelNum == 6 || args_.kernelNum == 7) {
       const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket);
-      setupMeshConnections(smOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
+      setupMeshConnections(memoryOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
                            scratchPacketBuffBytes);
-      std::vector> smChannelDeviceHandles(smOutOfPlaceChannels_.size());
-      getChannelDeviceHandle(smOutOfPlaceChannels_, smChannelDeviceHandles);
-      CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smChannelDeviceHandles.data(),
-                                   sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
+      std::vector> memoryChannelDeviceHandles(memoryOutOfPlaceChannels_.size());
+      getChannelDeviceHandle(memoryOutOfPlaceChannels_, memoryChannelDeviceHandles);
+      CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryChannelDeviceHandles.data(),
+                                   sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
     }
     if (args_.kernelNum == 2) {
       const size_t scratchPacketBuffBytes =
           nPacket * std::max(args_.nRanksPerNode - 1, 1) * 2 * sizeof(mscclpp::LLPacket);
       const size_t packetBuffBytes = nPacket * 2 * sizeof(mscclpp::LLPacket);
-      setupMeshConnections(smOutOfPlaceChannels_, proxyChannels, inputBuff_.get(), args_.maxBytes, putPacketBuff_.get(),
-                           packetBuffBytes, getPacketBuff_.get(), packetBuffBytes, scratchPacketBuff_.get(),
-                           scratchPacketBuffBytes);
+      setupMeshConnections(memoryOutOfPlaceChannels_, portChannels, inputBuff_.get(), args_.maxBytes,
                           putPacketBuff_.get(), packetBuffBytes, getPacketBuff_.get(), packetBuffBytes,
                           scratchPacketBuff_.get(), scratchPacketBuffBytes);

-      if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) {
+      if (memoryOutOfPlaceChannels_.size() >
+          sizeof(constMemOutOfPlaceChans) / sizeof(DeviceHandle)) {
        std::runtime_error("unexpected error");
       }
-      if (proxyChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
+      if (portChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
        std::runtime_error("unexpected error");
       }
-      std::vector> smChannelDeviceHandles(smOutOfPlaceChannels_.size());
-      getChannelDeviceHandle(smOutOfPlaceChannels_, smChannelDeviceHandles);
-      CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smChannelDeviceHandles.data(),
-                                   sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
-      CUDATHROW(cudaMemcpyToSymbol(constDevFstRoundChans, proxyChannels.data(),
-                                   sizeof(DeviceHandle) * proxyChannels.size()));
+      std::vector> memoryChannelDeviceHandles(memoryOutOfPlaceChannels_.size());
+      getChannelDeviceHandle(memoryOutOfPlaceChannels_, memoryChannelDeviceHandles);
+      CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryChannelDeviceHandles.data(),
+                                   sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
+      CUDATHROW(cudaMemcpyToSymbol(constDevFstRoundChans, portChannels.data(),
+                                   sizeof(DeviceHandle) * portChannels.size()));
     }
   } else {
-    std::vector> fstRoundChannels;
-    std::vector> sndRoundChannels;
+    std::vector> fstRoundChannels;
+    std::vector> sndRoundChannels;

     // Send data from local inputBuff to remote scratchBuff (out-of-place)
     setupMeshConnections(fstRoundChannels, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes);
-    if (fstRoundChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
+    if (fstRoundChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
     CUDATHROW(cudaMemcpyToSymbol(constDevFstRoundChans, fstRoundChannels.data(),
-                                 sizeof(DeviceHandle) * fstRoundChannels.size()));
+                                 sizeof(DeviceHandle) * fstRoundChannels.size()));

     // Send data from local inputBuff to remote inputBuff (in-place)
     setupMeshConnections(sndRoundChannels, inputBuff_.get(), args_.maxBytes);
-    if (sndRoundChannels.size() > sizeof(constDevSndRoundChans) / sizeof(DeviceHandle)) {
+    if (sndRoundChannels.size() > sizeof(constDevSndRoundChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
     CUDATHROW(cudaMemcpyToSymbol(constDevSndRoundChans, sndRoundChannels.data(),
-                                 sizeof(DeviceHandle) * sndRoundChannels.size()));
+                                 sizeof(DeviceHandle) * sndRoundChannels.size()));

-    setupMeshConnections(smOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes);
-    if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(memoryOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(),
+                         args_.maxBytes);
+    if (memoryOutOfPlaceChannels_.size() >
+        sizeof(constMemOutOfPlaceChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
-    std::vector> smChannelDeviceHandles(smOutOfPlaceChannels_.size());
-    getChannelDeviceHandle(smOutOfPlaceChannels_, smChannelDeviceHandles);
-    CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smChannelDeviceHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
+    std::vector> memoryChannelDeviceHandles(memoryOutOfPlaceChannels_.size());
+    getChannelDeviceHandle(memoryOutOfPlaceChannels_, memoryChannelDeviceHandles);
+    CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryChannelDeviceHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));

-    setupMeshConnections(smInPlaceChannels_, inputBuff_.get(), args_.maxBytes);
-    if (smInPlaceChannels_.size() > sizeof(constSmInPlaceChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(memoryInPlaceChannels_, inputBuff_.get(), args_.maxBytes);
+    if (memoryInPlaceChannels_.size() > sizeof(constMemInPlaceChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
-    smChannelDeviceHandles.resize(smInPlaceChannels_.size());
-    getChannelDeviceHandle(smInPlaceChannels_, smChannelDeviceHandles);
-    CUDATHROW(cudaMemcpyToSymbol(constSmInPlaceChans, smChannelDeviceHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
-
-    setupMeshConnections(smOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes,
-                         ChannelSemantic::GET);
-    if (smOutOfPlaceGetChannels_.size() >
-        sizeof(constSmOutOfPlaceGetChans) / sizeof(DeviceHandle)) {
+    memoryChannelDeviceHandles.resize(memoryInPlaceChannels_.size());
+    getChannelDeviceHandle(memoryInPlaceChannels_, memoryChannelDeviceHandles);
+    CUDATHROW(cudaMemcpyToSymbol(constMemInPlaceChans, memoryChannelDeviceHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
+
+    setupMeshConnections(memoryOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(),
+                         args_.maxBytes, ChannelSemantic::GET);
+    if (memoryOutOfPlaceGetChannels_.size() >
+        sizeof(constMemOutOfPlaceGetChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
-    smChannelDeviceHandles.resize(smOutOfPlaceGetChannels_.size());
-    getChannelDeviceHandle(smOutOfPlaceGetChannels_, smChannelDeviceHandles);
-    CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceGetChans, smChannelDeviceHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
+    memoryChannelDeviceHandles.resize(memoryOutOfPlaceGetChannels_.size());
+    getChannelDeviceHandle(memoryOutOfPlaceGetChannels_, memoryChannelDeviceHandles);
+    CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceGetChans, memoryChannelDeviceHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
   }
 }
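Every engine above repeats the same host-side pattern: flatten host channels into device handles, bound-check against a fixed-size `__constant__` array, and upload with `cudaMemcpyToSymbol`. The sketch below condenses it with illustrative names; note that the tests construct `std::runtime_error("unexpected error")` without `throw`, so a real guard should throw it, as done here. `CUDATHROW` is the error-checking macro from the tests' common header.

```cpp
#include <algorithm>
#include <stdexcept>
#include <vector>

__constant__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> gChans[8];

void uploadChannels(const std::vector<mscclpp::MemoryChannel>& chans) {
  std::vector<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> handles(chans.size());
  std::transform(chans.begin(), chans.end(), handles.begin(),
                 [](const mscclpp::MemoryChannel& c) { return mscclpp::deviceHandle(c); });
  if (handles.size() > sizeof(gChans) / sizeof(gChans[0])) {
    throw std::runtime_error("too many channels for the constant array");
  }
  CUDATHROW(cudaMemcpyToSymbol(gChans, handles.data(), sizeof(handles[0]) * handles.size()));
}
```

Constant memory is a good fit here because every thread reads the same handle, and the upload happens once before any kernel launch.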
diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu
index 6d39e9f5f..57e45e450 100644
--- a/test/mscclpp-test/alltoall_test.cu
+++ b/test/mscclpp-test/alltoall_test.cu
@@ -9,7 +9,7 @@
 template
 using DeviceHandle = mscclpp::DeviceHandle;

-__constant__ DeviceHandle constProxyChans[16];
+__constant__ DeviceHandle constPortChans[16];
 __device__ mscclpp::DeviceSyncer deviceSyncer;
 void* localRecvBuff;
 void* localSendBuff;
@@ -17,14 +17,14 @@ void* localSendBuff;
 __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   for (int i = 1; i < nRanksPerNode; i++) {
-    DeviceHandle proxyChan = constProxyChans[blockIdx.x];
+    DeviceHandle portChan = constPortChans[blockIdx.x];
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
-      proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
-                                      nElements * sizeof(int));
+      portChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+                                     nElements * sizeof(int));
     }
     // wait for the data from GPU (rank-i) % nranksPerNode to arrive
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank - i + nRanksPerNode) % nRanksPerNode) {
-      proxyChan.wait();
+      portChan.wait();
     }
     deviceSyncer.sync(nRanksPerNode - 1);
   }
@@ -32,16 +32,16 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {

 __global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
-  DeviceHandle proxyChan = constProxyChans[blockIdx.x];
+  DeviceHandle portChan = constPortChans[blockIdx.x];
   if (threadIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
-                            nElements * sizeof(int));
+    portChan.putWithSignal(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+                           nElements * sizeof(int));
   }

   deviceSyncer.sync(gridDim.x);
   if (threadIdx.x == 0) {
-    proxyChan.flush();
-    proxyChan.wait();
+    portChan.flush();
+    portChan.wait();
   }
 }

@@ -149,14 +149,14 @@ void AllToAllTestEngine::allocateBuffer() {
 }

 void AllToAllTestEngine::setupConnections() {
-  std::vector> proxyChannels;
-  setupMeshConnections(proxyChannels, sendBuff_.get(), args_.maxBytes, recvBuff_.get(), args_.maxBytes);
+  std::vector> portChannels;
+  setupMeshConnections(portChannels, sendBuff_.get(), args_.maxBytes, recvBuff_.get(), args_.maxBytes);

-  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle)) {
+  if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle)) {
     std::runtime_error("unexpected error");
   }
-  CUDATHROW(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
-                               sizeof(DeviceHandle) * proxyChannels.size()));
+  CUDATHROW(cudaMemcpyToSymbol(constPortChans, portChannels.data(),
+                               sizeof(DeviceHandle) * portChannels.size()));
 }

 std::vector AllToAllTestEngine::getSendBuff() { return {sendBuff_.get()}; }
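The round-robin schedule in `localAlltoall` deserves a closer look: in round i, each rank sends to `(rank + i) % n` and expects data from `(rank - i + n) % n`, so every round forms a perfect matching and no link is oversubscribed. A tiny host-side illustration of just the pairing, under an assumed `nRanksPerNode` of 4:

```cpp
#include <cstdio>

int main() {
  const int n = 4;  // assumed nRanksPerNode
  for (int i = 1; i < n; ++i) {
    for (int rank = 0; rank < n; ++rank) {
      int sendTo = (rank + i) % n;
      int recvFrom = (rank - i + n) % n;
      std::printf("round %d: rank %d -> %d, <- %d\n", i, rank, sendTo, recvFrom);
    }
  }
  return 0;
}
```

Each rank appears exactly once as a sender and once as a receiver per round, which is what lets the kernel synchronize each round with a single `deviceSyncer.sync(nRanksPerNode - 1)`.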
// TODO(saemal): retrun the actual vector instead of void -void BaseTestEngine::setupMeshConnections(std::vector>& proxyChannels, +void BaseTestEngine::setupMeshConnections(std::vector>& portChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes, SetupChannelFunc setupChannel) { mscclpp::TransportFlags allTransports = mscclpp::Transport::CudaIpc; @@ -419,16 +419,16 @@ void BaseTestEngine::setupMeshConnections(std::vector(chanService_); for (size_t i = 0; i < connections.size(); ++i) { - proxyChannels.push_back(mscclpp::deviceHandle( - service->proxyChannel(service->buildAndAddSemaphore(*comm_, connections[i]), - service->addMemory(remoteRegMemories[i].get()), service->addMemory(inputBufRegMem)))); + portChannels.push_back(mscclpp::deviceHandle( + service->portChannel(service->buildAndAddSemaphore(*comm_, connections[i]), + service->addMemory(remoteRegMemories[i].get()), service->addMemory(inputBufRegMem)))); } } comm_->setup(); } -void BaseTestEngine::setupMeshConnections(std::vector& smChannels, void* inputBuff, +void BaseTestEngine::setupMeshConnections(std::vector& memoryChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes, ChannelSemantic semantic, size_t nChannelPerConnection) { mscclpp::TransportFlags allTransports = mscclpp::Transport::CudaIpc; @@ -446,11 +446,12 @@ void BaseTestEngine::setupMeshConnections(std::vector& smCha (outputBuff && semantic == ChannelSemantic::PUT) ? outputBufRegMem : inputBufRegMem; setupMeshConnectionsInternal(connections, localRegMemory, remoteRegMemories); - std::unordered_map>> smSemaphores; + std::unordered_map>> memorySemaphores; for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { for (size_t i = 0; i < nChannelPerConnection; ++i) { - smSemaphores[cid].emplace_back(std::make_shared(*comm_, connections[cid])); + memorySemaphores[cid].emplace_back( + std::make_shared(*comm_, connections[cid])); } } } @@ -459,16 +460,16 @@ void BaseTestEngine::setupMeshConnections(std::vector& smCha for (size_t i = 0; i < nChannelPerConnection; ++i) { for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smChannels.emplace_back(smSemaphores[cid][i], remoteRegMemories[cid].get(), - (outputBuff && semantic == ChannelSemantic::GET) ? outputBuff : inputBufRegMem.data(), - outputBuff); + memoryChannels.emplace_back( + memorySemaphores[cid][i], remoteRegMemories[cid].get(), + (outputBuff && semantic == ChannelSemantic::GET) ? 
       }
     }
   }
 }
 
-void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::SmChannel>& smChannels,
-                                          std::vector<DeviceHandle<mscclpp::ProxyChannel>>& proxyChannels,
+void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::MemoryChannel>& memoryChannels,
+                                          std::vector<DeviceHandle<mscclpp::PortChannel>>& portChannels,
                                           void* inputBuff, size_t inputBuffBytes, void* putPacketBuff,
                                           size_t putPacketBuffBytes, void* getPacketBuff, size_t getPacketBuffBytes,
                                           void* outputBuff, size_t outputBuffBytes) {
@@ -500,13 +501,13 @@ void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::SmChannel>& smCha
     setupMeshConnectionsInternal(connections, outputBufRegMem, remoteRegMemoriesOutput, false);
   }
 
-  std::unordered_map<size_t, std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
+  std::unordered_map<size_t, std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
   std::unordered_map<size_t, mscclpp::SemaphoreId> connIdToSemId;
   auto service = std::dynamic_pointer_cast<mscclpp::ProxyService>(chanService_);
 
   for (size_t cid = 0; cid < connections.size(); ++cid) {
     if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
-      smSemaphores.emplace(cid, std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, connections[cid]));
+      memorySemaphores.emplace(cid, std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, connections[cid]));
     } else {
       connIdToSemId[cid] = service->buildAndAddSemaphore(*comm_, connections[cid]);
     }
@@ -515,16 +516,16 @@ void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::SmChannel>& smCha
   for (size_t cid = 0; cid < connections.size(); ++cid) {
     if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
-      smChannels.emplace_back(smSemaphores[cid],
-                              (outputBuff) ? remoteRegMemoriesOutput[cid].get() : remoteRegMemories[cid].get(),
-                              inputBufRegMem.data(), (outputBuff) ? outputBufRegMem.data() : nullptr);
+      memoryChannels.emplace_back(memorySemaphores[cid],
+                                  (outputBuff) ? remoteRegMemoriesOutput[cid].get() : remoteRegMemories[cid].get(),
+                                  inputBufRegMem.data(), (outputBuff) ? outputBufRegMem.data() : nullptr);
     } else {
       if (putPacketBuff == nullptr || getPacketBuff == nullptr) {
         throw std::runtime_error("IB transport requires putPacketBuff and getPacketBuff");
       }
-      proxyChannels.emplace_back(mscclpp::deviceHandle(
-          service->proxyChannel(connIdToSemId[cid], service->addMemory(remoteRegMemories[cid].get()),
-                                service->addMemory(putPacketBufRegMem))));
+      portChannels.emplace_back(mscclpp::deviceHandle(
+          service->portChannel(connIdToSemId[cid], service->addMemory(remoteRegMemories[cid].get()),
+                               service->addMemory(putPacketBufRegMem))));
     }
   }
 }
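The memory-mapped path changes the same way. Here is a comparable sketch for constructing a `MemoryChannel`; `makeMemoryChannel` and its parameters are placeholders, while the semaphore and channel constructors match `BaseTestEngine::setupMeshConnections` above:

```cpp
#include <memory>

#include <mscclpp/core.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/semaphore.hpp>

// Illustrative sketch (not part of this patch): a MemoryChannel pairs a
// device-to-device semaphore (MemoryDevice2DeviceSemaphore, formerly
// SmDevice2DeviceSemaphore) with the peer's registered memory and a local
// source pointer.
mscclpp::MemoryChannel makeMemoryChannel(mscclpp::Communicator& comm,
                                         std::shared_ptr<mscclpp::Connection> conn,
                                         mscclpp::RegisteredMemory remoteMem, void* localPtr) {
  auto semaphore = std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(comm, conn);
  return mscclpp::MemoryChannel(semaphore, remoteMem, localPtr);
}
```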
diff --git a/test/mscclpp-test/common.hpp b/test/mscclpp-test/common.hpp
index d7408cc29..0267713da 100644
--- a/test/mscclpp-test/common.hpp
+++ b/test/mscclpp-test/common.hpp
@@ -5,8 +5,8 @@
 #define MSCCLPP_TESTS_COMMON_H_
 
 #include <mscclpp/core.hpp>
-#include <mscclpp/proxy_channel.hpp>
-#include <mscclpp/sm_channel.hpp>
+#include <mscclpp/memory_channel.hpp>
+#include <mscclpp/port_channel.hpp>
 #include <mscclpp/utils.hpp>
 
 #define CUDATHROW(cmd) \
@@ -113,14 +113,14 @@ class BaseTestEngine {
                                          const mscclpp::RegisteredMemory&)>;
   template <class T>
   using DeviceHandle = mscclpp::DeviceHandle<T>;
-  void setupMeshConnections(std::vector<DeviceHandle<mscclpp::ProxyChannel>>& proxyChannels, void* inputBuff,
+  void setupMeshConnections(std::vector<DeviceHandle<mscclpp::PortChannel>>& portChannels, void* inputBuff,
                             size_t inputBuffBytes, void* outputBuff = nullptr, size_t outputBuffBytes = 0,
                             SetupChannelFunc setupChannel = nullptr);
-  void setupMeshConnections(std::vector<mscclpp::SmChannel>& smChannels, void* inputBuff, size_t inputBuffBytes,
+  void setupMeshConnections(std::vector<mscclpp::MemoryChannel>& memoryChannels, void* inputBuff, size_t inputBuffBytes,
                             void* outputBuff = nullptr, size_t outputBuffBytes = 0,
                             ChannelSemantic semantic = ChannelSemantic::PUT, size_t nChannelPerConnection = 1);
-  void setupMeshConnections(std::vector<mscclpp::SmChannel>& smChannels,
-                            std::vector<DeviceHandle<mscclpp::ProxyChannel>>& proxyChannels, void* inputBuff,
+  void setupMeshConnections(std::vector<mscclpp::MemoryChannel>& memoryChannels,
+                            std::vector<DeviceHandle<mscclpp::PortChannel>>& portChannels, void* inputBuff,
                             size_t inputBuffBytes, void* putPacketBuff = nullptr, size_t putPacketBuffBytes = 0,
                             void* getPacketBuff = nullptr, size_t getPacketBuffBytes = 0, void* outputBuff = nullptr,
                             size_t outputBuffBytes = 0);
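The header renames in this hunk are mechanical; downstream includes migrate one-to-one (a sketch, not part of the patch):

```cpp
// Before this patch:
//   #include <mscclpp/proxy_channel.hpp>  // ProxyChannel
//   #include <mscclpp/sm_channel.hpp>     // SmChannel
// After this patch:
#include <mscclpp/memory_channel.hpp>  // MemoryChannel (was SmChannel)
#include <mscclpp/port_channel.hpp>    // PortChannel (was ProxyChannel)
```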
diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu
index 0bd13e02c..99d7bd2f9 100644
--- a/test/mscclpp-test/sendrecv_test.cu
+++ b/test/mscclpp-test/sendrecv_test.cu
@@ -7,8 +7,8 @@
 #include <mscclpp/concurrency_device.hpp>
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
+#include <mscclpp/memory_channel.hpp>
 #include <mscclpp/semaphore.hpp>
-#include <mscclpp/sm_channel.hpp>
 #include <mscclpp/utils.hpp>
 #include <vector>
@@ -24,7 +24,7 @@ constexpr size_t MAX_BLOCKS_NUM = 32;
 template <class T>
 using DeviceHandle = mscclpp::DeviceHandle<T>;
 
-__constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[2];
+__constant__ DeviceHandle<mscclpp::MemoryChannel> constMemChans[2];
 
 inline int getBlockNum(size_t count) {
   return std::min((count + THRES_BYTES_PER_BLOCK - 1) / THRES_BYTES_PER_BLOCK, MAX_BLOCKS_NUM);
@@ -41,8 +41,8 @@ __global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBl
   size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
   int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
-  DeviceHandle<mscclpp::SmChannel> sendConn = constSmChans[0];
-  DeviceHandle<mscclpp::SmChannel> recvConn = constSmChans[1];
+  DeviceHandle<mscclpp::MemoryChannel> sendConn = constMemChans[0];
+  DeviceHandle<mscclpp::MemoryChannel> recvConn = constMemChans[1];
 
   sendConn.put(startIndex, startIndex, blockDataSize, threadIdx.x, blockDim.x);
   deviceSyncer.sync(gridDim.x);
@@ -131,7 +131,7 @@ class SendRecvTestEngine : public BaseTestEngine {
 
   std::vector<std::shared_ptr<int>> devicePtrs_;
   std::shared_ptr<int[]> expectedBuff_;
-  std::vector<mscclpp::SmChannel> smChannels_;
+  std::vector<mscclpp::MemoryChannel> memoryChannels_;
 };
 
 SendRecvTestEngine::SendRecvTestEngine(const TestArgs& args) : BaseTestEngine(args, "sendrecv") { inPlace_ = false; }
@@ -153,7 +153,7 @@ void SendRecvTestEngine::setupConnections() {
   std::array<int, 2> ranks = {sendToRank, recvFromRank};
 
   auto service = std::dynamic_pointer_cast<mscclpp::ProxyService>(chanService_);
-  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
+  std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
 
   auto sendConnFuture =
       comm_->connectOnSetup(sendToRank, 0, getTransport(args_.rank, sendToRank, args_.nRanksPerNode, ibDevice));
@@ -161,12 +161,12 @@ void SendRecvTestEngine::setupConnections() {
     auto recvConnFuture =
         comm_->connectOnSetup(recvFromRank, 0, getTransport(args_.rank, recvFromRank, args_.nRanksPerNode, ibDevice));
     comm_->setup();
-    smSemaphores.push_back(std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
-    smSemaphores.push_back(std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, recvConnFuture.get()));
+    memorySemaphores.push_back(std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
+    memorySemaphores.push_back(std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, recvConnFuture.get()));
   } else {
     comm_->setup();
-    smSemaphores.push_back(std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
-    smSemaphores.push_back(smSemaphores[0]);  // reuse the send channel if worldSize is 2
+    memorySemaphores.push_back(std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
+    memorySemaphores.push_back(memorySemaphores[0]);  // reuse the send channel if worldSize is 2
   }
   comm_->setup();
@@ -183,15 +183,15 @@ void SendRecvTestEngine::setupConnections() {
 
   // swap to make sure devicePtrs_[0] in local rank write to devicePtrs_[1] in remote rank
   std::swap(futureRemoteMemory[0], futureRemoteMemory[1]);
 
-  std::vector<DeviceHandle<mscclpp::SmChannel>> smChannelHandles(2);
+  std::vector<DeviceHandle<mscclpp::MemoryChannel>> memoryChannelHandles(2);
   for (int i : {0, 1}) {  // We assume ranks in the same node
-    smChannels_.emplace_back(smSemaphores[i], futureRemoteMemory[i].get(), (void*)localMemories[i].data());
+    memoryChannels_.emplace_back(memorySemaphores[i], futureRemoteMemory[i].get(), (void*)localMemories[i].data());
   }
-  std::transform(smChannels_.begin(), smChannels_.end(), smChannelHandles.begin(),
-                 [](const mscclpp::SmChannel& smChannel) { return smChannel.deviceHandle(); });
-  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
-                                       sizeof(DeviceHandle<mscclpp::SmChannel>) * smChannelHandles.size()));
+  std::transform(memoryChannels_.begin(), memoryChannels_.end(), memoryChannelHandles.begin(),
+                 [](const mscclpp::MemoryChannel& memoryChannel) { return memoryChannel.deviceHandle(); });
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(constMemChans, memoryChannelHandles.data(),
+                                       sizeof(DeviceHandle<mscclpp::MemoryChannel>) * memoryChannelHandles.size()));
 }
 
 std::vector<void*> SendRecvTestEngine::getSendBuff() { return {devicePtrs_[0].get()}; }
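Finally, a device-side sketch of the renamed handle in use, modeled on the sendrecv kernel above; `exampleKernel`, the single-channel layout, and the explicit `signal()`/`wait()` pairing are illustrative assumptions rather than code from this patch:

```cpp
#include <mscclpp/memory_channel.hpp>

__constant__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> constChan;

// Illustrative sketch (not part of this patch): every thread of the block
// cooperates on the copy, since put() strides elements by threadIdx.x and
// blockDim.x; thread 0 then tells the peer the data has landed and waits
// for the peer's matching signal.
__global__ void exampleKernel(size_t bytes) {
  constChan.put(/*dstOffset=*/0, /*srcOffset=*/0, bytes, threadIdx.x, blockDim.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    constChan.signal();  // mark our put() as complete for the peer
    constChan.wait();    // block until the peer signals back
  }
}
```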