From 3565bfdf6d37e769f334ca0e2290f5927b3a9725 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 24 Jan 2025 14:25:31 -0800
Subject: [PATCH] Renaming channels (#436)

Renamed `ProxyChannel` to `PortChannel` and `SmChannel` to `MemoryChannel`

---
 README.md                                     |  12 +-
 apps/nccl/src/allgather.hpp                   |  62 ++--
 apps/nccl/src/allreduce.hpp                   |  50 +--
 apps/nccl/src/broadcast.hpp                   |  76 ++--
 apps/nccl/src/nccl.cu                         | 136 +++---
 docs/design/design.md                         |  38 +-
 docs/design/mscclpp-dsl.md                    |   8 +-
 docs/getting-started/tutorials/index.rst      |   4 +-
 .../tutorials/initialization.md               |  16 +-
 .../tutorials/memory-channel.md               |   3 +
 .../getting-started/tutorials/port-channel.md |   3 +
 .../tutorials/proxy-channel.md                |   3 -
 docs/getting-started/tutorials/python-api.md  |   8 +-
 docs/getting-started/tutorials/sm-channel.md  |   3 -
 include/mscclpp/memory_channel.hpp            |  50 +++
 ...l_device.hpp => memory_channel_device.hpp} |  15 +-
 .../{proxy_channel.hpp => port_channel.hpp}   |  70 ++--
 ...nel_device.hpp => port_channel_device.hpp} |  40 +-
 include/mscclpp/semaphore.hpp                 |  16 +-
 include/mscclpp/semaphore_device.hpp          |   8 +-
 include/mscclpp/sm_channel.hpp                |  47 ---
 python/examples/allgather_barrier.py          |   6 +-
 python/examples/send_recv_packet.py           |   6 +-
 python/examples/send_recv_proxy.py            |  10 +-
 python/mscclpp/__init__.py                    |  89 ++++-
 python/mscclpp/comm.py                        |  44 ++-
 python/mscclpp/core_py.cpp                    |   8 +-
 python/mscclpp/language/collectives.py        |   3 -
 .../mscclpp/language/dag/instruction_dag.py   |   2 +-
 python/mscclpp/language/dag/optimizer.py      |  22 +-
 python/mscclpp/language/ir.py                 |   2 +-
 python/mscclpp/language/program.py            |  26 +-
 python/mscclpp/language/types.py              |   8 +-
 python/mscclpp/memory_channel_py.cpp          |  35 ++
 ...oxy_channel_py.cpp => port_channel_py.cpp} |  40 +-
 python/mscclpp/semaphore_py.cpp               |  18 +-
 python/mscclpp/sm_channel_py.cpp              |  35 --
 python/mscclpp_benchmark/allreduce.cu         | 186 +++++-----
 python/mscclpp_benchmark/mscclpp_op.py        |  68 ++--
 python/test/d2d_semaphore_test.cu             |   2 +-
 ...channel_test.cu => memory_channel_test.cu} |   5 +-
 python/test/nvls_test.cu                      |   2 +-
 ...y_channel_test.cu => port_channel_test.cu} |   6 +-
 python/test/test_mscclpp.py                   |  34 +-
 src/executor/execution_plan.cc                |  40 +-
 src/executor/executor.cc                      |  50 +--
 src/include/execution_common.hpp              |  16 +-
 src/include/execution_kernel.hpp              | 132 +++----
 src/include/execution_plan.hpp                |   4 +-
 src/{sm_channel.cc => memory_channel.cc}      |  10 +-
 src/{proxy_channel.cc => port_channel.cc}     |  31 +-
 src/semaphore.cc                              |   8 +-
 test/allgather_test_cpp.cu                    |  68 ++--
 test/mp_unit/CMakeLists.txt                   |   4 +-
 ...annel_tests.cu => memory_channel_tests.cu} | 144 ++++----
 test/mp_unit/mp_unit_tests.hpp                |  17 +-
 ...channel_tests.cu => port_channel_tests.cu} | 140 +++----
 test/mscclpp-test/allgather_test.cu           | 200 +++++-----
 test/mscclpp-test/allreduce_test.cu           | 342 +++++++++---------
 test/mscclpp-test/alltoall_test.cu            |  30 +-
 test/mscclpp-test/common.cc                   |  41 ++-
 test/mscclpp-test/common.hpp                  |  12 +-
 test/mscclpp-test/sendrecv_test.cu            |  32 +-
 63 files changed, 1373 insertions(+), 1273 deletions(-)
 create mode 100644 docs/getting-started/tutorials/memory-channel.md
 create mode 100644 docs/getting-started/tutorials/port-channel.md
 delete mode 100644 docs/getting-started/tutorials/proxy-channel.md
 delete mode 100644 docs/getting-started/tutorials/sm-channel.md
 create mode 100644 include/mscclpp/memory_channel.hpp
 rename include/mscclpp/{sm_channel_device.hpp => memory_channel_device.hpp} (97%)
 rename include/mscclpp/{proxy_channel.hpp => port_channel.hpp} (59%)
 rename include/mscclpp/{proxy_channel_device.hpp =>
port_channel_device.hpp} (87%)
 delete mode 100644 include/mscclpp/sm_channel.hpp
 create mode 100644 python/mscclpp/memory_channel_py.cpp
 rename python/mscclpp/{proxy_channel_py.cpp => port_channel_py.cpp} (54%)
 delete mode 100644 python/mscclpp/sm_channel_py.cpp
 rename python/test/{sm_channel_test.cu => memory_channel_test.cu} (83%)
 rename python/test/{proxy_channel_test.cu => port_channel_test.cu} (85%)
 rename src/{sm_channel.cc => memory_channel.cc} (54%)
 rename src/{proxy_channel.cc => port_channel.cc} (68%)
 rename test/mp_unit/{sm_channel_tests.cu => memory_channel_tests.cu} (64%)
 rename test/mp_unit/{proxy_channel_tests.cu => port_channel_tests.cu} (74%)

diff --git a/README.md b/README.md
index 4127f8b8e..03d894fca 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,8 @@ The following highlights key concepts of MSCCL++.
 MSCCL++ provides peer-to-peer communication methods between GPUs. A peer-to-peer connection between two GPUs is called a *Channel*. Channels are constructed by MSCCL++ host-side interfaces and copied to GPUs during initialization. Channels provide *GPU-side interfaces*, which means that all communication methods are defined as device functions to be called from GPU kernel code. For example, the `put()` method in the following example copies 1KB of data from the local GPU to a remote GPU.
 ```cpp
-// `ProxyChannel` will be explained in the following section.
-__device__ mscclpp::DeviceHandle<mscclpp::ProxyChannel> channel;
+// `PortChannel` will be explained in the following section.
+__device__ mscclpp::DeviceHandle<mscclpp::PortChannel> channel;
 __global__ void gpuKernel() {
   ...
   // Only one thread is needed for this method.
@@ -79,15 +79,15 @@ __device__ void barrier() {
 MSCCL++ provides consistent interfaces, i.e., the above interfaces are used regardless of the location of the remote GPU (either on the local node or on a remote node) or the underlying link (either NVLink/xGMI or InfiniBand).
-### ProxyChannel and SmChannel
+### PortChannel and MemoryChannel
-MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel**. `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `ProxyChannel` requires only a single GPU thread to call its methods. See all `ProxyChannel` methods from [here](./include/mscclpp/proxy_channel_device.hpp).
+MSCCL++ delivers two types of channels, **PortChannel** and **MemoryChannel**. `PortChannel` provides port-mapping-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy, which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, `PortChannel` requires only a single GPU thread to call its methods. See all `PortChannel` methods from [here](./include/mscclpp/port_channel_device.hpp).
-On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space.
Comparing against `ProxyChannel`, `SmChannel` is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all `SmChannel` methods from [here](./include/mscclpp/sm_channel_device.hpp). +On the other hand, `MemoryChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Comparing against `PortChannel`, `MemoryChannel` is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all `MemoryChannel` methods from [here](./include/mscclpp/memory_channel_device.hpp). ### Host-Side Communication Proxy -MSCCL++ provides a default implementation of a host-side proxy for ProxyChannels, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++. +MSCCL++ provides a default implementation of a host-side proxy for PortChannels, which is a background host thread that busy polls triggers from GPUs and conducts functionalities accordingly. For example, the following is a typical host-side code for MSCCL++. ```cpp // Bootstrap: initialize control-plane connections between all ranks diff --git a/apps/nccl/src/allgather.hpp b/apps/nccl/src/allgather.hpp index 59aedbb49..af4a6808a 100644 --- a/apps/nccl/src/allgather.hpp +++ b/apps/nccl/src/allgather.hpp @@ -7,14 +7,14 @@ #include #include #include -#include -#include +#include +#include #include "common.hpp" template __global__ void __launch_bounds__(1024, 1) - allgather6(void* sendbuff, mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, + allgather6(void* sendbuff, mscclpp::DeviceHandle* memoryChannels, size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t tid = threadIdx.x + blockIdx.x * blockDim.x; const size_t lid = tid % WARP_SIZE; @@ -24,11 +24,11 @@ __global__ void __launch_bounds__(1024, 1) const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = smChannels + chanOffset; + auto memChans = memoryChannels + chanOffset; if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } __syncthreads(); @@ -49,16 +49,16 @@ __global__ void __launch_bounds__(1024, 1) const size_t peerIdx = wid % nPeer; const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp; if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); + char* src = reinterpret_cast(memChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (wid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); + memChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + 
memChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { - smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); + memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } } @@ -67,16 +67,16 @@ __global__ void __launch_bounds__(1024, 1) const size_t peerIdx = gWid % nPeer; const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp; if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); + char* src = reinterpret_cast(memChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); const size_t offsetWithinRank = (gWid / nPeer) * unitBytesPerWarp; - smChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); - smChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, - WARP_SIZE); + memChans[peerIdx].copy<16, false>(src + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); + memChans[peerIdx].copy<16, false>(dst + offset + channelOutOffset, buff + offsetWithinRank, unitBytesPerWarp, lid, + WARP_SIZE); } else { - smChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); + memChans[peerIdx].put<16, false>(offset + channelOutOffset, unitBytesPerWarp, lid, WARP_SIZE); } } @@ -90,15 +90,15 @@ __global__ void __launch_bounds__(1024, 1) : unitBytesPerWarp; if (remainBytes > 0) { if constexpr (IsOutOfPlace) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); - char* src = reinterpret_cast(smChans[peerIdx].src_); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); + char* src = reinterpret_cast(memChans[peerIdx].src_); char* buff = reinterpret_cast(sendbuff); - smChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, - WARP_SIZE); - smChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, - WARP_SIZE); + memChans[peerIdx].copy<16, true>(src + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); + memChans[peerIdx].copy<16, true>(dst + offset + channelOutOffset, buff + offsetWithinRank, remainBytes, lid, + WARP_SIZE); } else { - smChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE); + memChans[peerIdx].put<16, true>(offset + channelOutOffset, remainBytes, lid, WARP_SIZE); } } } @@ -106,14 +106,14 @@ __global__ void __launch_bounds__(1024, 1) deviceSyncer.sync(gridDim.x); if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } } template cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff, - mscclpp::DeviceHandle* smChannels, size_t channelOutOffset, int rank, + mscclpp::DeviceHandle* memoryChannels, size_t channelOutOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { int nBlocks = 28; if (nelems <= 4096) { @@ -123,7 +123,7 @@ cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* } else if (nelems >= 2097152) { nBlocks = 35; } - allgather6<<>>((void*)buff, smChannels, channelOutOffset, rank, worldSize, + 
allgather6<<>>((void*)buff, memoryChannels, channelOutOffset, rank, worldSize, nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); } diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp index 41342413b..e53cd1f11 100644 --- a/apps/nccl/src/allreduce.hpp +++ b/apps/nccl/src/allreduce.hpp @@ -8,9 +8,9 @@ #include #include #include +#include +#include #include -#include -#include #if defined(ENABLE_NPKIT) #include @@ -196,7 +196,7 @@ __forceinline__ __device__ void vectorSum(T* dst, T* src, size_t nElem) { template __global__ void __launch_bounds__(32, 1) - allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + allreduceAllToAll(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag) { // This version of allreduce only works for single nodes @@ -213,10 +213,10 @@ __global__ void __launch_bounds__(32, 1) uint32_t* src = (uint32_t*)((char*)buff); uint32_t* dst = (uint32_t*)((char*)resultBuff); - __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; const int lid = tid % WARP_SIZE; if (lid < nPeers) { - channels[lid] = smChannels[lid]; + channels[lid] = memoryChannels[lid]; } __syncwarp(); @@ -240,7 +240,7 @@ __global__ void __launch_bounds__(32, 1) template __global__ void __launch_bounds__(1024, 1) - allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, + allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, uint32_t flag #if defined(ENABLE_NPKIT) @@ -304,10 +304,10 @@ __global__ void __launch_bounds__(1024, 1) uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // Put channels into shared memory, read channel info from global memory is unexpectable slow. 
- __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; const int lid = tid % WARP_SIZE; if (lid < nPeers) { - channels[lid] = smChannels[lid]; + channels[lid] = memoryChannels[lid]; } __syncwarp(); @@ -361,16 +361,16 @@ __global__ void __launch_bounds__(1024, 1) template __global__ void __launch_bounds__(512, 1) - allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, size_t channelOutDataOffset, + allreduce8(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, + mscclpp::DeviceHandle* memoryOutChannels, size_t channelOutDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems) { const int nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; // assume (nelems * sizeof(T)) is divisible by (16 * worldSize) const size_t nInt4 = nelems * sizeof(T) / sizeof(int4); const size_t nInt4PerRank = nInt4 / worldSize; - auto smChans = smChannels + chanOffset; - auto smOutChans = smOutChannels + chanOffset; + auto memoryChans = memoryChannels + chanOffset; + auto memoryOutChans = memoryOutChannels + chanOffset; int4* buff4 = reinterpret_cast(buff); int4* scratch4 = reinterpret_cast((char*)scratch + channelScratchOffset); @@ -396,12 +396,12 @@ __global__ void __launch_bounds__(512, 1) const size_t scratchChunkRankOffset = chunkSizePerRank * rank; const size_t scratchBaseOffsetInt4 = channelScratchOffset / sizeof(int4); - __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; - __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle channels[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle outChannels[NRANKS_PER_NODE - 1]; const int lid = threadIdx.x % WARP_SIZE; if (lid < nPeer) { - channels[lid] = smChans[lid]; - outChannels[lid] = smOutChans[lid]; + channels[lid] = memoryChans[lid]; + outChannels[lid] = memoryOutChans[lid]; } __syncwarp(); @@ -496,8 +496,8 @@ __global__ void __launch_bounds__(512, 1) } template -cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, - mscclpp::DeviceHandle* smOutChannels, size_t channelInOffset, +cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, + mscclpp::DeviceHandle* memoryOutChannels, size_t channelInOffset, size_t channelOutOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) { static uint32_t flag = 1; @@ -505,9 +505,9 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< if (sizeof(T) * nelems < worldSize * sizeof(int)) { int nBlocks = 7; int nThreadsPerBlock = 32; - allreduceAllToAll<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, - channelScratchOffset, rank, nRanksPerNode, worldSize, - nelems, flag++); + allreduceAllToAll<<>>(buff, scratch, resultBuff, memoryChannels, + channelInOffset, channelScratchOffset, rank, + nRanksPerNode, worldSize, nelems, flag++); } else if (sizeof(T) * nelems <= (1 << 20)) { int nBlocks = 28; int nThreadsPerBlock = 1024; @@ -518,17 +518,17 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< #if defined(ENABLE_NPKIT) size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent); allreduce7<<>>( - buff, scratch, resultBuff, smChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, - nelems, 
flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); + buff, scratch, resultBuff, memoryChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, + worldSize, nelems, flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else - allreduce7<<>>(buff, scratch, resultBuff, smChannels, channelInOffset, + allreduce7<<>>(buff, scratch, resultBuff, memoryChannels, channelInOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems, flag++); #endif } else { int nBlocks = 35; int nThreadsPerBlock = 512; - allreduce8<<>>(buff, scratch, resultBuff, smChannels, smOutChannels, + allreduce8<<>>(buff, scratch, resultBuff, memoryChannels, memoryOutChannels, channelOutOffset, channelScratchOffset, rank, nRanksPerNode, worldSize, nelems); } diff --git a/apps/nccl/src/broadcast.hpp b/apps/nccl/src/broadcast.hpp index e9a9111f6..6d52c963d 100644 --- a/apps/nccl/src/broadcast.hpp +++ b/apps/nccl/src/broadcast.hpp @@ -7,25 +7,25 @@ #include #include #include -#include -#include +#include +#include #include "common.hpp" template __global__ void __launch_bounds__(1024, 1) - broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, mscclpp::DeviceHandle* smChannels, - [[maybe_unused]] size_t channelOutOffset, size_t rank, [[maybe_unused]] size_t worldSize, size_t root, - size_t nRanksPerNode, size_t nelemsPerGPU) { + broadcast6(void* sendbuff, void* scratchbuff, void* recvbuff, + mscclpp::DeviceHandle* memoryChannels, [[maybe_unused]] size_t channelOutOffset, + size_t rank, [[maybe_unused]] size_t worldSize, size_t root, size_t nRanksPerNode, size_t nelemsPerGPU) { const size_t nThread = blockDim.x * gridDim.x; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - __shared__ mscclpp::DeviceHandle smChans[NRANKS_PER_NODE - 1]; + __shared__ mscclpp::DeviceHandle memChans[NRANKS_PER_NODE - 1]; if (threadIdx.x < nPeer) { - smChans[threadIdx.x] = smChannels[chanOffset + threadIdx.x]; - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x] = memoryChannels[chanOffset + threadIdx.x]; + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } __syncthreads(); @@ -55,23 +55,23 @@ __global__ void __launch_bounds__(1024, 1) if (rank == root) { char* send_ = reinterpret_cast(sendbuff); for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); // Peer's scratchbuff. - smChans[peerIdx].copy<16, false>(dst + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); // Peer's scratchbuff. + memChans[peerIdx].copy<16, false>(dst + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); __syncthreads(); - if (threadIdx.x == peerIdx) smChans[peerIdx].signal(); + if (threadIdx.x == peerIdx) memChans[peerIdx].signal(); } if constexpr (IsOutOfPlace) { char* recv_ = reinterpret_cast(recvbuff); - smChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); + memChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); } } else { // rank != root. - if (threadIdx.x == peerRootIdx) smChans[peerRootIdx].wait(); + if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait(); __syncthreads(); char* recv_ = reinterpret_cast(recvbuff); char* scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. 
- smChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset, unitBytesPerBlock, threadIdx.x, - blockDim.x); + memChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset, unitBytesPerBlock, threadIdx.x, + blockDim.x); } } @@ -81,30 +81,30 @@ __global__ void __launch_bounds__(1024, 1) scratchSub = -i * unitBytes; deviceSyncer.sync(gridDim.x); if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } } if (rank == root) { char* send_ = reinterpret_cast(sendbuff); for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); // Peer's scratchbuff. - smChans[peerIdx].copy<16, false>(dst + offset + scratchSub, send_ + offset, unitBytesPerBlock, threadIdx.x, - blockDim.x); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); // Peer's scratchbuff. + memChans[peerIdx].copy<16, false>(dst + offset + scratchSub, send_ + offset, unitBytesPerBlock, threadIdx.x, + blockDim.x); __syncthreads(); - if (threadIdx.x == peerIdx) smChans[peerIdx].signal(); + if (threadIdx.x == peerIdx) memChans[peerIdx].signal(); } if constexpr (IsOutOfPlace) { char* recv_ = reinterpret_cast(recvbuff); - smChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); + memChans[0].copy<16, false>(recv_ + offset, send_ + offset, unitBytesPerBlock, threadIdx.x, blockDim.x); } } else { // rank != root. - if (threadIdx.x == peerRootIdx) smChans[peerRootIdx].wait(); + if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait(); __syncthreads(); char* recv_ = reinterpret_cast(recvbuff); char* scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. - smChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset + scratchSub, unitBytesPerBlock, - threadIdx.x, blockDim.x); + memChans[peerRootIdx].copy<16, false>(recv_ + offset, scratch_ + offset + scratchSub, unitBytesPerBlock, + threadIdx.x, blockDim.x); } } @@ -116,23 +116,23 @@ __global__ void __launch_bounds__(1024, 1) if (rank == root) { char* send_ = reinterpret_cast(sendbuff); for (size_t peerIdx = 0; peerIdx < nPeer; peerIdx++) { - char* dst = reinterpret_cast(smChans[peerIdx].dst_); // Peer's scratchbuff. - smChans[peerIdx].copy<16, true>(dst + offset + scratchSub, send_ + offset, remainBytes, threadIdx.x, - blockDim.x); + char* dst = reinterpret_cast(memChans[peerIdx].dst_); // Peer's scratchbuff. + memChans[peerIdx].copy<16, true>(dst + offset + scratchSub, send_ + offset, remainBytes, threadIdx.x, + blockDim.x); __syncthreads(); - if (threadIdx.x == peerIdx) smChans[peerIdx].signal(); + if (threadIdx.x == peerIdx) memChans[peerIdx].signal(); } if constexpr (IsOutOfPlace) { char* recv_ = reinterpret_cast(recvbuff); - smChans[0].copy<16, true>(recv_ + offset, send_ + offset, remainBytes, threadIdx.x, blockDim.x); + memChans[0].copy<16, true>(recv_ + offset, send_ + offset, remainBytes, threadIdx.x, blockDim.x); } } else { // rank != root. - if (threadIdx.x == peerRootIdx) smChans[peerRootIdx].wait(); + if (threadIdx.x == peerRootIdx) memChans[peerRootIdx].wait(); __syncthreads(); char* recv_ = reinterpret_cast(recvbuff); char* scratch_ = reinterpret_cast(scratchbuff); // My scratchbuff. 
- smChans[peerRootIdx].copy<16, true>(recv_ + offset, scratch_ + offset + scratchSub, remainBytes, threadIdx.x, - blockDim.x); + memChans[peerRootIdx].copy<16, true>(recv_ + offset, scratch_ + offset + scratchSub, remainBytes, threadIdx.x, + blockDim.x); } } // remainBytes > 0. } @@ -140,13 +140,13 @@ __global__ void __launch_bounds__(1024, 1) deviceSyncer.sync(gridDim.x); if (threadIdx.x < nPeer) { - smChans[threadIdx.x].relaxedSignal(); - smChans[threadIdx.x].wait(); + memChans[threadIdx.x].relaxedSignal(); + memChans[threadIdx.x].wait(); } } template -cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* smChannels, +cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelOutOffset, int rank, int nRanksPerNode, int root, int worldSize, size_t nelems, cudaStream_t stream) { int nBlocks = 7; @@ -157,7 +157,7 @@ cudaError_t broadcast(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle< // } else if (nelems >= 2097152) { // nBlocks = 35; // } - broadcast6<<>>((void*)buff, (void*)scratch, (void*)resultBuff, smChannels, + broadcast6<<>>((void*)buff, (void*)scratch, (void*)resultBuff, memoryChannels, channelOutOffset, rank, worldSize, root, nRanksPerNode, nelems * sizeof(T) / sizeof(int)); return cudaGetLastError(); diff --git a/apps/nccl/src/nccl.cu b/apps/nccl/src/nccl.cu index f91d15e69..3daadf8a3 100644 --- a/apps/nccl/src/nccl.cu +++ b/apps/nccl/src/nccl.cu @@ -7,8 +7,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -69,14 +69,14 @@ struct hash { } // namespace std struct ChannelInfo { - std::vector smChannels; - std::shared_ptr> smChannelDeviceHandles; + std::vector memoryChannels; + std::shared_ptr> memoryChannelDeviceHandles; }; struct ncclComm { std::shared_ptr comm; std::vector> connections; - std::vector> smSemaphores; + std::vector> memorySemaphores; std::shared_ptr executor; std::unordered_map> executionPlans; @@ -148,16 +148,15 @@ static std::vector setupRemoteMemories(std::shared_pt return remoteMemories; } -static std::vector setupSmChannels(ncclComm_t comm, - const std::vector& remoteMemories, - void* src) { - std::vector channels; - std::vector>& smSemaphores = comm->smSemaphores; +static std::vector setupMemoryChannels( + ncclComm_t comm, const std::vector& remoteMemories, void* src) { + std::vector channels; + std::vector>& memorySemaphores = comm->memorySemaphores; size_t nConnections = comm->connections.size(); for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { for (size_t cid = 0; cid < nConnections; ++cid) { if (comm->connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - channels.emplace_back(smSemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr); + channels.emplace_back(memorySemaphores[idx * nConnections + cid], remoteMemories[cid], src, nullptr); } } } @@ -171,15 +170,16 @@ static std::pair loadExecutionPlan(const std return std::make_pair(collective, executionPlanInstance{key, plan}); } -static std::shared_ptr> setupSmChannelDeviceHandles( - const std::vector& smChannels) { - std::vector> smChannelDeviceHandles; - std::transform(smChannels.begin(), smChannels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - std::shared_ptr> ptr = - mscclpp::detail::gpuCallocShared>(smChannelDeviceHandles.size()); - mscclpp::gpuMemcpy>(ptr.get(), smChannelDeviceHandles.data(), - smChannelDeviceHandles.size(), 
cudaMemcpyHostToDevice); +static std::shared_ptr> setupMemoryChannelDeviceHandles( + const std::vector& memoryChannels) { + std::vector> memoryChannelDeviceHandles; + std::transform(memoryChannels.begin(), memoryChannels.end(), std::back_inserter(memoryChannelDeviceHandles), + [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); }); + std::shared_ptr> ptr = + mscclpp::detail::gpuCallocShared>( + memoryChannelDeviceHandles.size()); + mscclpp::gpuMemcpy>( + ptr.get(), memoryChannelDeviceHandles.data(), memoryChannelDeviceHandles.size(), cudaMemcpyHostToDevice); return ptr; } @@ -211,28 +211,28 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, int rank = comm->comm->bootstrap()->getRank(); channelKey sendKey{(void*)sendBasePtr, sendBytes}; channelKey recvKey{(void*)recvBasePtr, recvBytes}; - mscclpp::DeviceHandle* smChannels = nullptr; - mscclpp::DeviceHandle* smOutChannels = nullptr; + mscclpp::DeviceHandle* memoryChannels = nullptr; + mscclpp::DeviceHandle* memoryOutChannels = nullptr; // Creating the channels if (count * ncclTypeSize(datatype) <= (1 << 20)) { auto sendIt = comm->channelScratchInfos.find(sendKey); if (sendIt == comm->channelScratchInfos.end()) { - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + std::vector channels = + setupMemoryChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; sendIt = comm->channelScratchInfos.emplace(sendKey, channelInfo).first; } - smChannels = sendIt->second.smChannelDeviceHandles.get(); + memoryChannels = sendIt->second.memoryChannelDeviceHandles.get(); } else { std::vector remoteMemories; auto sendIt = comm->channelInInfos.find(sendKey); if (sendIt == comm->channelInInfos.end()) { - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + std::vector channels = + setupMemoryChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)sendBasePtr)); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; sendIt = comm->channelInInfos.emplace(sendKey, channelInfo).first; } @@ -240,37 +240,37 @@ static ncclResult_t ncclAllReduceFallback(const void* sendbuff, void* recvbuff, if (recvIt == comm->channelOutInfos.end()) { remoteMemories = setupRemoteMemories(comm->comm, rank, (void*)recvBasePtr, recvBytes, mscclpp::Transport::CudaIpc); - std::vector outChannels = - setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); - ChannelInfo channelInfo{outChannels, setupSmChannelDeviceHandles(outChannels)}; + std::vector outChannels = + setupMemoryChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + ChannelInfo channelInfo{outChannels, setupMemoryChannelDeviceHandles(outChannels)}; recvIt = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } - smChannels = sendIt->second.smChannelDeviceHandles.get(); - smOutChannels = recvIt->second.smChannelDeviceHandles.get(); + memoryChannels = sendIt->second.memoryChannelDeviceHandles.get(); + memoryOutChannels = recvIt->second.memoryChannelDeviceHandles.get(); } switch (datatype) { case ncclFloat16: - CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, smChannels, 
smOutChannels, - offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, + CUDACHECK(allreduce((half*)sendbuff, (half*)comm->scratchBuff.get(), (half*)recvbuff, memoryChannels, + memoryOutChannels, offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclFloat32: - CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, smChannels, - smOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), + CUDACHECK(allreduce((float*)sendbuff, (float*)comm->scratchBuff.get(), (float*)recvbuff, memoryChannels, + memoryOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclBfloat16: CUDACHECK(allreduce((__bfloat16*)sendbuff, (__bfloat16*)comm->scratchBuff.get(), (__bfloat16*)recvbuff, - smChannels, smOutChannels, offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, + memoryChannels, memoryOutChannels, offsetIn, offsetOut, offsetScratch, rank, NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; case ncclInt32: case ncclUint32: - CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, smOutChannels, - offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), NRANKS_PER_NODE, - comm->comm->bootstrap()->getNranks(), count, stream)); + CUDACHECK(allreduce((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, memoryChannels, + memoryOutChannels, offsetIn, offsetOut, offsetScratch, comm->comm->bootstrap()->getRank(), + NRANKS_PER_NODE, comm->comm->bootstrap()->getNranks(), count, stream)); break; default: WARN("datatype is invalid"); @@ -304,27 +304,27 @@ static ncclResult_t ncclAllGatherFallback(const void* sendbuff, void* recvbuff, channelKey recvKey{(void*)recvBasePtr, recvBytes}; int rank = comm->comm->bootstrap()->getRank(); int nRank = comm->comm->bootstrap()->getNranks(); - mscclpp::DeviceHandle* smChannels = nullptr; + mscclpp::DeviceHandle* memoryChannels = nullptr; auto it = comm->channelOutInfos.find(recvKey); if (it == comm->channelOutInfos.end()) { std::vector remoteMemories = setupRemoteMemories( comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, mscclpp::Transport::CudaIpc); - std::vector channels = - setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + std::vector channels = + setupMemoryChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + std::vector> memoryChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(memoryChannelDeviceHandles), + [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); }); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; it = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } - smChannels = it->second.smChannelDeviceHandles.get(); + memoryChannels = it->second.memoryChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff + rank * sendcount) { - CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + 
CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, memoryChannels, offsetOut, rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } else { - CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, smChannels, offsetOut, rank, + CUDACHECK(allgather((int*)sendbuff, (int*)nullptr, (int*)recvbuff, memoryChannels, offsetOut, rank, NRANKS_PER_NODE, nRank, bytes / sizeof(int), stream)); } @@ -346,19 +346,19 @@ static void ncclCommInitRankFallbackSingleNode(ncclComm* commPtr, std::shared_pt std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections), [](const auto& future) { return future.get(); }); - std::vector> smSemaphores; + std::vector> memorySemaphores; for (size_t idx = 0; idx < NUM_CHANNELS_PER_CONNECTION; ++idx) { for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smSemaphores.emplace_back( - std::make_shared(*(mscclppComm), connections[cid])); + memorySemaphores.emplace_back( + std::make_shared(*(mscclppComm), connections[cid])); } } } mscclppComm->setup(); commPtr->connections = std::move(connections); - commPtr->smSemaphores = std::move(smSemaphores); + commPtr->memorySemaphores = std::move(memorySemaphores); commPtr->buffFlag = 0; commPtr->numScratchBuff = 2; commPtr->scratchBuff = mscclpp::GpuBuffer(SCRATCH_SIZE).memory(); @@ -584,29 +584,29 @@ NCCL_API ncclResult_t ncclBroadcastFallback(const void* sendbuff, void* recvbuff channelKey recvKey{(void*)0x0, 0}; // Just create the channel once. int rank = comm->comm->bootstrap()->getRank(); int nRank = comm->comm->bootstrap()->getNranks(); - mscclpp::DeviceHandle* smChannels = nullptr; + mscclpp::DeviceHandle* memoryChannels = nullptr; auto it = comm->channelOutInfos.find(recvKey); if (it == comm->channelOutInfos.end()) { // std::vector remoteMemories = setupRemoteMemories( // comm->comm, rank, const_cast((void*)recvBasePtr), recvBytes, mscclpp::Transport::CudaIpc); - // std::vector channels = - // setupSmChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); - std::vector channels = - setupSmChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)recvBasePtr)); - std::vector> smChannelDeviceHandles; - std::transform(channels.begin(), channels.end(), std::back_inserter(smChannelDeviceHandles), - [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); }); - ChannelInfo channelInfo{channels, setupSmChannelDeviceHandles(channels)}; + // std::vector channels = + // setupMemoryChannels(comm, remoteMemories, const_cast((void*)recvBasePtr)); + std::vector channels = + setupMemoryChannels(comm, comm->remoteScratchRegMemories, const_cast((void*)recvBasePtr)); + std::vector> memoryChannelDeviceHandles; + std::transform(channels.begin(), channels.end(), std::back_inserter(memoryChannelDeviceHandles), + [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); }); + ChannelInfo channelInfo{channels, setupMemoryChannelDeviceHandles(channels)}; it = comm->channelOutInfos.emplace(recvKey, channelInfo).first; } - smChannels = it->second.smChannelDeviceHandles.get(); + memoryChannels = it->second.memoryChannelDeviceHandles.get(); if ((char*)sendbuff == (char*)recvbuff) { - CUDACHECK(broadcast((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut, + CUDACHECK(broadcast((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, memoryChannels, offsetOut, rank, NRANKS_PER_NODE, root, nRank, 
bytes / sizeof(int), stream));
   } else {
-    CUDACHECK(broadcast<true>((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, smChannels, offsetOut,
+    CUDACHECK(broadcast<true>((int*)sendbuff, (int*)comm->scratchBuff.get(), (int*)recvbuff, memoryChannels, offsetOut,
                               rank, NRANKS_PER_NODE, root, nRank, bytes / sizeof(int), stream));
   }

diff --git a/docs/design/design.md b/docs/design/design.md
index c67e4d62a..eb0b59327 100644
--- a/docs/design/design.md
+++ b/docs/design/design.md
@@ -33,17 +33,17 @@ __global__ void gpuKernel() {
 ```
 MSCCL++ also provides efficient synchronization methods, `signal()`, `flush()`, and `wait()`. We will discuss these methods in the following sections.
-#### SmChannel & ProxyChannel
-MSCCL++ delivers two types of channels, **ProxyChannel** and **SmChannel**. `ProxyChannel` provides (R)DMA-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy (hence the name `ProxyChannel`), which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, ProxyChannel requires only a single GPU thread to call its methods. See all `ProxyChannel` methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/proxy_channel_device.hpp).
+#### MemoryChannel & PortChannel
+MSCCL++ delivers two types of channels, **PortChannel** and **MemoryChannel**. `PortChannel` provides port-mapping-based data copy and synchronization methods. When called, these methods send/receive a signal to/from a host-side proxy, which will trigger (R)DMA (such as `cudaMemcpy*` or `ibv_post_send`) or issue synchronization methods (such as `cudaStreamSynchronize` or `ibv_poll_cq`). Since the key functionalities are run by the proxy, PortChannel requires only a single GPU thread to call its methods. See all `PortChannel` methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/port_channel_device.hpp).
-On the other hand, `SmChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Comparing against ProxyChannel, SmChannel is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all SmChannel methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/sm_channel_device.hpp).
+On the other hand, `MemoryChannel` provides memory-mapping-based copy and synchronization methods. When called, these methods will directly use GPU threads to read/write from/to the remote GPU's memory space. Compared with PortChannel, MemoryChannel is especially performant for low-latency scenarios, while it may need many GPU threads to call copying methods at the same time to achieve high copying bandwidth. See all MemoryChannel methods from [here](https://github.com/microsoft/mscclpp/blob/main/include/mscclpp/memory_channel_device.hpp).
 ### Fifo & Trigger
 One of the key features of MSCCL++ is to offload the communication logic from the GPU to the CPU. To achieve this, MSCCL++ introduces the concepts of `Fifo` and `Trigger`. A Fifo is a circular buffer that is shared between the GPU and the CPU; it is used to store `Trigger`s. A `Trigger` is a signal sent from the GPU to the CPU to notify the CPU that there are commands in the Fifo that need to be processed. The CPU then processes the commands in the Fifo and sends a signal back to the GPU to notify it that the commands have been processed. The implementation details of Fifo and Trigger can be found in the following sections.
 ### ProxyService
-Proxy service is a persistent service that resides in the CPU side. It functions as a polling service that receives the message `Trigger` from the GPU side and then transfers data according to the command. When we use `ProxyChannel` for communication, a `Trigger` is sent from the GPU side to the `ProxyService`. Then `ProxyService` will invoke `cudaMemcpy*` or `IB verbs` to transfer data to the targe device.
+The proxy service is a persistent service that resides on the CPU side. It functions as a polling service that receives `Trigger` messages from the GPU side and then transfers data according to the command. When we use `PortChannel` for communication, a `Trigger` is sent from the GPU side to the `ProxyService`. Then `ProxyService` invokes `cudaMemcpy*` or IB verbs to transfer data to the target device.
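For intuition, the following is a minimal sketch of this trigger/FIFO flow. All names here (`TriggerSketch`, `FifoSketch`, `postTrigger`, `proxyLoopSketch`) and the field layout are invented for illustration; the library's actual `Fifo`/`Trigger` types pack a command into a compact entry and handle wrap-around and flushing more carefully.

```cpp
// GPU-to-CPU command flow, simplified. A trigger describes one copy command.
struct TriggerSketch {
  uint64_t srcOffset;  // read offset in the source buffer
  uint64_t dstOffset;  // write offset in the destination buffer
  uint64_t size;       // bytes to transfer
  uint64_t seq;        // nonzero while the slot holds a pending command
};

// Ring buffer allocated in host-pinned memory, visible to both GPU and CPU.
struct FifoSketch {
  TriggerSketch* slots;  // `capacity` entries, zero-initialized
  uint64_t capacity;
};

__device__ unsigned long long gTail;  // next slot to produce into

// Device side: a single GPU thread posts a command and returns immediately.
__device__ void postTrigger(FifoSketch fifo, TriggerSketch trig) {
  unsigned long long tail = atomicAdd(&gTail, 1);
  volatile TriggerSketch* slot = &fifo.slots[tail % fifo.capacity];
  slot->srcOffset = trig.srcOffset;
  slot->dstOffset = trig.dstOffset;
  slot->size = trig.size;
  __threadfence_system();  // make the payload visible to the CPU first
  slot->seq = tail + 1;    // publish last; nonzero marks the slot full
}

// Host side: the proxy thread busy-polls the FIFO and executes each command.
void proxyLoopSketch(FifoSketch fifo, char* src, char* dst, cudaStream_t stream) {
  for (uint64_t head = 0;; ++head) {
    volatile TriggerSketch* slot = &fifo.slots[head % fifo.capacity];
    while (slot->seq == 0) {  // busy poll, as the ProxyService does
    }
    cudaMemcpyAsync(dst + slot->dstOffset, src + slot->srcOffset, slot->size,
                    cudaMemcpyDeviceToDevice, stream);
    cudaStreamSynchronize(stream);  // complete the transfer ("flush" semantics)
    slot->seq = 0;                  // mark the slot reusable
  }
}
```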
 ## Implementation
@@ -60,18 +60,18 @@ MSCCL++ offers one-sided communication methods directly callable from a GPU kernel
 This operation is executed within a kernel launched with a single block.
 ```cpp
 // Running on rank 0
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
-  smChannel[0].put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memoryChannel) {
+  memoryChannel[0].put(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x);
   __syncthreads();
   if (threadIdx.x == 0) {
-    smChannel[0].signal();
+    memoryChannel[0].signal();
   }
 }
 // Running on rank 1
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memoryChannel) {
   if (threadIdx.x == 0) {
-    smChannel[0].wait();
+    memoryChannel[0].wait();
   }
   __syncthreads();
   // Data is ready to use
@@ -81,14 +81,14 @@ __device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChannel) {
 Similar to the LL protocol offered by NCCL, MSCCL++ introduces a `Packet` structure designed to facilitate the transfer of both data and flags within a single instruction, proving particularly beneficial for applications where latency is a critical concern. The following code shows the basic usage of the `Packet` structure. The flag should be the same on the sender and receiver sides.
 ```cpp
 // Running on rank 0
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChans, int flag) {
-  smChans[0].putPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memChans, int flag) {
+  memChans[0].putPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
                          /*flag=*/ flag);
 }
 // Running on rank 1
-__device__ void gpuKernel(mscclpp::SmChannelDeviceHandle* smChans, int flag) {
-  smChans[0].getPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
+__device__ void gpuKernel(mscclpp::MemoryChannelDeviceHandle* memChans, int flag) {
+  memChans[0].getPackets(/*dstOffset=*/ 0, /*srcOffset=*/ 0, /*size=*/ 1024, /*threadId*/ threadIdx.x, /*numThreads*/ blockDim.x,
                          /*flag=*/ flag);
   // Data is ready to use
 }
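To make the `Packet` idea concrete, here is a simplified sketch of such a packet. The type below is invented for illustration and is not the library's actual packet definition; the key point is that two data words and two copies of the flag travel in one 16-byte volatile store, so polling the flags is enough to know the data has arrived.

```cpp
struct PacketSketch {
  uint32_t data1, flag1, data2, flag2;

  // Sender: one vectorized volatile store keeps data and flags inseparable.
  __device__ void write(uint32_t d1, uint32_t d2, uint32_t flag) {
    asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" ::"l"(this), "r"(d1), "r"(flag),
                 "r"(d2), "r"(flag));
  }

  // Receiver: poll until both flag copies match, then the data words are valid.
  __device__ uint2 read(uint32_t flag) {
    uint32_t d1, f1, d2, f2;
    do {
      asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];"
                   : "=r"(d1), "=r"(f1), "=r"(d2), "=r"(f2)
                   : "l"(this));
    } while (f1 != flag || f2 != flag);
    return make_uint2(d1, d2);
  }
};
```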
@@ -117,11 +117,11 @@ In this section, we will discuss several use cases that demonstrate the capabilities of MSCCL++.
 MSCCL++ enables the offloading of communication logic from the GPU to the CPU, facilitating the overlapping of communication and computation processes. The code snippet provided illustrates this overlapping technique. In the depicted scenario, the GPU emits a signal to the CPU indicating readiness for data transfer. Subsequently, while the GPU continues to execute computation tasks, the CPU initiates the data transfer to the designated target device.
 ```cpp
-__device__ void gpuKernel(mscclpp::ProxyChannelDeviceHandle* proxyChannel) {
+__device__ void gpuKernel(mscclpp::PortChannelDeviceHandle* portChannel) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   // Send a trigger to the CPU
   if (tid == 0) {
-    proxyChannel[0].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
+    portChannel[0].putWithSignal(/*dstOffset*/ 0, /*srcOffset*/ 0, /*size*/ 1024);
   }
   // Continue computation
   matrixMul()
@@ -138,18 +138,18 @@ Traditional communication libraries enforce a separation between communication and computation.
 MSCCL++ offers a low-level communication API, allowing users to design customized collective communication algorithms. The following code demonstrates how to implement a customized All2All algorithm using MSCCL++.
 ```cpp
 using DeviceHandle = mscclpp::DeviceHandle<mscclpp::ProxyChannel>;
-__device__ void localAlltoall(DeviceHandle* proxyChans, int rank,
+__device__ void localAlltoall(DeviceHandle* portChans, int rank,
                               int nRanksPerNode, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   for (int i = 1; i < nRanksPerNode; i++) {
-    DeviceHandle proxyChan = proxyChans[blockIdx.x];
+    DeviceHandle portChan = portChans[blockIdx.x];
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
-      proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+      portChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
                                      nElements * sizeof(int));
     }
     // wait for the data from GPU (rank-i) % nranksPerNode to arrive
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank - i + nRanksPerNode) % nRanksPerNode) {
-      proxyChan.wait();
+      portChan.wait();
     }
     deviceSyncer.sync(nRanksPerNode - 1);
   }

diff --git a/docs/design/mscclpp-dsl.md b/docs/design/mscclpp-dsl.md
index 9b34b29f0..9b6955e81 100644
--- a/docs/design/mscclpp-dsl.md
+++ b/docs/design/mscclpp-dsl.md
@@ -72,11 +72,11 @@ The operation can only be applied to the chunks.
 We provide a set of communication operations.
 ***Please notice***: MSCCLPPLang only provides one-sided communication operations. The user needs to make sure that the data is ready to be sent or received before calling the communication operations. Also, we provide `wait/signal` operations to synchronize the communication across GPUs.
 #### Channel
-A channel is a communication channel between two GPUs. It is used to send and receive data between GPUs. We supports three types of channel: `ChannelType.sm`, `ChannelType.proxy` and `ChannelType.nvls`.
+A channel is a communication channel between two GPUs. It is used to send and receive data between GPUs. We support three types of channels: `ChannelType.memory`, `ChannelType.port` and `ChannelType.nvls`.
-`ChannelType.sm` is used for communication between GPUs on the same node. This channel uses GPU processors to transfer data.
+`ChannelType.memory` is used for communication between GPUs on the same node. This channel uses GPU processors to transfer data.
-`ChannelType.proxy` is used for communication between GPUs, whether they are on different nodes or the same node. This channel will offload the data transfer to CPU processors, which can provide better throughput compared to `ChannelType.sm`. However, this comes at the cost of higher latency compared to `ChannelType.sm`.
+`ChannelType.port` is used for communication between GPUs, whether they are on different nodes or the same node. This channel offloads the data transfer to CPU processors, which can provide better throughput compared to `ChannelType.memory`. However, this comes at the cost of higher latency compared to `ChannelType.memory`.
 `ChannelType.nvls` is used for communication between GPUs on the same node. This feature offloads the data processing task to the switch, requiring specific hardware support. Refer to the [NVIDIA documentation](https://www.nvidia.com/en-us/data-center/nvlink/) for more details.
@@ -85,7 +85,7 @@ We can assign operations to a thread block. The thread block is a group of threads.
 #### Instance
 An instance is a parallel execution of the program. For example, if a collective algorithm is designed to run on `n` chunks with `m` thread blocks, setting the instance to 2 will run the algorithm on `2n` chunks with `2m` thread blocks. Several replication policies are supported, including `duplicated` and `interleaved`.
-- `duplicated`: Each chunk is split into smaller parts based on the number of instances, duplicating the same instructions for all parts. For example, ChunkA is split into ChunkA0 and ChunkA1, while ChunkB is split into ChunkB0 and ChunkB1. Both ChunkA0 and ChunkA1 belong to Instance 0, and both ChunkB0 and ChunkB1 belong to Instance 1. 
+- `duplicated`: Each chunk is split into smaller parts based on the number of instances, duplicating the same instructions for all parts. For example, ChunkA is split into ChunkA0 and ChunkA1, while ChunkB is split into ChunkB0 and ChunkB1. Both ChunkA0 and ChunkA1 belong to Instance 0, and both ChunkB0 and ChunkB1 belong to Instance 1.
 - `interleaved`: Assign chunks to instances in an interleaved manner. For example, ChunkA and ChunkB are split into ChunkA0, ChunkA1, ChunkB0, and ChunkB1. ChunkA0 and ChunkB0 belong to Instance 0, while ChunkA1 and ChunkB1 belong to Instance 1.
 #### Instruction Fusion

diff --git a/docs/getting-started/tutorials/index.rst b/docs/getting-started/tutorials/index.rst
index 7ee91b194..c43207edd 100644
--- a/docs/getting-started/tutorials/index.rst
+++ b/docs/getting-started/tutorials/index.rst
@@ -9,8 +9,8 @@ This tutorial section provides a step-by-step guide to help you get started with MSCCL++.
    :hidden:
    initialization
-   proxy-channel
-   sm-channel
+   port-channel
+   memory-channel
    packet-api
    customized-proxy-service
    python-api

diff --git a/docs/getting-started/tutorials/initialization.md b/docs/getting-started/tutorials/initialization.md
index b1d4c1d26..4b9327fa6 100644
--- a/docs/getting-started/tutorials/initialization.md
+++ b/docs/getting-started/tutorials/initialization.md
@@ -13,7 +13,7 @@ We will setup a mesh topology with eight GPUs. Each GPU will be connected to its
 ```cpp
 #include
 #include
-#include <mscclpp/proxy_channel.hpp>
+#include <mscclpp/port_channel.hpp>
 #include
 #include
@@ -21,7 +21,7 @@ We will setup a mesh topology with eight GPUs. Each GPU will be connected to its
 template <typename T>
 using DeviceHandle = mscclpp::DeviceHandle<T>;
-__constant__ DeviceHandle<mscclpp::ProxyChannel> constProxyChans[8];
+__constant__ DeviceHandle<mscclpp::PortChannel> constPortChans[8];
 void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
   std::string ip_port = "10.0.0.4:50000";
@@ -55,17 +55,17 @@ void setupMeshTopology(int rank, int worldsize, void* data, size_t dataSize) {
   comm.setup();
-  std::vector<DeviceHandle<mscclpp::ProxyChannel>> proxyChannels;
+  std::vector<DeviceHandle<mscclpp::PortChannel>> portChannels;
   for (size_t i = 0; i < semaphoreIds.size(); ++i) {
-    proxyChannels.push_back(mscclpp::deviceHandle(mscclpp::ProxyChannel(
-        proxyService.proxyChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
+    portChannels.push_back(mscclpp::deviceHandle(mscclpp::PortChannel(
+        proxyService.portChannel(semaphoreIds[i]), proxyService.addMemory(remoteMemories[i].get()),
         proxyService.addMemory(localMemories[i]))));
   }
-  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle<mscclpp::ProxyChannel>)) {
+  if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle<mscclpp::PortChannel>)) {
     std::runtime_error("unexpected error");
   }
-  CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
-                               sizeof(DeviceHandle<mscclpp::ProxyChannel>) * proxyChannels.size()));
+  CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(),
+                               sizeof(DeviceHandle<mscclpp::PortChannel>) * portChannels.size()));
 }
 ```

diff --git a/docs/getting-started/tutorials/memory-channel.md b/docs/getting-started/tutorials/memory-channel.md
new file mode 100644
index 000000000..d6f78e32e
--- /dev/null
+++ b/docs/getting-started/tutorials/memory-channel.md
@@ -0,0 +1,3 @@
+# Using MemoryChannel for Intra-Node Communication
+
+TBU

diff --git a/docs/getting-started/tutorials/port-channel.md b/docs/getting-started/tutorials/port-channel.md
new file mode 100644
index 000000000..a4db69854
--- /dev/null
+++ b/docs/getting-started/tutorials/port-channel.md
@@ -0,0 +1,3 @@
+# Offload communication to CPU with PortChannel
+
+TBU

diff --git a/docs/getting-started/tutorials/proxy-channel.md b/docs/getting-started/tutorials/proxy-channel.md
deleted file mode 100644
index fec5c4cc0..000000000
--- a/docs/getting-started/tutorials/proxy-channel.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Offload commnunication to CPU with ProxyChannel
-
-TBU

diff --git a/docs/getting-started/tutorials/python-api.md b/docs/getting-started/tutorials/python-api.md
index c2f26c23f..cac195c93 100644
--- a/docs/getting-started/tutorials/python-api.md
+++ b/docs/getting-started/tutorials/python-api.md
@@ -35,7 +35,7 @@ if __name__ == "__main__":
     nelems = 1024
     memory = GpuBuffer(nelems, dtype=cp.int32)
proxy_service = ProxyService() - simple_channels = group.make_proxy_channels(proxy_service, memory, connections) + simple_channels = group.make_port_channels(proxy_service, memory, connections) proxy_service.start_proxy() mscclpp_group.barrier() launch_kernel(mscclpp_group.my_rank, mscclpp_group.nranks, simple_channels, memory) @@ -48,7 +48,7 @@ We provide some Python utils to help you launch kernel via python. Here is a exa ```python from mscclpp.utils import KernelBuilder, pack -def launch_kernel(my_rank: int, nranks: int, simple_channels: List[ProxyChannel], memory: cp.ndarray): +def launch_kernel(my_rank: int, nranks: int, simple_channels: List[PortChannel], memory: cp.ndarray): file_dir = os.path.dirname(os.path.abspath(__file__)) kernel = KernelBuilder(file="test.cu", kernel_name="test", file_dir=file_dir).get_compiled_kernel() params = b"" @@ -74,11 +74,11 @@ def launch_kernel(my_rank: int, nranks: int, simple_channels: List[ProxyChannel] The test kernel is defined in `test.cu` as follows: ```cuda #include -#include <mscclpp/proxy_channel_device.hpp> +#include <mscclpp/port_channel_device.hpp> // be careful about using channels[my_rank] as it is invalid and is there just for simplicity of indexing extern "C" __global__ void __launch_bounds__(1024, 1) - proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks, + port_channel(mscclpp::PortChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements) { int tid = threadIdx.x; int nthreads = blockDim.x; diff --git a/docs/getting-started/tutorials/sm-channel.md b/docs/getting-started/tutorials/sm-channel.md deleted file mode 100644 index 191e47b36..000000000 --- a/docs/getting-started/tutorials/sm-channel.md +++ /dev/null @@ -1,3 +0,0 @@ -# Using SmChannel for Intra-Node Communication - -TBU diff --git a/include/mscclpp/memory_channel.hpp b/include/mscclpp/memory_channel.hpp new file mode 100644 index 000000000..533907b9c --- /dev/null +++ b/include/mscclpp/memory_channel.hpp @@ -0,0 +1,50 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_MEMORY_CHANNEL_HPP_ +#define MSCCLPP_MEMORY_CHANNEL_HPP_ + +#include + +#include "core.hpp" +#include "memory_channel_device.hpp" +#include "semaphore.hpp" + +namespace mscclpp { + +/// Channel for accessing peer memory directly from GPU threads. +struct MemoryChannel { + private: + std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore_; + RegisteredMemory dst_; + void* src_; + void* getPacketBuffer_; + + public: + /// Constructor. + MemoryChannel() = default; + + /// Constructor. + /// @param semaphore The semaphore used to synchronize the communication. + /// @param dst Registered memory of the destination. + /// @param src The source memory address. + /// @param getPacketBuffer The optional buffer used for @ref getPackets(). + MemoryChannel(std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore, RegisteredMemory dst, void* src, + void* getPacketBuffer = nullptr); + + /// Device-side handle for @ref MemoryChannel. + using DeviceHandle = MemoryChannelDeviceHandle; + + /// Returns the device-side handle. + /// + /// User should make sure the MemoryChannel is not released when using the returned handle. + /// + DeviceHandle deviceHandle() const; +}; + +/// @deprecated Use @ref MemoryChannel instead.
+[[deprecated("Use MemoryChannel instead.")]] typedef MemoryChannel SmChannel; + +} // namespace mscclpp + +#endif // MSCCLPP_MEMORY_CHANNEL_HPP_ diff --git a/include/mscclpp/sm_channel_device.hpp b/include/mscclpp/memory_channel_device.hpp similarity index 97% rename from include/mscclpp/sm_channel_device.hpp rename to include/mscclpp/memory_channel_device.hpp index e49a431b7..d49eb4def 100644 --- a/include/mscclpp/sm_channel_device.hpp +++ b/include/mscclpp/memory_channel_device.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCLPP_SM_CHANNEL_DEVICE_HPP_ -#define MSCCLPP_SM_CHANNEL_DEVICE_HPP_ +#ifndef MSCCLPP_MEMORY_CHANNEL_DEVICE_HPP_ +#define MSCCLPP_MEMORY_CHANNEL_DEVICE_HPP_ #include "semaphore_device.hpp" #if defined(MSCCLPP_DEVICE_COMPILE) @@ -42,9 +42,9 @@ MSCCLPP_DEVICE_INLINE void copy(T* dst, T* src, uint64_t numElems, uint32_t thre #endif // defined(MSCCLPP_DEVICE_COMPILE) -/// Channel for accessing peer memory directly from SM. -struct SmChannelDeviceHandle { - SmDevice2DeviceSemaphoreDeviceHandle semaphore_; +/// Device-side handle of a MemoryChannel. +struct MemoryChannelDeviceHandle { + MemoryDevice2DeviceSemaphoreDeviceHandle semaphore_; void* src_; void* dst_; void* getPacketBuffer_; @@ -276,6 +276,9 @@ struct SmChannelDeviceHandle { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; +/// @deprecated Use @ref MemoryChannelDeviceHandle instead. +[[deprecated("Use MemoryChannelDeviceHandle instead.")]] typedef MemoryChannelDeviceHandle SmChannelDeviceHandle; + } // namespace mscclpp -#endif // MSCCLPP_SM_CHANNEL_DEVICE_HPP_ +#endif // MSCCLPP_MEMORY_CHANNEL_DEVICE_HPP_ diff --git a/include/mscclpp/proxy_channel.hpp b/include/mscclpp/port_channel.hpp similarity index 59% rename from include/mscclpp/proxy_channel.hpp rename to include/mscclpp/port_channel.hpp index 4f2978f75..3d5a62843 100644 --- a/include/mscclpp/proxy_channel.hpp +++ b/include/mscclpp/port_channel.hpp @@ -1,18 +1,18 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCLPP_PROXY_CHANNEL_HPP_ -#define MSCCLPP_PROXY_CHANNEL_HPP_ +#ifndef MSCCLPP_PORT_CHANNEL_HPP_ +#define MSCCLPP_PORT_CHANNEL_HPP_ #include "core.hpp" +#include "port_channel_device.hpp" #include "proxy.hpp" -#include "proxy_channel_device.hpp" #include "semaphore.hpp" namespace mscclpp { -struct BaseProxyChannel; -struct ProxyChannel; +struct BasePortChannel; +struct PortChannel; /// Base class for proxy services. Proxy services are used to proxy data between devices. class BaseProxyService { @@ -49,17 +49,17 @@ class ProxyService : public BaseProxyService { /// @return The semaphore. std::shared_ptr semaphore(SemaphoreId id) const; - /// Get a base proxy channel by semaphore ID. + /// Get a base port channel by semaphore ID. /// @param id The ID of the semaphore. - /// @return The base proxy channel. - BaseProxyChannel baseProxyChannel(SemaphoreId id); + /// @return The base port channel. + BasePortChannel basePortChannel(SemaphoreId id); - /// Get a proxy channel by semaphore ID and memory regions. + /// Get a port channel by semaphore ID and memory regions. /// @param id The ID of the semaphore. /// @param dst The destination memory region. /// @param src The source memory region. - /// @return The proxy channel. - ProxyChannel proxyChannel(SemaphoreId id, MemoryId dst, MemoryId src); + /// @return The port channel. + PortChannel portChannel(SemaphoreId id, MemoryId dst, MemoryId src); /// Start the proxy service. 
void startProxy(); @@ -79,8 +79,8 @@ class ProxyService : public BaseProxyService { ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw); }; -/// Proxy channel. -struct BaseProxyChannel { +/// Port channel without specifying source/destination memory regions. +struct BasePortChannel { protected: SemaphoreId semaphoreId_; @@ -89,34 +89,34 @@ struct BaseProxyChannel { std::shared_ptr proxy_; public: - BaseProxyChannel() = default; + BasePortChannel() = default; - BaseProxyChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, - std::shared_ptr proxy); + BasePortChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, + std::shared_ptr proxy); - BaseProxyChannel(const BaseProxyChannel& other) = default; + BasePortChannel(const BasePortChannel& other) = default; - BaseProxyChannel& operator=(BaseProxyChannel& other) = default; + BasePortChannel& operator=(BasePortChannel& other) = default; - /// Device-side handle for @ref BaseProxyChannel. - using DeviceHandle = BaseProxyChannelDeviceHandle; + /// Device-side handle for @ref BasePortChannel. + using DeviceHandle = BasePortChannelDeviceHandle; /// Returns the device-side handle. /// - /// User should make sure the BaseProxyChannel is not released when using the returned handle. + /// User should make sure the BasePortChannel is not released when using the returned handle. /// DeviceHandle deviceHandle() const; }; -/// A common form of proxy channel with a single destination and source memory region. -struct ProxyChannel : public BaseProxyChannel { +/// Port channel. +struct PortChannel : public BasePortChannel { private: MemoryId dst_; MemoryId src_; public: /// Default constructor. - ProxyChannel() = default; + PortChannel() = default; /// Constructor. /// @param semaphoreId The ID of the semaphore. @@ -124,25 +124,31 @@ struct ProxyChannel : public BaseProxyChannel { /// @param proxy The proxy. /// @param dst The destination memory region. /// @param src The source memory region. - ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, std::shared_ptr proxy, - MemoryId dst, MemoryId src); + PortChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, std::shared_ptr proxy, + MemoryId dst, MemoryId src); /// Copy constructor. - ProxyChannel(const ProxyChannel& other) = default; + PortChannel(const PortChannel& other) = default; /// Assignment operator. - ProxyChannel& operator=(ProxyChannel& other) = default; + PortChannel& operator=(PortChannel& other) = default; - /// Device-side handle for @ref ProxyChannel. - using DeviceHandle = ProxyChannelDeviceHandle; + /// Device-side handle for @ref PortChannel. + using DeviceHandle = PortChannelDeviceHandle; /// Returns the device-side handle. /// - /// User should make sure the ProxyChannel is not released when using the returned handle. + /// User should make sure the PortChannel is not released when using the returned handle. /// DeviceHandle deviceHandle() const; }; +/// @deprecated Use @ref BasePortChannel instead. +[[deprecated("Use BasePortChannel instead.")]] typedef BasePortChannel BaseProxyChannel; + +/// @deprecated Use @ref PortChannel instead. 
+[[deprecated("Use PortChannel instead.")]] typedef PortChannel ProxyChannel; + } // namespace mscclpp -#endif // MSCCLPP_PROXY_CHANNEL_HPP_ +#endif // MSCCLPP_PORT_CHANNEL_HPP_ diff --git a/include/mscclpp/proxy_channel_device.hpp b/include/mscclpp/port_channel_device.hpp similarity index 87% rename from include/mscclpp/proxy_channel_device.hpp rename to include/mscclpp/port_channel_device.hpp index 38237978a..bd9cefe78 100644 --- a/include/mscclpp/proxy_channel_device.hpp +++ b/include/mscclpp/port_channel_device.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef MSCCLPP_PROXY_CHANNEL_DEVICE_HPP_ -#define MSCCLPP_PROXY_CHANNEL_DEVICE_HPP_ +#ifndef MSCCLPP_PORT_CHANNEL_DEVICE_HPP_ +#define MSCCLPP_PORT_CHANNEL_DEVICE_HPP_ #include "fifo_device.hpp" #include "semaphore_device.hpp" @@ -83,7 +83,7 @@ union ChannelTrigger { #endif // defined(MSCCLPP_DEVICE_COMPILE) }; -struct BaseProxyChannelDeviceHandle { +struct BasePortChannelDeviceHandle { SemaphoreId semaphoreId_; Host2DeviceSemaphoreDeviceHandle semaphore_; @@ -92,11 +92,11 @@ struct BaseProxyChannelDeviceHandle { // can produce for and the sole proxy thread consumes it. FifoDeviceHandle fifo_; - MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle() {} + MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle() {} - MSCCLPP_HOST_DEVICE_INLINE BaseProxyChannelDeviceHandle(SemaphoreId semaphoreId, - Host2DeviceSemaphoreDeviceHandle semaphore, - FifoDeviceHandle fifo) + MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, + FifoDeviceHandle fifo) : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {} #if defined(MSCCLPP_DEVICE_COMPILE) @@ -171,27 +171,27 @@ struct BaseProxyChannelDeviceHandle { fifo_.sync(curFifoHead); } - /// Check if the proxy channel has been signaled. - /// @return true if the proxy channel has been signaled. + /// Check if the port channel has been signaled. + /// @return true if the port channel has been signaled. MSCCLPP_DEVICE_INLINE bool poll() { return semaphore_.poll(); } - /// Wait for the proxy channel to be signaled. + /// Wait for the port channel to be signaled. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. MSCCLPP_DEVICE_INLINE void wait(int64_t maxSpinCount = 10000000) { semaphore_.wait(maxSpinCount); } #endif // defined(MSCCLPP_DEVICE_COMPILE) }; -struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { +struct PortChannelDeviceHandle : public BasePortChannelDeviceHandle { MemoryId dst_; MemoryId src_; - MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(){}; + MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(){}; - MSCCLPP_HOST_DEVICE_INLINE ProxyChannelDeviceHandle(SemaphoreId semaphoreId, - Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, - MemoryId dst, MemoryId src) - : BaseProxyChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} + MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(SemaphoreId semaphoreId, + Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, + MemoryId dst, MemoryId src) + : BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} #if defined(MSCCLPP_DEVICE_COMPILE) /// Push a @ref TriggerData to the FIFO. @@ -199,7 +199,7 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { /// @param srcOffset The offset into the source memory region. 
/// @param size The size of the transfer. MSCCLPP_DEVICE_INLINE void put(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - BaseProxyChannelDeviceHandle::put(dst_, dstOffset, src_, srcOffset, size); + BasePortChannelDeviceHandle::put(dst_, dstOffset, src_, srcOffset, size); } /// Push a @ref TriggerData to the FIFO. @@ -212,7 +212,7 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { /// @param srcOffset The offset into the source memory region. /// @param size The size of the transfer. MSCCLPP_DEVICE_INLINE void putWithSignal(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - BaseProxyChannelDeviceHandle::putWithSignal(dst_, dstOffset, src_, srcOffset, size); + BasePortChannelDeviceHandle::putWithSignal(dst_, dstOffset, src_, srcOffset, size); } /// Push a @ref TriggerData and a @ref TriggerFlag at the same time to the FIFO. @@ -225,7 +225,7 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { /// @param srcOffset The offset into the source memory region. /// @param size The size of the transfer. MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(uint64_t dstOffset, uint64_t srcOffset, uint64_t size) { - BaseProxyChannelDeviceHandle::putWithSignalAndFlush(dst_, dstOffset, src_, srcOffset, size); + BasePortChannelDeviceHandle::putWithSignalAndFlush(dst_, dstOffset, src_, srcOffset, size); } /// Push a @ref TriggerData, a @ref TriggerFlag, and a @ref TriggerSync at the same time to the FIFO. @@ -239,4 +239,4 @@ struct ProxyChannelDeviceHandle : public BaseProxyChannelDeviceHandle { } // namespace mscclpp -#endif // MSCCLPP_PROXY_CHANNEL_DEVICE_HPP_ +#endif // MSCCLPP_PORT_CHANNEL_DEVICE_HPP_ diff --git a/include/mscclpp/semaphore.hpp b/include/mscclpp/semaphore.hpp index b28373bdc..55dbbe740 100644 --- a/include/mscclpp/semaphore.hpp +++ b/include/mscclpp/semaphore.hpp @@ -116,19 +116,19 @@ class Host2HostSemaphore : public BaseSemaphore connection_; }; -/// A semaphore for sending signals from the local device to a peer device via SM. -class SmDevice2DeviceSemaphore : public BaseSemaphore { +/// A semaphore for sending signals from the local device to a peer device via a GPU thread. +class MemoryDevice2DeviceSemaphore : public BaseSemaphore { public: /// Constructor. /// @param communicator The communicator. /// @param connection The connection associated with this semaphore. - SmDevice2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection); + MemoryDevice2DeviceSemaphore(Communicator& communicator, std::shared_ptr connection); /// Constructor. - SmDevice2DeviceSemaphore() = delete; + MemoryDevice2DeviceSemaphore() = delete; - /// Device-side handle for @ref SmDevice2DeviceSemaphore. - using DeviceHandle = SmDevice2DeviceSemaphoreDeviceHandle; + /// Device-side handle for @ref MemoryDevice2DeviceSemaphore. + using DeviceHandle = MemoryDevice2DeviceSemaphoreDeviceHandle; /// Returns the device-side handle. DeviceHandle deviceHandle() const; @@ -136,6 +136,10 @@ class SmDevice2DeviceSemaphore : public BaseSemaphore - -#include "core.hpp" -#include "semaphore.hpp" -#include "sm_channel_device.hpp" - -namespace mscclpp { - -/// Channel for accessing peer memory directly from SM. -struct SmChannel { - private: - std::shared_ptr semaphore_; - RegisteredMemory dst_; - void* src_; - void* getPacketBuffer_; - - public: - /// Constructor. - SmChannel() = default; - - /// Constructor. - /// @param semaphore The semaphore used to synchronize the communication. - /// @param dst Registered memory of the destination. 
- /// @param src The source memory address. - /// @param getPacketBuffer The optional buffer used for @ref getPackets(). - SmChannel(std::shared_ptr semaphore, RegisteredMemory dst, void* src, - void* getPacketBuffer = nullptr); - - /// Device-side handle for @ref SmChannel. - using DeviceHandle = SmChannelDeviceHandle; - - /// Returns the device-side handle. - /// - /// User should make sure the SmChannel is not released when using the returned handle. - /// - DeviceHandle deviceHandle() const; -}; - -} // namespace mscclpp - -#endif // MSCCLPP_SM_CHANNEL_HPP_ diff --git a/python/examples/allgather_barrier.py b/python/examples/allgather_barrier.py index acc0c2a2f..d6f358045 100644 --- a/python/examples/allgather_barrier.py +++ b/python/examples/allgather_barrier.py @@ -28,7 +28,7 @@ def allgather_test(gpus, instances): c = chunk(n, Buffer.input, 0, 1) for peer in range(gpus): if n != peer: - c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.sm) + c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) else: c.copy(n, Buffer.output, n, sendtb=peer) # explicit barrier @@ -36,13 +36,13 @@ def allgather_test(gpus, instances): r.barrier(tb_list=list(range(gpus))) for peer in range(gpus): if n != peer: - c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.sm) + c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.memory) for n in range(gpus): for peer in range(gpus): c = chunk(n, Buffer.output, peer, 1) if n != peer: - c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.sm) + c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.memory) Json() Check() diff --git a/python/examples/send_recv_packet.py b/python/examples/send_recv_packet.py index f0272344e..4ecb58ddb 100644 --- a/python/examples/send_recv_packet.py +++ b/python/examples/send_recv_packet.py @@ -10,9 +10,9 @@ def send_recv(instances): """ - Send and receive data between two ranks using proxy channels, with LL protocol and double scratch buffer. + Send and receive data between two ranks using port channels, with LL protocol and double scratch buffer. Steps: - 1. Each rank sends a chunk to every other rank's scratch buffer with packet format via proxy channel. + 1. Each rank sends a chunk to every other rank's scratch buffer with packet format via port channel. 2. Wait for the data to be received, then copy it to the output buffer. """ size = 2 @@ -36,7 +36,7 @@ def send_recv(instances): "scratch", 1, sendtb=0, - chan_type=ChannelType.proxy, + chan_type=ChannelType.port, temp_buffer="scratch", temp_buffer_index=0, ) diff --git a/python/examples/send_recv_proxy.py b/python/examples/send_recv_proxy.py index ec6baee99..f9ed2f309 100644 --- a/python/examples/send_recv_proxy.py +++ b/python/examples/send_recv_proxy.py @@ -10,7 +10,7 @@ def send_recv(instances): """ - Send and receive data between two ranks using proxy channels. + Send and receive data between two ranks using port channels. steps: 1. Each rank sends a chunk to the other rank's scratch buffer and signals the other rank that the data has been sent. 2. Wait for the data to be received then copy it to the output buffer. 
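Since the hunks below show only the renamed lines of `send_recv_proxy.py`, here is a hedged reconstruction of the loop bodies that the two docstring steps describe, pieced together from the fragments visible in this patch. `size`, `chunk`, `Buffer`, and `ChannelType` are the names the example itself uses; the exact source chunk indices are assumptions.

```python
# Reconstructed sketch of send_recv()'s body; indices follow the visible
# fragments in this patch and may not match the file exactly.
for r in range(size):
    nghr = 1 - r  # the peer rank in this two-rank example (assumed)
    c = chunk(r, Buffer.input, 0, 1)
    # Step 1: put into the peer's scratch buffer via the port channel,
    # then signal the peer and flush the proxy's FIFO.
    c.put(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port)
    c.signal(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port)
    c.flush(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port)

for r in range(size):
    # Step 2: wait for the peer's data to land in scratch, then copy it out.
    c = chunk(r, "scratch", 1)
    c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.port)
    c.copy(r, Buffer.output, 0, sendtb=0)
```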
@@ -34,14 +34,14 @@ def send_recv(instances): "scratch", 1, sendtb=0, - chan_type=ChannelType.proxy, + chan_type=ChannelType.port, ) - c.signal(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy) - c.flush(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy) + c.signal(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port) + c.flush(nghr, "scratch", 1, sendtb=0, chan_type=ChannelType.port) for r in range(size): c = chunk(r, "scratch", 1) - c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.proxy) + c.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.port) c.copy(r, Buffer.output, 0, sendtb=0) Json() diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 678379ac2..839b921e7 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -1,7 +1,9 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import os as _os +import os +import warnings +from functools import wraps from ._mscclpp import ( Env, @@ -22,9 +24,9 @@ numa, ProxyService, RegisteredMemory, - ProxyChannel, - SmChannel, - SmDevice2DeviceSemaphore, + PortChannel, + MemoryChannel, + MemoryDevice2DeviceSemaphore, TcpBootstrap, Transport, TransportFlags, @@ -39,17 +41,82 @@ npkit, ) -__version__ = version() -if _os.environ.get("MSCCLPP_HOME", None) is None: - _os.environ["MSCCLPP_HOME"] = _os.path.abspath(_os.path.dirname(__file__)) +__all__ = [ + "Communicator", + "Connection", + "connect_nvls_collective", + "EndpointConfig", + "Fifo", + "Host2DeviceSemaphore", + "Host2HostSemaphore", + "numa", + "ProxyService", + "RegisteredMemory", + "PortChannel", + "MemoryChannel", + "MemoryDevice2DeviceSemaphore", + "TcpBootstrap", + "Transport", + "TransportFlags", + "DataType", + "Executor", + "ExecutionPlan", + "PacketType", + "version", + "is_nvls_supported", + "alloc_shared_physical_cuda", + "npkit", + "__version__", + "get_include", + "get_lib", + ### Deprecated ### + "ProxyChannel", + "SmChannel", + "SmDevice2DeviceSemaphore", +] +__version__: str = str(version()) -def get_include(): +if os.environ.get("MSCCLPP_HOME", None) is None: + os.environ["MSCCLPP_HOME"] = os.path.abspath(os.path.dirname(__file__)) + + +def get_include() -> str: """Return the directory that contains the MSCCL++ headers.""" - return _os.path.join(_os.path.dirname(__file__), "include") + return os.path.join(os.path.dirname(__file__), "include") -def get_lib(): +def get_lib() -> str: """Return the directory that contains the MSCCL++ headers.""" - return _os.path.join(_os.path.dirname(__file__), "lib") + return os.path.join(os.path.dirname(__file__), "lib") + + +def deprecated(new_cls): + def decorator(old_cls): + @wraps(old_cls) + def wrapper(*args, **kwargs): + warnings.warn( + f"{old_cls.__name__} is deprecated, use {new_cls.__name__} instead.", + DeprecationWarning, + ) + return new_cls(*args, **kwargs) + + return wrapper + + return decorator + + +@deprecated(PortChannel) +class ProxyChannel(PortChannel): + pass + + +@deprecated(MemoryChannel) +class SmChannel(MemoryChannel): + pass + + +@deprecated(MemoryDevice2DeviceSemaphore) +class SmDevice2DeviceSemaphore(MemoryDevice2DeviceSemaphore): + pass diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index c2726826f..8d2e0f481 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -14,9 +14,9 @@ Host2HostSemaphore, ProxyService, RegisteredMemory, - ProxyChannel, - SmChannel, - SmDevice2DeviceSemaphore, + PortChannel, + MemoryChannel, + MemoryDevice2DeviceSemaphore, TcpBootstrap, Transport, 
TransportFlags, @@ -135,7 +135,7 @@ def register_tensor_with_connections( def make_semaphore( self, connections: dict[int, Connection], - semaphore_type: Type[Host2HostSemaphore] or Type[Host2DeviceSemaphore] or Type[SmDevice2DeviceSemaphore], + semaphore_type: Type[Host2HostSemaphore] or Type[Host2DeviceSemaphore] or Type[MemoryDevice2DeviceSemaphore], ) -> dict[int, Host2HostSemaphore]: semaphores = {} for rank in connections: @@ -143,33 +143,35 @@ def make_semaphore( self.communicator.setup() return semaphores - def make_sm_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, SmChannel]: - semaphores = self.make_semaphore(connections, SmDevice2DeviceSemaphore) + def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]: + semaphores = self.make_semaphore(connections, MemoryDevice2DeviceSemaphore) registered_memories = self.register_tensor_with_connections(tensor, connections) channels = {} tensor_data_ptr = tensor.data_ptr() if is_torch_tensor(tensor) else tensor.data.ptr for rank in connections: - channels[rank] = SmChannel(semaphores[rank], registered_memories[rank], tensor_data_ptr) + channels[rank] = MemoryChannel(semaphores[rank], registered_memories[rank], tensor_data_ptr) return channels - def make_sm_channels_with_scratch( + def make_memory_channels_with_scratch( self, tensor: cp.ndarray, scratchTensor: cp.ndarray, connections: dict[int, Connection], - ) -> dict[int, SmChannel]: - semaphores = self.make_semaphore(connections, SmDevice2DeviceSemaphore) + ) -> dict[int, MemoryChannel]: + semaphores = self.make_semaphore(connections, MemoryDevice2DeviceSemaphore) registered_memories = self.register_tensor_with_connections(scratchTensor, connections) channels = {} tensor_data_ptr = tensor.data_ptr() if is_torch_tensor(tensor) else tensor.data.ptr scratch_data_ptr = scratchTensor.data_ptr() if is_torch_tensor(scratchTensor) else scratchTensor.data.ptr for rank in connections: - channels[rank] = SmChannel(semaphores[rank], registered_memories[rank], tensor_data_ptr, scratch_data_ptr) + channels[rank] = MemoryChannel( + semaphores[rank], registered_memories[rank], tensor_data_ptr, scratch_data_ptr + ) return channels - def make_proxy_channels( + def make_port_channels( self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] - ) -> dict[int, SmChannel]: + ) -> dict[int, PortChannel]: semaphores = self.make_semaphore(connections, Host2DeviceSemaphore) registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -180,18 +182,16 @@ def make_proxy_channels( semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) channels = {} for rank in semaphores: - channels[rank] = proxy_service.proxy_channel( - semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank] - ) + channels[rank] = proxy_service.port_channel(semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank]) return channels - def make_proxy_channels_with_scratch( + def make_port_channels_with_scratch( self, proxy_service: ProxyService, tensor: cp.ndarray, scratchTensor: cp.ndarray, connections: dict[int, Connection], - ) -> dict[int, SmChannel]: + ) -> dict[int, PortChannel]: transport_flags = TransportFlags() for rank in connections: transport_flags |= connections[rank].transport() @@ -218,21 +218,19 @@ def make_proxy_channels_with_scratch( semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) channels = {} for rank in semaphores: - channels[rank] = proxy_service.proxy_channel( - semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank] - ) + channels[rank] = proxy_service.port_channel(semaphore_ids[rank], memory_ids[rank], memory_ids[self.my_rank]) return channels def register_semaphore_with_proxy( self, proxy_service: ProxyService, connections: dict[int, Connection] - ) -> dict[int, SmChannel]: + ) -> dict[int, BasePortChannel]: semaphores = self.make_semaphore(connections, Host2DeviceSemaphore) semaphore_ids = {} for rank in semaphores: semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank]) channels = {} for rank in semaphores: - channels[rank] = proxy_service.base_proxy_channel(semaphore_ids[rank]) + channels[rank] = proxy_service.base_port_channel(semaphore_ids[rank]) return channels def register_memory_with_proxy( diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 90ee22860..48bd57ab1 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -15,8 +15,8 @@ using namespace mscclpp; extern void register_env(nb::module_& m); extern void register_error(nb::module_& m); -extern void register_proxy_channel(nb::module_& m); -extern void register_sm_channel(nb::module_& m); +extern void register_port_channel(nb::module_& m); +extern void register_memory_channel(nb::module_& m); extern void register_fifo(nb::module_& m); extern void register_semaphore(nb::module_& m); extern void register_utils(nb::module_& m); @@ -187,8 +187,8 @@ void register_core(nb::module_& m) { NB_MODULE(_mscclpp, m) { register_env(m); register_error(m); - register_proxy_channel(m); - register_sm_channel(m); + register_port_channel(m); + register_memory_channel(m); register_fifo(m); register_semaphore(m); register_utils(m); diff --git a/python/mscclpp/language/collectives.py b/python/mscclpp/language/collectives.py index 67b735ba9..55fe51880 100644 --- a/python/mscclpp/language/collectives.py +++ b/python/mscclpp/language/collectives.py @@ -6,7 +6,6 @@ class Collective: - def __init__(self, num_ranks, chunk_factor, inplace, num_ranks_per_node=-1, **kwargs): self.num_ranks = num_ranks self.chunk_factor = chunk_factor @@ -36,7 +35,6 @@ def get_buffer_index(self, rank, buffer, index): class AllToAll(Collective): - def __init__(self, num_ranks, chunk_factor, inplace): Collective.__init__(self, num_ranks, chunk_factor, inplace) self.name = "alltoall" @@ -137,7 +135,6 @@ def get_buffer_index(self, rank, buffer, index): class AllReduce(Collective): - def __init__(self, num_ranks, chunk_factor, inplace, num_ranks_per_node=-1, **kwargs): num_chunk_groups = kwargs.get("num_chunk_groups", num_ranks) Collective.__init__( diff --git a/python/mscclpp/language/dag/instruction_dag.py b/python/mscclpp/language/dag/instruction_dag.py index dcc1189ca..6f137a90a 100644 --- a/python/mscclpp/language/dag/instruction_dag.py +++ b/python/mscclpp/language/dag/instruction_dag.py @@ -221,7 +221,7 @@ def add_flush(self, rank, send_ref, recv_ref, tb): next=set(), prev=set(), tb=tb, - channel_type=ChannelType.proxy, + channel_type=ChannelType.port, step=tb_step, ) buffer = send_ref.buffer diff --git a/python/mscclpp/language/dag/optimizer.py b/python/mscclpp/language/dag/optimizer.py index 62fc0f5e8..4cfa638db 100644 --- a/python/mscclpp/language/dag/optimizer.py +++ b/python/mscclpp/language/dag/optimizer.py @@ -19,7 +19,6 @@ class _InstructionOptimizer: - def try_merge_same_instructions( self, op: Op, @@ -128,8 +127,8 @@ def try_fuse_with_put(self, op: Op, next_op: Op, tb: Threadblock, queue: list) - and same_tb(op,
next_op) and same_count(op, next_op) and buf_dst_src_match(op, next_op) - and next_op.channel_type == ChannelType.sm - and (op.channel_type == ChannelType.none or op.channel_type == ChannelType.sm) + and next_op.channel_type == ChannelType.memory + and (op.channel_type == ChannelType.none or op.channel_type == ChannelType.memory) and not circular_dep_after_merge(op, next_op) and all_prevs_visited_after_merge(op, next_op) ): @@ -140,10 +139,10 @@ def try_fuse_with_put(self, op: Op, next_op: Op, tb: Threadblock, queue: list) - op.inst = Instruction.read_reduce_copy_send elif op.inst == Instruction.reduce: op.inst = Instruction.reduce_send - op.channel_type = ChannelType.sm + op.channel_type = ChannelType.memory elif op.inst == Instruction.reduce_packet: op.inst = Instruction.reduce_send_packet - op.channel_type = ChannelType.sm + op.channel_type = ChannelType.memory # Append the destination chunk from next_op op.dsts.append( ( @@ -158,11 +157,11 @@ def try_fuse_with_put(self, op: Op, next_op: Op, tb: Threadblock, queue: list) - return True return False - def try_fuse_instructions_using_proxy_channel( + def try_fuse_instructions_using_port_channel( self, op: Op, next_op: Op, tb: Threadblock, queue: list, expected_next_inst: Instruction ) -> bool: """ - Attempts to fuse operations which using proxy channel. + Attempts to fuse operations which using port channel. :param op: The current operation. :param next_op: The next operation to potentially merge with. :param tb: The thread block containing the operations. @@ -177,7 +176,7 @@ def try_fuse_instructions_using_proxy_channel( and same_buf_dst(op, next_op) and same_buf_src(op, next_op) and same_chan_type(op, next_op) - and op.channel_type == ChannelType.proxy + and op.channel_type == ChannelType.port and not circular_dep_after_merge(op, next_op) and all_prevs_visited_after_merge(op, next_op) ): @@ -229,7 +228,6 @@ def try_remove_op(self, pending_remove_op: Op, condition: bool) -> bool: class DagOptimizer: - def __init__(self, instruction_dag: InstructionDAG): self.optimizer = _InstructionOptimizer() self.dag = instruction_dag @@ -257,7 +255,7 @@ def remove_redundant_signal_wait(self): queue = queue[1:] def fuse_instructions(self): - self._fuse_instructions_using_proxy_channel() + self._fuse_instructions_using_port_channel() self._fuse_same_instructions() self._optimize_rrcs_rs() self._optimize_group_ops() @@ -267,7 +265,7 @@ def fuse_instructions(self): # -> putWithSignal(src, sbuf, si, dst, dbuf, di) # put(src, sbuf, si, dst, dbuf, di) signal(src, sbuf, si, dst, dbuf, di) flush(src, sbuf, si, dst, dbuf, di) # -> putWithSignalAndFlush(src, sbuf, si, dst, dbuf, di) - def _fuse_instructions_using_proxy_channel(self): + def _fuse_instructions_using_port_channel(self): inst_followup_map = { Instruction.put: Instruction.signal, Instruction.put_with_signal: Instruction.flush, @@ -280,7 +278,7 @@ def _fuse_instructions_using_proxy_channel(self): fused = False if op.inst in inst_followup_map: for next_op in op.next: - fused = self.optimizer.try_fuse_instructions_using_proxy_channel( + fused = self.optimizer.try_fuse_instructions_using_port_channel( op, next_op, tb, queue, inst_followup_map[op.inst] ) if fused: diff --git a/python/mscclpp/language/ir.py b/python/mscclpp/language/ir.py index 3b84b5298..4cb12e6da 100644 --- a/python/mscclpp/language/ir.py +++ b/python/mscclpp/language/ir.py @@ -286,7 +286,7 @@ def to_json(self, op: Op, tb_channel_dict: dict) -> _JsonInstruction: class _ReduceSendConverter(_OpConverter): def to_json(self, op: Op, 
tb_channel_dict: dict) -> _JsonInstruction: dst_channel_ids = self.get_channel_ids( - op.dsts, tb_channel_dict, op.dst.buffer, op.dsts[0].buffer, ChannelType.sm + op.dsts, tb_channel_dict, op.dst.buffer, op.dsts[0].buffer, ChannelType.memory ) o_buff = {"src": op.dst.buffer.value, "dst": op.dsts[0].buffer.value} srcs = list(map(lambda x: {"buff": x.buffer.value, "off": x.index}, op.srcs)) diff --git a/python/mscclpp/language/program.py b/python/mscclpp/language/program.py index 6cf0d15b1..0657f1fcf 100644 --- a/python/mscclpp/language/program.py +++ b/python/mscclpp/language/program.py @@ -222,7 +222,7 @@ def _get_buffer_index(self, remote_rank, buffer, index): return buffer, self.prog.buffers[remote_rank][buffer].instance_size() return buffer, index - def _put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm, use_packet=False): + def _put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.memory, use_packet=False): self.prog.check_buffer_exists(dst, buffer) assert self.rank != dst, "Cannot put to the same rank" buffer, index = self._get_buffer_index(dst, buffer, index) @@ -237,7 +237,7 @@ def _put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm, self.prog.instr_dag.add_put(self.rank, self, dst_chunkref, sendtb, chan_type) return dst_chunkref - def put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm): + def put(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.memory): return self._put(dst, buffer, index, sendtb, chan_type) def put_packet( self, dst, buffer=None, index=-1, sendtb=-1, - chan_type=ChannelType.sm, + chan_type=ChannelType.memory, temp_buffer=None, temp_buffer_index=-1, ): chunk_ref = self - if chan_type == ChannelType.proxy: - assert temp_buffer is not None, "Need to specify a temporary buffer for proxy channels" + if chan_type == ChannelType.port: + assert temp_buffer is not None, "Need to specify a temporary buffer for port channels" chunk_ref = self._copy( self.rank, temp_buffer, temp_buffer_index, sendtb, trans_from_packet=False, trans_to_packet=True ) return chunk_ref._put(dst, buffer, index, sendtb, chan_type, True) - def get(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.sm): + def get(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.memory): self.prog.check_buffer_exists(src, buffer) sender = src receiver = self.rank @@ -273,7 +273,7 @@ def get(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.sm): # for signal and wait, currently we assume the pair will use the same tb index. In future we need # to infer the tb index from the instruction DAG. A channel is defined as (send_tb, src_buffer, recv_tb, dst_buffer, type). # Then we can use DAG info to reduce the number of channels.
- def signal(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm): + def signal(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.memory): sender = self.rank receiver = dst assert sender != receiver, "Cannot signal to the same rank" @@ -282,9 +282,9 @@ def signal(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.sm dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) self.prog.instr_dag.add_signal(sender, self, dst_chunkref, sendtb, chan_type) - # only proxy channel need to use this function - def flush(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.proxy): - assert chan_type == ChannelType.proxy, "Only proxy channel can use flush" + # only port channel need to use this function + def flush(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.port): + assert chan_type == ChannelType.port, "Only port channel can use flush" sender = self.rank receiver = dst assert sender != receiver, "Cannot flush to the same rank" @@ -293,7 +293,7 @@ def flush(self, dst, buffer=None, index=-1, sendtb=-1, chan_type=ChannelType.pro dst_chunkref = self.prog.get_ref(dst, buffer, index, self.size) self.prog.instr_dag.add_flush(sender, self, dst_chunkref, sendtb) - def wait(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.sm): + def wait(self, src, buffer=None, index=-1, recvtb=-1, chan_type=ChannelType.memory): sender = src receiver = self.rank assert sender != receiver, "Cannot wait on the same rank" @@ -324,7 +324,7 @@ def copy(self, dst, buffer=None, index=-1, sendtb=-1): def copy_packet(self, dst, buffer=None, index=-1, sendtb=-1): return self._copy(dst, buffer, index, sendtb, trans_from_packet=True, trans_to_packet=False) - def _reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm, use_packet=False): + def _reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.memory, use_packet=False): dst = self.rank src = other_chunkref.rank @@ -342,7 +342,7 @@ def _reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm, use_pa return self # Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref - def reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.sm): + def reduce(self, other_chunkref, recvtb=-1, channel_type=ChannelType.memory): return self._reduce(other_chunkref, recvtb, channel_type) # Reduces the chunk(s) referenced by other_chunkref into the chunk(s) referenced by this chunkref diff --git a/python/mscclpp/language/types.py b/python/mscclpp/language/types.py index f6202ccfe..a819bc034 100644 --- a/python/mscclpp/language/types.py +++ b/python/mscclpp/language/types.py @@ -114,11 +114,15 @@ def __hash__(self): class ChannelType(Enum): - proxy = "proxy" - sm = "sm" + port = "port" + memory = "memory" none = "none" nvls = "nvls" + # Deprecated + proxy = "port" + sm = "memory" + def __str__(self): return self.value diff --git a/python/mscclpp/memory_channel_py.cpp b/python/mscclpp/memory_channel_py.cpp new file mode 100644 index 000000000..d4e1c5442 --- /dev/null +++ b/python/mscclpp/memory_channel_py.cpp @@ -0,0 +1,35 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include +#include +#include + +#include + +namespace nb = nanobind; +using namespace mscclpp; + +void register_memory_channel(nb::module_& m) { + nb::class_ memoryChannel(m, "MemoryChannel"); + memoryChannel + .def("__init__", + [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, + RegisteredMemory dst, uintptr_t src) { new (memoryChannel) MemoryChannel(semaphore, dst, (void*)src); }) + .def("__init__", + [](MemoryChannel* memoryChannel, std::shared_ptr semaphore, + RegisteredMemory dst, uintptr_t src, uintptr_t get_packet_buffer) { + new (memoryChannel) MemoryChannel(semaphore, dst, (void*)src, (void*)get_packet_buffer); + }) + .def("device_handle", &MemoryChannel::deviceHandle); + + nb::class_(m, "MemoryChannelDeviceHandle") + .def(nb::init<>()) + .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) + .def_rw("src_", &MemoryChannel::DeviceHandle::src_) + .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) + .def_rw("getPacketBuffer_", &MemoryChannel::DeviceHandle::getPacketBuffer_) + .def_prop_ro("raw", [](const MemoryChannel::DeviceHandle& self) -> nb::bytes { + return nb::bytes(reinterpret_cast(&self), sizeof(self)); + }); +}; diff --git a/python/mscclpp/proxy_channel_py.cpp b/python/mscclpp/port_channel_py.cpp similarity index 54% rename from python/mscclpp/proxy_channel_py.cpp rename to python/mscclpp/port_channel_py.cpp index dfe882228..dd33724e0 100644 --- a/python/mscclpp/proxy_channel_py.cpp +++ b/python/mscclpp/port_channel_py.cpp @@ -5,12 +5,12 @@ #include #include -#include +#include namespace nb = nanobind; using namespace mscclpp; -void register_proxy_channel(nb::module_& m) { +void register_port_channel(nb::module_& m) { nb::class_(m, "BaseProxyService") .def("start_proxy", &BaseProxyService::startProxy) .def("stop_proxy", &BaseProxyService::stopProxy); @@ -23,36 +23,36 @@ void register_proxy_channel(nb::module_& m) { .def("add_semaphore", &ProxyService::addSemaphore, nb::arg("semaphore")) .def("add_memory", &ProxyService::addMemory, nb::arg("memory")) .def("semaphore", &ProxyService::semaphore, nb::arg("id")) - .def("base_proxy_channel", &ProxyService::baseProxyChannel, nb::arg("id")) - .def("proxy_channel", &ProxyService::proxyChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); + .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) + .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); - nb::class_(m, "BaseProxyChannel") + nb::class_(m, "BasePortChannel") .def(nb::init, std::shared_ptr>(), nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy")) - .def("device_handle", &BaseProxyChannel::deviceHandle); + .def("device_handle", &BasePortChannel::deviceHandle); - nb::class_(m, "BaseProxyChannelDeviceHandle") + nb::class_(m, "BasePortChannelDeviceHandle") .def(nb::init<>()) - .def_rw("semaphoreId_", &BaseProxyChannel::DeviceHandle::semaphoreId_) - .def_rw("semaphore_", &BaseProxyChannel::DeviceHandle::semaphore_) - .def_rw("fifo_", &BaseProxyChannel::DeviceHandle::fifo_) - .def_prop_ro("raw", [](const BaseProxyChannel::DeviceHandle& self) -> nb::bytes { + .def_rw("semaphoreId_", &BasePortChannel::DeviceHandle::semaphoreId_) + .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) + .def_rw("fifo_", &BasePortChannel::DeviceHandle::fifo_) + .def_prop_ro("raw", [](const BasePortChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "ProxyChannel") + nb::class_(m, "PortChannel") .def(nb::init, 
std::shared_ptr, MemoryId, MemoryId>(), nb::arg("semaphoreId"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src")) - .def("device_handle", &ProxyChannel::deviceHandle); + .def("device_handle", &PortChannel::deviceHandle); - nb::class_(m, "ProxyChannelDeviceHandle") + nb::class_(m, "PortChannelDeviceHandle") .def(nb::init<>()) - .def_rw("semaphoreId_", &ProxyChannel::DeviceHandle::semaphoreId_) - .def_rw("semaphore_", &ProxyChannel::DeviceHandle::semaphore_) - .def_rw("fifo_", &ProxyChannel::DeviceHandle::fifo_) - .def_rw("src_", &ProxyChannel::DeviceHandle::src_) - .def_rw("dst_", &ProxyChannel::DeviceHandle::dst_) - .def_prop_ro("raw", [](const ProxyChannel::DeviceHandle& self) -> nb::bytes { + .def_rw("semaphoreId_", &PortChannel::DeviceHandle::semaphoreId_) + .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) + .def_rw("fifo_", &PortChannel::DeviceHandle::fifo_) + .def_rw("src_", &PortChannel::DeviceHandle::src_) + .def_rw("dst_", &PortChannel::DeviceHandle::dst_) + .def_prop_ro("raw", [](const PortChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); }; diff --git a/python/mscclpp/semaphore_py.cpp b/python/mscclpp/semaphore_py.cpp index a616a89da..daadeb03b 100644 --- a/python/mscclpp/semaphore_py.cpp +++ b/python/mscclpp/semaphore_py.cpp @@ -33,18 +33,18 @@ void register_semaphore(nb::module_& m) { .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), nb::arg("max_spin_count") = 10000000); - nb::class_ smDevice2DeviceSemaphore(m, "SmDevice2DeviceSemaphore"); - smDevice2DeviceSemaphore + nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); + memoryDevice2DeviceSemaphore .def(nb::init>(), nb::arg("communicator"), nb::arg("connection")) - .def("device_handle", &SmDevice2DeviceSemaphore::deviceHandle); + .def("device_handle", &MemoryDevice2DeviceSemaphore::deviceHandle); - nb::class_(smDevice2DeviceSemaphore, "DeviceHandle") + nb::class_(memoryDevice2DeviceSemaphore, "DeviceHandle") .def(nb::init<>()) - .def_rw("inboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) - .def_rw("outboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::outboundSemaphoreId) - .def_rw("remoteInboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::remoteInboundSemaphoreId) - .def_rw("expectedInboundSemaphoreId", &SmDevice2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) - .def_prop_ro("raw", [](const SmDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { + .def_rw("inboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::inboundSemaphoreId) + .def_rw("outboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::outboundSemaphoreId) + .def_rw("remoteInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::remoteInboundSemaphoreId) + .def_rw("expectedInboundSemaphoreId", &MemoryDevice2DeviceSemaphore::DeviceHandle::expectedInboundSemaphoreId) + .def_prop_ro("raw", [](const MemoryDevice2DeviceSemaphore::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); } diff --git a/python/mscclpp/sm_channel_py.cpp b/python/mscclpp/sm_channel_py.cpp deleted file mode 100644 index 04a51eb8b..000000000 --- a/python/mscclpp/sm_channel_py.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include - -namespace nb = nanobind; -using namespace mscclpp; - -void register_sm_channel(nb::module_& m) { - nb::class_ smChannel(m, "SmChannel"); - smChannel - .def("__init__", - [](SmChannel* smChannel, std::shared_ptr semaphore, RegisteredMemory dst, - uintptr_t src) { new (smChannel) SmChannel(semaphore, dst, (void*)src); }) - .def("__init__", - [](SmChannel* smChannel, std::shared_ptr semaphore, RegisteredMemory dst, - uintptr_t src, uintptr_t get_packet_buffer) { - new (smChannel) SmChannel(semaphore, dst, (void*)src, (void*)get_packet_buffer); - }) - .def("device_handle", &SmChannel::deviceHandle); - - nb::class_(m, "SmChannelDeviceHandle") - .def(nb::init<>()) - .def_rw("semaphore_", &SmChannel::DeviceHandle::semaphore_) - .def_rw("src_", &SmChannel::DeviceHandle::src_) - .def_rw("dst_", &SmChannel::DeviceHandle::dst_) - .def_rw("getPacketBuffer_", &SmChannel::DeviceHandle::getPacketBuffer_) - .def_prop_ro("raw", [](const SmChannel::DeviceHandle& self) -> nb::bytes { - return nb::bytes(reinterpret_cast(&self), sizeof(self)); - }); -}; diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index 4c9851b9a..dbe376a3a 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -8,9 +8,9 @@ #endif #include +#include #include -#include -#include +#include __device__ mscclpp::DeviceSyncer deviceSyncer; __device__ mscclpp::DeviceSyncer allGatherDeviceSyncer; @@ -124,7 +124,7 @@ __forceinline__ __device__ void vectorSum(TYPE* dst, TYPE* src, size_t nElem) { // ------------------------------------------- template -__device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nranks, +__device__ void allreduce1_helper(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, int rank, int nranks, size_t nelems) { const size_t chunkSize = nelems / nranks; if (nranks == 1) return; @@ -140,10 +140,10 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* } __syncthreads(); if (tid < nPeer) { - smChans[tid].relaxedSignal(); + memChans[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - smChans[tid - nPeer].wait(); + memChans[tid - nPeer].wait(); } deviceSyncer.sync(gridDim.x); @@ -155,14 +155,14 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* int4 val; int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - val = smChans[peerIdx].read(indexOffset4 + idx); + val = memChans[peerIdx].read(indexOffset4 + idx); tmp = add_vectors(tmp, val); } if (READ_ONLY == 0) { for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - smChans[peerIdx].write(indexOffset4 + idx, tmp); + memChans[peerIdx].write(indexOffset4 + idx, tmp); } } buff4[indexOffset4 + idx] = tmp; @@ -178,14 +178,14 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - TYPE val = smChans[peerIdx].read(idx); + TYPE val = memChans[peerIdx].read(idx); tmp += val; } if (READ_ONLY == 0) { for (int index = 0; index < nPeer; ++index) { int peerIdx = (index + rank); if (peerIdx >= nPeer) peerIdx -= nPeer; - smChans[peerIdx].write(idx, tmp); + memChans[peerIdx].write(idx, tmp); } } buff[idx] = tmp; @@ -198,10 +198,10 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* } 
__syncthreads(); if (tid < nPeer) { - smChans[tid].relaxedSignal(); + memChans[tid].relaxedSignal(); } if (tid >= nPeer && tid < nPeer * 2) { - smChans[tid - nPeer].wait(); + memChans[tid - nPeer].wait(); } if (READ_ONLY) { @@ -211,17 +211,18 @@ __device__ void allreduce1_helper(mscclpp::SmChannelDeviceHandle* smChans, TYPE* if (peerIdx >= nPeer) peerIdx -= nPeer; const int remoteRank = (peerIdx < rank ? peerIdx : peerIdx + 1); size_t offset = chunkSize * remoteRank * sizeof(TYPE); - smChans[peerIdx].get(offset, chunkSize * sizeof(TYPE), tid, blockDim.x * gridDim.x); + memChans[peerIdx].get(offset, chunkSize * sizeof(TYPE), tid, blockDim.x * gridDim.x); } } } -extern "C" __global__ void __launch_bounds__(1024, 1) allreduce1(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, - int rank, int nranks, size_t nelems, int read_only) { +extern "C" __global__ void __launch_bounds__(1024, 1) + allreduce1(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, int rank, int nranks, size_t nelems, + int read_only) { if (read_only) - allreduce1_helper<1>(smChans, buff, rank, nranks, nelems); + allreduce1_helper<1>(memChans, buff, rank, nranks, nelems); else - allreduce1_helper<0>(smChans, buff, rank, nranks, nelems); + allreduce1_helper<0>(memChans, buff, rank, nranks, nelems); } // ------------------------------------------- @@ -231,7 +232,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) allreduce1(mscclpp::SmChan __device__ uint64_t globalFlag = 1; extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce2(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, TYPE* scratch, void* resultBuff, int rank, + allreduce2(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, TYPE* scratch, void* resultBuff, int rank, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); // This version of allreduce only works for single nodes @@ -246,7 +247,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRank = peerIdx < rank ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; + mscclpp::MemoryChannelDeviceHandle memChan = memChans[peerIdx]; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering size_t scratchBaseOffset = (flag & 1) ? 
0 : nPkts * sizeof(mscclpp::LLPacket); @@ -259,7 +260,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - smChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); + memChan.putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint2 data = make_uint2(0, 0); @@ -279,7 +280,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) packet.flag2 = flag; size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank); for (int index = 0; index < nPeers; index++) { - smChans[index].write(offset, packet); + memChans[index].write(offset, packet); } } // step 3: get data result from scratch buffer @@ -301,7 +302,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // ------------------------------------------- extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce3(mscclpp::ProxyChannelDeviceHandle* fstRoundChans, mscclpp::ProxyChannelDeviceHandle* sndRoundChans, + allreduce3(mscclpp::PortChannelDeviceHandle* fstRoundChans, mscclpp::PortChannelDeviceHandle* sndRoundChans, TYPE* buff, TYPE* scratch, int rank, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); @@ -311,10 +312,10 @@ extern "C" __global__ void __launch_bounds__(1024, 1) int peerSendId = (remoteSendRank < rank) ? remoteSendRank : remoteSendRank - 1; int peerRecvId = (remoteRecvRank < rank) ? remoteRecvRank : remoteRecvRank - 1; - mscclpp::ProxyChannelDeviceHandle& devFstSendChan = fstRoundChans[peerSendId]; - mscclpp::ProxyChannelDeviceHandle& devFstRecvChan = fstRoundChans[peerRecvId]; - mscclpp::ProxyChannelDeviceHandle& devSndSendChan = sndRoundChans[peerSendId]; - mscclpp::ProxyChannelDeviceHandle& devSndRecvChan = sndRoundChans[peerRecvId]; + mscclpp::PortChannelDeviceHandle& devFstSendChan = fstRoundChans[peerSendId]; + mscclpp::PortChannelDeviceHandle& devFstRecvChan = fstRoundChans[peerRecvId]; + mscclpp::PortChannelDeviceHandle& devSndSendChan = sndRoundChans[peerSendId]; + mscclpp::PortChannelDeviceHandle& devSndRecvChan = sndRoundChans[peerRecvId]; // Step 1 size_t chunkIndex = (rank + worldSize - 1) % worldSize; @@ -419,9 +420,9 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // AllReduce4 // 2-node // ------------------------------------------- -__device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TYPE* buff, int rank, int nRanksPerNode, - int startChunkIndex, size_t offsetInChunk, size_t chunkSize, size_t nelems, - int nBlocks) { +__device__ void localReduceScatterMem(mscclpp::MemoryChannelDeviceHandle* memChans, TYPE* buff, int rank, + int nRanksPerNode, int startChunkIndex, size_t offsetInChunk, size_t chunkSize, + size_t nelems, int nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; const int nPeer = nRanksPerNode - 1; @@ -433,10 +434,10 @@ __device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TY int4* buff4 = (int4*)buff; for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) { - smChans[peerIdx].relaxedSignal(); + memChans[peerIdx].relaxedSignal(); } for (int peerIdx = threadIdx.x 
+ blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) { - smChans[peerIdx].wait(); + memChans[peerIdx].wait(); } reduceScatterDeviceSyncer.sync(nBlocks); @@ -447,7 +448,7 @@ __device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TY int4 val; int peerIdx = index + localRankIndexInNode; if (peerIdx >= nPeer) peerIdx -= nPeer; - val = smChans[peerIdx].read(indexOffset4 + idx); + val = memChans[peerIdx].read(indexOffset4 + idx); tmp = add_vectors(tmp, val); } buff4[indexOffset4 + idx] = tmp; @@ -457,9 +458,9 @@ __device__ void localReduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, TY } // This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1). -__device__ void localAllGatherSm(mscclpp::SmChannelDeviceHandle* smChans, int rank, int nRanksPerNode, - int startRankChunkIndex, uint64_t offsetInRankChunk, uint64_t rankChunkSize, - uint64_t size, size_t nBlocks) { +__device__ void localAllGatherMem(mscclpp::MemoryChannelDeviceHandle* memChans, int rank, int nRanksPerNode, + int startRankChunkIndex, uint64_t offsetInRankChunk, uint64_t rankChunkSize, + uint64_t size, size_t nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; const size_t nPeer = nRanksPerNode - 1; @@ -495,16 +496,16 @@ __device__ void localAllGatherSm(mscclpp::SmChannelDeviceHandle* smChans, int ra sizeForThisBlock += lastChunkSize; } if (threadIdx.x == 0 && peerLocalBlockIdx == 0) { - smChans[peerIdx].relaxedSignal(); - smChans[peerIdx].wait(); + memChans[peerIdx].relaxedSignal(); + memChans[peerIdx].wait(); } allGatherDeviceSyncer.sync(nBlocks); size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk; - smChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); + memChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); } -__device__ void localAllGatherAllPairsSm(mscclpp::SmChannelDeviceHandle* smChans, int rank, int nRanksPerNode, - uint64_t size, size_t nBlocks) { +__device__ void localAllGatherAllPairsMem(mscclpp::MemoryChannelDeviceHandle* memChans, int rank, int nRanksPerNode, + uint64_t size, size_t nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; @@ -512,24 +513,24 @@ __device__ void localAllGatherAllPairsSm(mscclpp::SmChannelDeviceHandle* smChans const int nPeer = nRanksPerNode - 1; if (tid < nPeer) { - smChans[tid].signal(); + memChans[tid].signal(); } int waitStart = nBlocks * blockDim.x - nPeer; if (tid >= waitStart && tid < nBlocks * blockDim.x) { - smChans[tid - waitStart].wait(); + memChans[tid - waitStart].wait(); } allGatherDeviceSyncer.sync(nBlocks); for (int i = 0; i < nPeer; ++i) { int peerIdx = (i + rank) % nPeer; const int remoteRankLocalIndex = (peerIdx < rank ? 
peerIdx : peerIdx + 1); size_t offset = size * remoteRankLocalIndex; - smChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks); + memChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks); } } // This is an allgather4 equivalent -__device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* proxyChans, - int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU, int pipelineDepth) { +__device__ void allGatherMem(mscclpp::MemoryChannelDeviceHandle* memChans, mscclpp::PortChannelDeviceHandle* portChans, + int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU, int pipelineDepth) { // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: // Step 1: each node does a local allgather and concurrently, @@ -544,14 +545,14 @@ __device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::Pr int peerRank = (rank + nRanksPerNode) % worldSize; int peerNodeId = peerRank / nRanksPerNode; int peer = (peerRank < rank) ? peerRank : peerRank - 1; - mscclpp::ProxyChannelDeviceHandle proxyChan = proxyChans[peer]; + mscclpp::PortChannelDeviceHandle portChan = portChans[peer]; const size_t nBlocksForLocalAllGather = gridDim.x / (nRanksPerNode - 1) * (nRanksPerNode - 1); const size_t rankChunkSize = nelemsPerGPU * sizeof(int); const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode; const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode; if (peerNodeId == rank / nRanksPerNode) { - localAllGatherSm(smChans, rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x); + localAllGatherMem(memChans, rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x); return; } @@ -562,36 +563,37 @@ __device__ void allGatherSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::Pr // Step 1 if (threadIdx.x == 0 && blockIdx.x == 0 && step1Bytes > 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); } - localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, - nBlocksForLocalAllGather); + localAllGatherMem(memChans, rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0 && step1Bytes > 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(gridDim.x); // Step 2 if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); } if (step1Bytes > 0) - localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(memChans, rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(gridDim.x); // Step 3 - localAllGatherSm(smChans, rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(memChans, rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, + nBlocksForLocalAllGather); } -__device__ void 
reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* proxyChans, - TYPE* buff, TYPE* scratch, int rank, int nRanksPerNode, int worldSize, - size_t nelems, // must be divisible by 3 - int pipelineDepth) { +__device__ void reduceScatterMem(mscclpp::MemoryChannelDeviceHandle* memChans, + mscclpp::PortChannelDeviceHandle* portChans, TYPE* buff, TYPE* scratch, int rank, + int nRanksPerNode, int worldSize, + size_t nelems, // must be divisible by 3 + int pipelineDepth) { // this reduce-scatter algorithm works as follows: // Step 1: each node does a local reduce-scatter on peer node data chunks with 1/pipeline portion of chunk data. For // example, 2 nodes and each node has 2 ranks. rank 0 and rank 1 perform reduce-scatter on chunk 2 and chunk 3, with @@ -612,29 +614,29 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp int isComm = (threadIdx.x == 0) && (blockIdx.x == nBlocksForReduceScatter); int peer = (peerRank < rank) ? peerRank : peerRank - 1; int nBlocksRemain = gridDim.x - nBlocksForReduceScatter; - mscclpp::ProxyChannelDeviceHandle proxyChan = proxyChans[peer]; + mscclpp::PortChannelDeviceHandle portChan = portChans[peer]; if (peerNodeId == rank / nRanksPerNode) { - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x); return; } // step 1: local reduce int startChunkIndex = peerNodeId * nRanksPerNode; - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, - nBlocksForReduceScatter); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize, + nBlocksForReduceScatter); deviceSyncer.sync(gridDim.x); // step 2: local reduce and exchange data with neighbor if (isComm) { size_t offset = (peerRank * chunkSize) * sizeof(int); // opposite side - proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int))); + portChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int))); } if (pipelineSize > 1) - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, - (pipelineSize - 1) * chunkSize / pipelineSize, nBlocksForReduceScatter); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize, + (pipelineSize - 1) * chunkSize / pipelineSize, nBlocksForReduceScatter); if (isComm) { - proxyChan.wait(); + portChan.wait(); } if (blockIdx.x >= nBlocksForReduceScatter) { ibDeviceSyncer.sync(nBlocksRemain); @@ -645,7 +647,7 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp vectorSum((TYPE*)dst, (TYPE*)src, chunkSize / pipelineSize, blockIdx.x - nBlocksForReduceScatter, nBlocksRemain); } if (isComm) { - proxyChan.flush(); + portChan.flush(); } deviceSyncer.sync(gridDim.x); @@ -653,12 +655,12 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode; if (isComm && pipelineSize > 1) { size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int); - proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int)); + portChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int)); } - localReduceScatterSm(smChans, buff, rank, nRanksPerNode, startChunkIndex, 0, 
chunkSize, chunkSize, - nBlocksForReduceScatter); + localReduceScatterMem(memChans, buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, + nBlocksForReduceScatter); if (isComm && pipelineSize > 1) { - proxyChan.wait(); + portChan.wait(); } deviceSyncer.sync(gridDim.x); // reduce to related rank, can not overlap since localReduceScatter also calculate the sum @@ -667,24 +669,24 @@ __device__ void reduceScatterSm(mscclpp::SmChannelDeviceHandle* smChans, mscclpp int* src = (int*)((char*)scratch + offset); if (pipelineSize > 1) vectorSum((TYPE*)dst, (TYPE*)src, (pipelineSize - 1) * chunkSize / pipelineSize); if (isComm) { - proxyChan.flush(); + portChan.flush(); } } extern "C" __global__ void __launch_bounds__(1024, 1) __global__ - allreduce4(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* reduceScatterProxyChans, - mscclpp::ProxyChannelDeviceHandle* allGatherProxyChans, TYPE* buff, TYPE* scratch, int rank, + allreduce4(mscclpp::MemoryChannelDeviceHandle* memChans, mscclpp::PortChannelDeviceHandle* reduceScatterPortChans, + mscclpp::PortChannelDeviceHandle* allGatherPortChans, TYPE* buff, TYPE* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems, int pipelineDepth) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); - reduceScatterSm(smChans, reduceScatterProxyChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, - pipelineDepth); + reduceScatterMem(memChans, reduceScatterPortChans, buff, scratch, rank, nRanksPerNode, worldSize, nelems, + pipelineDepth); deviceSyncer.sync(gridDim.x); - allGatherSm(smChans, allGatherProxyChans, rank, worldSize, nRanksPerNode, nelems / worldSize, pipelineDepth); + allGatherMem(memChans, allGatherPortChans, rank, worldSize, nRanksPerNode, nelems / worldSize, pipelineDepth); } // allreduce 5 for 2-nodes extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce5(mscclpp::SmChannelDeviceHandle* smChans, mscclpp::ProxyChannelDeviceHandle* proxyChans, TYPE* buff, + allreduce5(mscclpp::MemoryChannelDeviceHandle* memChans, mscclpp::PortChannelDeviceHandle* portChans, TYPE* buff, TYPE* scratch, TYPE* putBuff, TYPE* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems) { nelems = nelems / (sizeof(int) / sizeof(TYPE)); @@ -701,8 +703,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) const int localBlockIdx = blockIdx.x % nBlocksPerPeer; const int peerIdx = blockIdx.x / nBlocksPerPeer; const int remoteRankIdx = peerIdx < localRankId ? peerIdx : peerIdx + 1; - mscclpp::SmChannelDeviceHandle smChan = smChans[peerIdx]; - mscclpp::ProxyChannelDeviceHandle proxyChan = proxyChans[localRankId]; + mscclpp::MemoryChannelDeviceHandle memChan = memChans[peerIdx]; + mscclpp::PortChannelDeviceHandle portChan = portChans[localRankId]; const int tid = threadIdx.x + localBlockIdx * blockDim.x; // double buffering size_t scratchBaseOffset = (flag & 1) ? 0 : nPkts * sizeof(mscclpp::LLPacket); @@ -717,8 +719,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // step 1: write to scratch buffer if (nRanksPerNode > 1) { - smChan.putPackets(scratchOffset, srcOffset, nelemsPerLocalRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, - flag); + memChan.putPackets(scratchOffset, srcOffset, nelemsPerLocalRank * sizeof(int), tid, blockDim.x * nBlocksPerPeer, + flag); } // step 2: get data from scratch buffer, do local reduce-scatter in each node. 
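  // Note on the LL (low-latency) packet path used here: an mscclpp::LLPacket packs two 32-bit payload
  // words together with two flag words, and a reader spins until both flags match the current
  // iteration's flag, so packet arrival doubles as synchronization and this path needs no separate
  // signal()/wait() pair. The (flag & 1) tests above implement the double buffering: odd and even
  // iterations land in disjoint halves of the scratch and put buffers.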
mscclpp::LLPacket* putPkt = (mscclpp::LLPacket*)((char*)putBuff + putBaseOffset); @@ -737,9 +739,9 @@ extern "C" __global__ void __launch_bounds__(1024, 1) deviceSyncer.sync(gridDim.x); // step 3. send local reduced data to remote node. if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.put(scratchOffset, putBaseOffset, nPktsPerLocalRank * sizeof(mscclpp::LLPacket)); + portChan.put(scratchOffset, putBaseOffset, nPktsPerLocalRank * sizeof(mscclpp::LLPacket)); if ((flag & 63) == 0) { - proxyChan.flush(); + portChan.flush(); } } // step 4. try to read the data from scratch buffer and write to local peers @@ -756,7 +758,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) packet.flag2 = flag; size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + localRankId * nPktsPerLocalRank); for (int index = 0; index < nPeersInNode; index++) { - smChans[index].write(offset, packet); + memChans[index].write(offset, packet); } dst[idx] = res; } @@ -787,7 +789,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // Barrier among all devices // Should be called by all threads on all devices // Assumes \p num_threads_per_block >= \p num_ranks -__forceinline__ __device__ void barrier(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int thread_id, +__forceinline__ __device__ void barrier(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int thread_id, int block_id, int num_blocks, int num_ranks) { // wait for every device if (block_id == 0) { @@ -804,7 +806,7 @@ __forceinline__ __device__ void barrier(mscclpp::SmDevice2DeviceSemaphoreDeviceH // Assumes \p kVecSize is 1, 2, 4, or 8 (default 8) template -MSCCLPP_DEVICE_INLINE void allreduce6_helper(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, +MSCCLPP_DEVICE_INLINE void allreduce6_helper(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int num_ranks, size_t num_elements) { DataType* mc_ptr = (DataType*)nvlsPtrs.mcPtr; @@ -863,7 +865,7 @@ MSCCLPP_DEVICE_INLINE void allreduce6_helper(mscclpp::SmDevice2DeviceSemaphoreDe } extern "C" __global__ void __launch_bounds__(1024, 1) - allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, + allreduce6(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int num_ranks, size_t num_elements, size_t vector_size) { if (vector_size == 8) { diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index c2af7a4fc..a04fcc8c1 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -1,7 +1,7 @@ import os import cupy as cp import ctypes -from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore +from mscclpp import Transport, ProxyService, MemoryDevice2DeviceSemaphore import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, GpuBuffer, pack @@ -48,8 +48,8 @@ def __init__( self.connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) type_str = type_to_str(memory.dtype) - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels(self.memory, self.connections) + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels(self.memory, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", @@ -60,7 +60,7 @@ def __init__( 
self.device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: - self.device_handles.append(self.sm_channels[rank].device_handle().raw) + self.device_handles.append(self.memory_channels[rank].device_handle().raw) self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) @@ -116,8 +116,8 @@ def __init__( type_str = type_to_str(memory.dtype) self.scratch = GpuBuffer(self.memory.size * 8, dtype=self.memory.dtype) - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, self.connections) + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels_with_scratch(self.memory, self.scratch, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce2", file_dir=file_dir, macro_dict={"TYPE": type_str} @@ -125,7 +125,7 @@ def __init__( self.device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: - self.device_handles.append(self.sm_channels[rank].device_handle().raw) + self.device_handles.append(self.memory_channels[rank].device_handle().raw) self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) @@ -181,11 +181,11 @@ def __init__( self.proxy_service = proxy_service self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype) - # create a sm_channel for each remote neighbor - self.fst_round_proxy_chans = self.group.make_proxy_channels_with_scratch( + # create a port_channel for each remote neighbor + self.fst_round_port_chans = self.group.make_port_channels_with_scratch( self.proxy_service, self.memory, self.scratch, self.connections ) - self.snd_round_proxy_chans = self.group.make_proxy_channels(self.proxy_service, self.memory, self.connections) + self.snd_round_port_chans = self.group.make_port_channels(self.proxy_service, self.memory, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce3", file_dir=file_dir, macro_dict={"TYPE": type_str} @@ -194,8 +194,8 @@ def __init__( self.fst_device_handles = [] self.snd_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank: - self.fst_device_handles.append(self.fst_round_proxy_chans[rank].device_handle().raw) - self.snd_device_handles.append(self.snd_round_proxy_chans[rank].device_handle().raw) + self.fst_device_handles.append(self.fst_round_port_chans[rank].device_handle().raw) + self.snd_device_handles.append(self.snd_round_port_chans[rank].device_handle().raw) self.fst_device_handles_cp = cp.asarray(memoryview(b"".join(self.fst_device_handles)), dtype=cp.uint8) self.snd_device_handles_cp = cp.asarray(memoryview(b"".join(self.snd_device_handles)), dtype=cp.uint8) @@ -261,31 +261,29 @@ def __init__( self.proxy_service = proxy_service self.scratch = GpuBuffer(self.memory.size, dtype=self.memory.dtype) same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels(self.memory, same_node_connections) - self.reduce_scatter_proxy_channels = self.group.make_proxy_channels_with_scratch( + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels(self.memory, same_node_connections) + self.reduce_scatter_port_channels =
self.group.make_port_channels_with_scratch( self.proxy_service, self.memory, self.scratch, self.connections ) - self.all_gather_proxy_channels = self.group.make_proxy_channels( - self.proxy_service, self.memory, self.connections - ) + self.all_gather_port_channels = self.group.make_port_channels(self.proxy_service, self.memory, self.connections) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce4", file_dir=file_dir, macro_dict={"TYPE": type_str} ).get_compiled_kernel() - self.sm_device_handles = [] + self.mem_device_handles = [] self.reduce_sactter_proxy_device_handles = [] self.all_gather_proxy_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank and in_same_node(rank): - self.sm_device_handles.append(self.sm_channels[rank].device_handle().raw) + self.mem_device_handles.append(self.memory_channels[rank].device_handle().raw) if rank != self.group.my_rank: self.reduce_sactter_proxy_device_handles.append( - self.reduce_scatter_proxy_channels[rank].device_handle().raw + self.reduce_scatter_port_channels[rank].device_handle().raw ) - self.all_gather_proxy_device_handles.append(self.all_gather_proxy_channels[rank].device_handle().raw) + self.all_gather_proxy_device_handles.append(self.all_gather_port_channels[rank].device_handle().raw) - self.sm_device_handles_cp = cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8) + self.mem_device_handles_cp = cp.asarray(memoryview(b"".join(self.mem_device_handles)), dtype=cp.uint8) self.reduce_sactter_proxy_device_handles_cp = cp.asarray( memoryview(b"".join(self.reduce_sactter_proxy_device_handles)), dtype=cp.uint8 ) @@ -306,7 +304,7 @@ def set_params(self, nblocks, block_size, pipeline_depth): self.params = b"" self.params += pack( - self.sm_device_handles_cp, + self.mem_device_handles_cp, self.reduce_sactter_proxy_device_handles_cp, self.all_gather_proxy_device_handles_cp, self.memory, @@ -366,24 +364,26 @@ def __init__( self.put_buff = GpuBuffer(self.memory.size * 8 // nranks_per_node, dtype=self.memory.dtype) same_node_connections = {rank: conn for rank, conn in self.connections.items() if in_same_node(rank)} across_node_connections = {rank: conn for rank, conn in self.connections.items() if not in_same_node(rank)} - # create a sm_channel for each remote neighbor - self.sm_channels = self.group.make_sm_channels_with_scratch(self.memory, self.scratch, same_node_connections) - self.proxy_channels = self.group.make_proxy_channels_with_scratch( + # create a memory_channel for each remote neighbor + self.memory_channels = self.group.make_memory_channels_with_scratch( + self.memory, self.scratch, same_node_connections + ) + self.port_channels = self.group.make_port_channels_with_scratch( self.proxy_service, self.put_buff, self.scratch, across_node_connections ) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", kernel_name="allreduce5", file_dir=file_dir, macro_dict={"TYPE": type_str} ).get_compiled_kernel() - self.sm_device_handles = [] + self.mem_device_handles = [] self.proxy_device_handles = [] for rank in range(self.group.nranks): if rank != self.group.my_rank and in_same_node(rank): - self.sm_device_handles.append(self.sm_channels[rank].device_handle().raw) + self.mem_device_handles.append(self.memory_channels[rank].device_handle().raw) if rank != self.group.my_rank and not in_same_node(rank): - 
self.proxy_device_handles.append(self.proxy_channels[rank].device_handle().raw) + self.proxy_device_handles.append(self.port_channels[rank].device_handle().raw) - self.sm_device_handles_cp = cp.asarray(memoryview(b"".join(self.sm_device_handles)), dtype=cp.uint8) + self.mem_device_handles_cp = cp.asarray(memoryview(b"".join(self.mem_device_handles)), dtype=cp.uint8) self.proxy_device_handles_cp = cp.asarray(memoryview(b"".join(self.proxy_device_handles)), dtype=cp.uint8) self.set_params(nblocks, block_size) @@ -398,7 +398,7 @@ def set_params(self, nblocks, block_size): self.params = b"" self.params += pack( - self.sm_device_handles_cp, + self.mem_device_handles_cp, self.proxy_device_handles_cp, self.memory, self.scratch, @@ -446,8 +446,8 @@ def __init__( self.memory.data.ptr, self.memory.data.mem.size ) - # create a sm_channel for each remote neighbor - self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) + # create a semaphore for each remote neighbor + self.semaphores = group.make_semaphore(self.nvlink_connections, MemoryDevice2DeviceSemaphore) file_dir = os.path.dirname(os.path.abspath(__file__)) self.kernel = KernelBuilder( file="allreduce.cu", diff --git a/python/test/d2d_semaphore_test.cu b/python/test/d2d_semaphore_test.cu index 04b945e3d..d6bc3ec5e 100644 --- a/python/test/d2d_semaphore_test.cu +++ b/python/test/d2d_semaphore_test.cu @@ -6,7 +6,7 @@ // be careful about using semaphore[my_rank] as it is an invalid semaphore and it is there just for simplicity of // indexing extern "C" __global__ void __launch_bounds__(1024, 1) - d2d_semaphore(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { + d2d_semaphore(mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks) { int tid = threadIdx.x; if (tid < nranks && tid != my_rank) { semaphores[tid].signal(); diff --git a/python/test/sm_channel_test.cu b/python/test/memory_channel_test.cu similarity index 83% rename from python/test/sm_channel_test.cu rename to python/test/memory_channel_test.cu index 93b5c99aa..48a831493 100644 --- a/python/test/sm_channel_test.cu +++ b/python/test/memory_channel_test.cu @@ -1,11 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license.
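The renamed test that follows exercises the GPU-side `MemoryChannel` API. As a rough orientation (a minimal sketch, not part of this patch; the kernel name, offsets, and sizes are illustrative), a `MemoryChannelDeviceHandle` is driven from device code like this:

```cpp
#include <mscclpp/memory_channel_device.hpp>

// Filled in from the host before launch, typically one handle per remote peer.
__device__ mscclpp::MemoryChannelDeviceHandle chan;

__global__ void copyToPeer(size_t nbytes) {
  // Cooperative copy: every thread of the block moves a disjoint piece.
  chan.put(/*dstOffset*/ 0, /*srcOffset*/ 0, nbytes, threadIdx.x, blockDim.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    chan.signal();  // publish the copied data to the peer
    chan.wait();    // block until the peer's matching signal arrives
  }
}
```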
-#include <mscclpp/sm_channel_device.hpp> +#include <mscclpp/memory_channel_device.hpp> // be careful about using channels[my_rank] as it is invalid and it is there just for simplicity of indexing extern "C" __global__ void __launch_bounds__(1024, 1) - sm_channel(mscclpp::SmChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements, int use_packet) { + memory_channel(mscclpp::MemoryChannelDeviceHandle* channels, int my_rank, int nranks, int num_elements, + int use_packet) { int tid = threadIdx.x; int bid = blockIdx.x; uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 3bc2f52d2..8d391ff59 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -10,7 +10,7 @@ __device__ mscclpp::DeviceSyncer deviceSyncer; extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, - mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { + mscclpp::MemoryDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { int nelem = nbytes / sizeof(float); float* dev_ptr = (float*)nvlsPtrs.devicePtr; float* mc_ptr = (float*)nvlsPtrs.mcPtr; diff --git a/python/test/proxy_channel_test.cu b/python/test/port_channel_test.cu similarity index 85% rename from python/test/proxy_channel_test.cu rename to python/test/port_channel_test.cu index d79a97bf6..05b99d1ab 100644 --- a/python/test/proxy_channel_test.cu +++ b/python/test/port_channel_test.cu @@ -2,12 +2,12 @@ // Licensed under the MIT license. #include -#include <mscclpp/proxy_channel_device.hpp> +#include <mscclpp/port_channel_device.hpp> // be careful about using channels[my_rank] as it is invalid and it is there just for simplicity of indexing extern "C" __global__ void __launch_bounds__(1024, 1) - proxy_channel(mscclpp::ProxyChannelDeviceHandle* channels, int my_rank, int nranks, int* data, int* scratch, - int num_elements, int use_packet) { + port_channel(mscclpp::PortChannelDeviceHandle* channels, int my_rank, int nranks, int* data, int* scratch, + int num_elements, int use_packet) { int tid = threadIdx.x; int nthreads = blockDim.x; uint64_t size_per_rank = (num_elements * sizeof(int)) / nranks; diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 1a5f99c42..f0e63daf4 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -22,7 +22,7 @@ Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, - SmDevice2DeviceSemaphore, + MemoryDevice2DeviceSemaphore, TcpBootstrap, Transport, is_nvls_supported, @@ -363,9 +363,9 @@ def __init__( ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks - elif test_name == "sm_channel": + elif test_name == "memory_channel": self._kernel = KernelBuilder( - file="sm_channel_test.cu", kernel_name="sm_channel", file_dir=file_dir + file="memory_channel_test.cu", kernel_name="memory_channel", file_dir=file_dir ).get_compiled_kernel() self.nblocks = nranks self.nthreads = 1024 @@ -381,9 +381,9 @@ def __init__( ).get_compiled_kernel() self.nblocks = 1 self.nthreads = nranks - elif test_name == "proxy_channel": + elif test_name == "port_channel": self._kernel = KernelBuilder( - file="proxy_channel_test.cu", kernel_name="proxy_channel", file_dir=file_dir + file="port_channel_test.cu", kernel_name="port_channel", file_dir=file_dir ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1024 @@ -411,11 +411,11 @@ def __init__( # keep a reference to the device handles so that they don't get garbage collected self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(device_handles)),
dtype=cp.uint8) - if test_name in ["h2d_semaphore", "d2d_semaphore", "sm_channel", "proxy_channel"]: + if test_name in ["h2d_semaphore", "d2d_semaphore", "memory_channel", "port_channel"]: self.params += pack(self._d_semaphore_or_channels, my_rank, nranks) - if test_name == "sm_channel": + if test_name == "memory_channel": self.params += pack(tensor.size, use_packet) - if test_name == "proxy_channel": + if test_name == "port_channel": self.params += pack(tensor, scratch, tensor.size, use_packet) elif test_name == "fifo": self.params = fifo.device_handle().raw @@ -457,7 +457,7 @@ def signal(semaphores): def test_d2d_semaphores(mpi_group: MpiGroup): group, connections = create_group_and_connection(mpi_group, "NVLink") - semaphores = group.make_semaphore(connections, SmDevice2DeviceSemaphore) + semaphores = group.make_semaphore(connections, MemoryDevice2DeviceSemaphore) group.barrier() kernel = MscclppKernel("d2d_semaphore", group.my_rank, group.nranks, semaphores) kernel() @@ -468,7 +468,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) -def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): +def test_memory_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") memory = GpuBuffer(nelem, dtype=cp.int32) @@ -483,10 +483,10 @@ def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): memory_expected[(nelemPerRank * rank) : (nelemPerRank * (rank + 1))] = rank + 1 if use_packet: - channels = group.make_sm_channels_with_scratch(memory, scratch, connections) + channels = group.make_memory_channels_with_scratch(memory, scratch, connections) else: - channels = group.make_sm_channels(memory, connections) - kernel = MscclppKernel("sm_channel", group.my_rank, group.nranks, channels, memory, use_packet, scratch) + channels = group.make_memory_channels(memory, connections) + kernel = MscclppKernel("memory_channel", group.my_rank, group.nranks, channels, memory, use_packet, scratch) group.barrier() kernel() @@ -565,7 +565,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) -def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): +def test_port_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): group, connections = create_group_and_connection(mpi_group, transport) memory = GpuBuffer(nelem, dtype=cp.int32) @@ -586,10 +586,10 @@ def test_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_pack memory_to_register = scratch else: memory_to_register = memory - channels = group.make_proxy_channels(proxy_service, memory_to_register, connections) + channels = group.make_port_channels(proxy_service, memory_to_register, connections) kernel = MscclppKernel( - "proxy_channel", + "port_channel", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=channels, @@ -614,7 +614,7 @@ def test_nvls(mpi_group: MpiGroup): mem_handle = nvls_connection.allocate_bind_memory(nbytes) nvlinks_connections = create_connection(group, "NVLink") - semaphores = group.make_semaphore(nvlinks_connections, SmDevice2DeviceSemaphore) + semaphores = group.make_semaphore(nvlinks_connections, 
MemoryDevice2DeviceSemaphore) kernel = MscclppKernel( "nvls", diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc index 56c881bd0..ed5509052 100644 --- a/src/executor/execution_plan.cc +++ b/src/executor/execution_plan.cc @@ -74,10 +74,10 @@ auto convertToBufferType = [](const std::string& str) { }; auto convertToChannelType = [](const std::string& str) { - if (str == "sm") { - return mscclpp::ChannelType::SM; - } else if (str == "proxy") { - return mscclpp::ChannelType::PROXY; + if (str == "memory" || str == "sm") { + return mscclpp::ChannelType::MEMORY; + } else if (str == "port" || str == "proxy") { + return mscclpp::ChannelType::PORT; } else if (str == "none") { return mscclpp::ChannelType::NONE; } else if (str == "nvls") { @@ -304,7 +304,7 @@ void ExecutionPlan::Impl::parseChannels( } } -// Construct the channel info. Step 1. Flatten SM and PROXY channels into separate vectors. +// Construct the channel info. Step 1. Flatten MEMORY and PORT channels into separate vectors. // Step 2. For each threadblock, construct a vector of channel indexes and keys. void ExecutionPlan::Impl::setupChannels(const json& gpus) { using mapKey = std::tuple; @@ -331,7 +331,7 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) { // setup threadblockChannelMap for (const auto& gpu : gpus) { int rank = gpu["id"]; - auto channelTypes = {ChannelType::SM, ChannelType::PROXY, ChannelType::NVLS}; + auto channelTypes = {ChannelType::MEMORY, ChannelType::PORT, ChannelType::NVLS}; std::unordered_map> channelMap; for (auto channelType : channelTypes) { const std::vector channelInfos = this->getChannelInfos(rank, channelType); @@ -352,18 +352,18 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) { } } int nthreadblocks = gpu["threadblocks"].size(); - this->threadblockSMChannelMap[rank].resize(nthreadblocks); - this->threadblockProxyChannelMap[rank].resize(nthreadblocks); + this->threadblockMemoryChannelMap[rank].resize(nthreadblocks); + this->threadblockPortChannelMap[rank].resize(nthreadblocks); this->threadblockNvlsChannelMap[rank].resize(nthreadblocks); for (const auto& threadblock : gpu["threadblocks"]) { for (const auto& channel : threadblock["channels"]) { ChannelType channelType = convertToChannelType(channel["ctype"]); ChannelKey key = {convertToBufferType(channel["src"]), convertToBufferType(channel["dst"]), channelType}; for (int id : channel["cids"]) { - if (channelType == ChannelType::SM) { - this->threadblockSMChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); - } else if (channelType == ChannelType::PROXY) { - this->threadblockProxyChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); + if (channelType == ChannelType::MEMORY) { + this->threadblockMemoryChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); + } else if (channelType == ChannelType::PORT) { + this->threadblockPortChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); } else if (channelType == ChannelType::NVLS) { this->threadblockNvlsChannelMap[rank][threadblock["id"]].emplace_back(channelMap[key][id], key); } @@ -394,15 +394,15 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse std::unordered_map> channelIndexes; std::vector ops; int threadblockId = threadblock["id"]; - const auto& smChannels = this->threadblockSMChannelMap[rank][threadblockId]; - const auto& proxyChannels = this->threadblockProxyChannelMap[rank][threadblockId]; + const auto& memoryChannels = 
this->threadblockMemoryChannelMap[rank][threadblockId]; + const auto& portChannels = this->threadblockPortChannelMap[rank][threadblockId]; const auto& nvlsChannels = this->threadblockNvlsChannelMap[rank][threadblockId]; - for (size_t i = 0; i < smChannels.size(); i++) { - const auto& [_, key] = smChannels[i]; + for (size_t i = 0; i < memoryChannels.size(); i++) { + const auto& [_, key] = memoryChannels[i]; channelIndexes[key].push_back(i); } - for (size_t i = 0; i < proxyChannels.size(); i++) { - const auto& [_, key] = proxyChannels[i]; + for (size_t i = 0; i < portChannels.size(); i++) { + const auto& [_, key] = portChannels[i]; channelIndexes[key].push_back(i); } for (size_t i = 0; i < nvlsChannels.size(); i++) { @@ -586,8 +586,8 @@ void ExecutionPlan::Impl::reset() { this->operations.clear(); this->channelInfos.clear(); this->nvlsInfos.clear(); - this->threadblockSMChannelMap.clear(); - this->threadblockProxyChannelMap.clear(); + this->threadblockMemoryChannelMap.clear(); + this->threadblockPortChannelMap.clear(); this->threadblockNvlsChannelMap.clear(); this->inputChunks.clear(); this->outputChunks.clear(); diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 944ddb254..25e55bb56 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -2,9 +2,9 @@ // Licensed under the MIT license. #include +#include #include -#include -#include +#include #include #include "execution_kernel.hpp" @@ -113,10 +113,10 @@ struct ExecutionContext { std::unordered_map> connections; std::vector> nvlsConnections; std::unordered_map, mscclpp::RegisteredMemory> registeredMemories; - std::vector> smSemaphores; + std::vector> memorySemaphores; std::vector proxySemaphores; - std::vector smChannels; - std::vector proxyChannels; + std::vector memoryChannels; + std::vector portChannels; std::vector nvlsChannels; std::unordered_map> deviceExecutionPlans; std::unordered_map> deviceExecutionPlansBuffers; @@ -194,9 +194,9 @@ struct Executor::Impl { TransportFlags getTransportFlags(std::vector& infos, int rank) { TransportFlags flags; for (ChannelInfo& info : infos) { - if (info.channelType == ChannelType::SM) { + if (info.channelType == ChannelType::MEMORY) { flags |= Transport::CudaIpc; - } else if (info.channelType == ChannelType::PROXY) { + } else if (info.channelType == ChannelType::PORT) { for (int peer : info.connectedPeers) { if (!inSameNode(rank, peer, this->nranksPerNode)) { flags |= IBs[rank % this->nranksPerNode]; @@ -279,16 +279,16 @@ struct Executor::Impl { void setupChannels(ExecutionContext& context, void* sendbuff, void* recvbuff, size_t sendBufferSize, size_t recvBufferSize, int rank, const ExecutionPlan& plan) { - const auto channelTypes = {ChannelType::SM, ChannelType::PROXY}; - std::vector> smSemaphores; + const auto channelTypes = {ChannelType::MEMORY, ChannelType::PORT}; + std::vector> memorySemaphores; std::vector proxySemaphores; auto processChannelInfos = [&](std::vector& channelInfos) { for (ChannelInfo& info : channelInfos) { for (int peer : info.connectedPeers) { - if (info.channelType == ChannelType::SM) { - smSemaphores.push_back( - std::make_shared(*this->comm, context.connections.at(peer))); - } else if (info.channelType == ChannelType::PROXY) { + if (info.channelType == ChannelType::MEMORY) { + memorySemaphores.push_back( + std::make_shared(*this->comm, context.connections.at(peer))); + } else if (info.channelType == ChannelType::PORT) { proxySemaphores.push_back( context.proxyService->buildAndAddSemaphore(*this->comm, context.connections.at(peer))); } @@ 
-307,7 +307,7 @@ struct Executor::Impl { processChannelInfos(channelInfos); } this->comm->setup(); - context.smSemaphores = std::move(smSemaphores); + context.memorySemaphores = std::move(memorySemaphores); context.proxySemaphores = std::move(proxySemaphores); auto getBufferSize = [&](BufferType type) { @@ -332,11 +332,11 @@ struct Executor::Impl { TransportFlags transport = getTransportFlags(channelInfos, rank); RegisteredMemory localMemory = this->comm->registerMemory(src, bufferSize, transport); for (int peer : info.connectedPeers) { - if (channelType == ChannelType::SM) { - context.smChannels.emplace_back(context.smSemaphores[index++], - context.registeredMemories[{info.dstBufferType, peer}], src, nullptr); - } else if (channelType == ChannelType::PROXY) { - context.proxyChannels.emplace_back(context.proxyService->proxyChannel( + if (channelType == ChannelType::MEMORY) { + context.memoryChannels.emplace_back(context.memorySemaphores[index++], + context.registeredMemories[{info.dstBufferType, peer}], src, nullptr); + } else if (channelType == ChannelType::PORT) { + context.portChannels.emplace_back(context.proxyService->portChannel( context.proxySemaphores[index++], context.proxyService->addMemory(context.registeredMemories[{info.dstBufferType, peer}]), context.proxyService->addMemory(localMemory))); @@ -366,15 +366,15 @@ struct Executor::Impl { DeviceExecutionPlan deviceExecutionPlan = {}; std::vector ops = plan.impl_->getOperations(rank, threadblock); deviceExecutionPlan.nOperations = ops.size(); - deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size(); - deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size(); + deviceExecutionPlan.nMemoryChannels = plan.impl_->threadblockMemoryChannelMap.at(rank).at(threadblock).size(); + deviceExecutionPlan.nPortChannels = plan.impl_->threadblockPortChannelMap.at(rank).at(threadblock).size(); int chanIndex = 0; - for (const auto& [index, _] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) { - deviceExecutionPlan.channels.smChannels[chanIndex++] = mscclpp::deviceHandle(context.smChannels[index]); + for (const auto& [index, _] : plan.impl_->threadblockMemoryChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.memoryChannels[chanIndex++] = mscclpp::deviceHandle(context.memoryChannels[index]); } chanIndex = 0; - for (const auto& [index, _] : plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock)) { - deviceExecutionPlan.channels.proxyChannels[chanIndex++] = mscclpp::deviceHandle(context.proxyChannels[index]); + for (const auto& [index, _] : plan.impl_->threadblockPortChannelMap.at(rank).at(threadblock)) { + deviceExecutionPlan.channels.portChannels[chanIndex++] = mscclpp::deviceHandle(context.portChannels[index]); } chanIndex = 0; for (const auto& [index, _] : plan.impl_->threadblockNvlsChannelMap.at(rank).at(threadblock)) { diff --git a/src/include/execution_common.hpp b/src/include/execution_common.hpp index f6ed215e1..87b1a69e1 100644 --- a/src/include/execution_common.hpp +++ b/src/include/execution_common.hpp @@ -4,9 +4,9 @@ #ifndef MSCCLPP_EXECUTION_COMMON_HPP_ #define MSCCLPP_EXECUTION_COMMON_HPP_ +#include #include -#include -#include +#include namespace mscclpp { @@ -23,8 +23,8 @@ enum class BufferType : uint8_t { enum class ChannelType : uint8_t { NONE, - SM, - PROXY, + MEMORY, + PORT, NVLS, }; @@ -53,8 +53,8 @@ enum class OperationType : uint8_t { }; struct Channels { - mscclpp::DeviceHandle 
smChannels[MAX_CHANNEL]; - mscclpp::DeviceHandle<ProxyChannel> proxyChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle<MemoryChannel> memoryChannels[MAX_CHANNEL]; + mscclpp::DeviceHandle<PortChannel> portChannels[MAX_CHANNEL]; mscclpp::DeviceHandle nvlsChannels[MAX_CHANNEL]; }; @@ -97,8 +97,8 @@ struct Operation { // total size = 2304 + 6400 + 4 + 12(padding) = 8720 bytes struct __attribute__((aligned(16))) DeviceExecutionPlan { - uint8_t nSmChannels; // 1 bytes - uint8_t nProxyChannels; // 1 bytes + uint8_t nMemoryChannels; // 1 byte + uint8_t nPortChannels; // 1 byte uint16_t nOperations; // 2 bytes Channels channels; // 2304 bytes Operation operations[MAX_OPERATION]; // 64 * 100 = 6400 bytes diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 98bed37eb..cea5fbf3b 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -9,9 +9,9 @@ #include #endif #include +#include #include -#include -#include +#include #include "execution_common.hpp" @@ -192,68 +192,71 @@ MSCCLPP_DEVICE_INLINE T* getBuffer(T* input, T* output, T* scratch, BufferType b return nullptr; } -MSCCLPP_DEVICE_INLINE void handleSignal(DeviceHandle<SmChannel>* smChannels, DeviceHandle<ProxyChannel>* proxyChannels, - uint8_t* channelIndex, int nChannels, ChannelType chType) { +MSCCLPP_DEVICE_INLINE void handleSignal(DeviceHandle<MemoryChannel>* memoryChannels, + DeviceHandle<PortChannel>* portChannels, uint8_t* channelIndex, int nChannels, + ChannelType chType) { int tid = threadIdx.x; - if (tid < nChannels && chType == ChannelType::SM) { - smChannels[channelIndex[tid]].signal(); + if (tid < nChannels && chType == ChannelType::MEMORY) { + memoryChannels[channelIndex[tid]].signal(); return; } - if (tid < nChannels && chType == ChannelType::PROXY) { - proxyChannels[channelIndex[threadIdx.x]].signal(); + if (tid < nChannels && chType == ChannelType::PORT) { + portChannels[channelIndex[threadIdx.x]].signal(); } } -MSCCLPP_DEVICE_INLINE void handleWait(DeviceHandle<SmChannel>* smChannels, DeviceHandle<ProxyChannel>* proxyChannels, - uint8_t* channelIndexes, int nChannels, ChannelType chType) { +MSCCLPP_DEVICE_INLINE void handleWait(DeviceHandle<MemoryChannel>* memoryChannels, + DeviceHandle<PortChannel>* portChannels, uint8_t* channelIndexes, int nChannels, + ChannelType chType) { int tid = threadIdx.x; - if (tid < nChannels && chType == ChannelType::SM) { - smChannels[channelIndexes[tid]].wait(); + if (tid < nChannels && chType == ChannelType::MEMORY) { + memoryChannels[channelIndexes[tid]].wait(); return; } - if (tid < nChannels && chType == ChannelType::PROXY) { - proxyChannels[channelIndexes[tid]].wait(); + if (tid < nChannels && chType == ChannelType::PORT) { + portChannels[channelIndexes[tid]].wait(); } } -MSCCLPP_DEVICE_INLINE void handleFlush(DeviceHandle<ProxyChannel>* proxyChannels, uint8_t* channelIndexes, +MSCCLPP_DEVICE_INLINE void handleFlush(DeviceHandle<PortChannel>* portChannels, uint8_t* channelIndexes, int nChannels) { int tid = threadIdx.x; if (tid < nChannels) { - proxyChannels[channelIndexes[tid]].flush(); + portChannels[channelIndexes[tid]].flush(); } } -MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle<SmChannel>* smChannel, uint8_t* srcChannelIndexes, +MSCCLPP_DEVICE_INLINE void handleGet(DeviceHandle<MemoryChannel>* memoryChannel, uint8_t* srcChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int count, uint32_t size) { for (int i = 0; i < count; i++) { uint32_t dstOffset = dstOffsets[i]; uint32_t srcOffset = srcOffsets[i]; - smChannel[srcChannelIndexes[i]].get(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); + memoryChannel[srcChannelIndexes[i]].get(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); } } template
-MSCCLPP_DEVICE_INLINE void handlePut(DeviceHandle* smChannel, DeviceHandle* proxyChannels, - uint8_t* dstChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int count, - uint32_t size, ChannelType chType) { - if (chType == ChannelType::SM) { +MSCCLPP_DEVICE_INLINE void handlePut(DeviceHandle* memoryChannel, + DeviceHandle* portChannels, uint8_t* dstChannelIndexes, + uint32_t* dstOffsets, uint32_t* srcOffsets, int count, uint32_t size, + ChannelType chType) { + if (chType == ChannelType::MEMORY) { for (int i = 0; i < count; i++) { uint32_t dstOffset = dstOffsets[i]; uint32_t srcOffset = srcOffsets[i]; - smChannel[dstChannelIndexes[i]].put(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); + memoryChannel[dstChannelIndexes[i]].put(dstOffset, srcOffset, size, threadIdx.x, blockDim.x); } return; } - if (chType == ChannelType::PROXY) { + if (chType == ChannelType::PORT) { int tid = threadIdx.x; if (tid < count) { if constexpr (PutWithSignal) { - proxyChannels[dstChannelIndexes[tid]].putWithSignal(dstOffsets[tid], srcOffsets[tid], size); + portChannels[dstChannelIndexes[tid]].putWithSignal(dstOffsets[tid], srcOffsets[tid], size); } else if constexpr (PutWithSignalAndFlush) { - proxyChannels[dstChannelIndexes[tid]].putWithSignalAndFlush(dstOffsets[tid], srcOffsets[tid], size); + portChannels[dstChannelIndexes[tid]].putWithSignalAndFlush(dstOffsets[tid], srcOffsets[tid], size); } else { - proxyChannels[dstChannelIndexes[tid]].put(dstOffsets[tid], srcOffsets[tid], size); + portChannels[dstChannelIndexes[tid]].put(dstOffsets[tid], srcOffsets[tid], size); } } } @@ -261,7 +264,8 @@ MSCCLPP_DEVICE_INLINE void handlePut(DeviceHandle* smChannel, DeviceH template MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOffsetByBytes, T* input, - uint32_t inputOffsetByBytes, DeviceHandle* smChannels, + uint32_t inputOffsetByBytes, + DeviceHandle* memoryChannels, uint8_t* dstChannelIndexes, uint8_t* srcChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int nDstChannels, int nSrcChannels, uint32_t size, bool sendToRemote = true) { @@ -275,14 +279,14 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOf for (int index = 0; index < nSrcChannels; ++index) { int4 val; size_t srcOffset = srcOffsets[index] / sizeof(int4); - val = smChannels[srcChannelIndexes[index]].read(srcOffset + idx); + val = memoryChannels[srcChannelIndexes[index]].read(srcOffset + idx); tmp = add_vectors(tmp, val); } output4[outputOffset4 + idx] = tmp; if (sendToRemote) { for (int index = 0; index < nDstChannels; ++index) { size_t dstOffset = dstOffsets[index] / sizeof(int4); - smChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); + memoryChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); } } } @@ -294,48 +298,48 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceCopySend(T* output, uint32_t outputOf T tmp = input[idx]; for (int index = 0; index < nSrcChannels; ++index) { size_t srcOffset = srcOffsets[index] / sizeof(T); - tmp = add_elements(tmp, smChannels[srcChannelIndexes[index]].read(srcOffset + idx)); + tmp = add_elements(tmp, memoryChannels[srcChannelIndexes[index]].read(srcOffset + idx)); } output[idx] = tmp; if (sendToRemote) { for (int index = 0; index < nDstChannels; ++index) { size_t dstOffset = dstOffsets[index] / sizeof(T); - smChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); + memoryChannels[dstChannelIndexes[index]].write(dstOffset + idx, tmp); } } } } template -MSCCLPP_DEVICE_INLINE void handlePutPacket(size_t 
scratchSize, DeviceHandle* smChannels, - DeviceHandle* proxyChannels, uint8_t* dstChannelIndexes, +MSCCLPP_DEVICE_INLINE void handlePutPacket(size_t scratchSize, DeviceHandle* memoryChannels, + DeviceHandle* portChannels, uint8_t* dstChannelIndexes, uint32_t* dstOffsets, uint32_t* srcOffsets, int nDstChannels, uint32_t size, ChannelType chType, uint32_t flag) { const size_t scratchBaseOffset = flag & 0x1 ? 0 : scratchSize >> 1; - if (chType == ChannelType::SM) { + if (chType == ChannelType::MEMORY) { for (int index = 0; index < nDstChannels; ++index) { - smChannels[dstChannelIndexes[index]].putPackets( + memoryChannels[dstChannelIndexes[index]].putPackets( scratchBaseOffset + dstOffsets[index] * 2, srcOffsets[index], size, threadIdx.x, blockDim.x, flag); } } - if (chType == ChannelType::PROXY) { + if (chType == ChannelType::PORT) { int tid = threadIdx.x; if (tid >= nDstChannels) { return; } - // For proxy channel, we assume src and dst are in packet format + // For port channel, we assume src and dst are in packet format uint32_t dstOffset = (dstOffsets[tid] << 1) + scratchBaseOffset; uint32_t srcOffset = (srcOffsets[tid] << 1) + scratchBaseOffset; - proxyChannels[dstChannelIndexes[tid]].put(dstOffset, srcOffset, size << 1); + portChannels[dstChannelIndexes[tid]].put(dstOffset, srcOffset, size << 1); } } template MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes, T* inputBuff, size_t inputBuffSize, uint32_t* inputOffsets, int nSrcs, - DeviceHandle* smChannels, uint8_t* outputChannelIndexes, - uint32_t* outputOffsets, int nDstChannels, size_t size, - uint32_t flag) { + DeviceHandle* memoryChannels, + uint8_t* outputChannelIndexes, uint32_t* outputOffsets, + int nDstChannels, size_t size, uint32_t flag) { size_t nPackets = size * 2 / sizeof(PacketType); const size_t intputBaseOffset = flag & 0x1 ? 
0 : inputBuffSize >> 1; const uint32_t srcOffset = srcOffsetByBytes / sizeof(PacketPayload); @@ -356,7 +360,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy PacketType pkt(data, flag); for (int index = 0; index < nDstChannels; ++index) { size_t offset = (intputBaseOffset + outputOffsets[index] * 2) / sizeof(PacketType); - smChannels[outputChannelIndexes[index]].write(offset + idx, pkt); + memoryChannels[outputChannelIndexes[index]].write(offset + idx, pkt); } } } @@ -385,9 +389,9 @@ MSCCLPP_DEVICE_INLINE void handleTransformToPacket(void* dst, void* src, size_t template MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes, - T* input, uint32_t* inputOffsets, DeviceHandle* smChannels, - uint8_t* outputChannelIndexes, uint32_t* outputOffsets, int nOutChannels, - uint32_t size) { + T* input, uint32_t* inputOffsets, + DeviceHandle* memoryChannels, uint8_t* outputChannelIndexes, + uint32_t* outputOffsets, int nOutChannels, uint32_t size) { const size_t nInt4 = size / sizeof(int4); const size_t srcOffset4 = srcOffsetByBytes / sizeof(int4); const size_t dstOffset4 = dstOffsetByBytes / sizeof(int4); @@ -404,7 +408,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T dst4[dstOffset4 + idx] = tmp; for (int index = 0; index < nOutChannels; ++index) { size_t offset = outputOffsets[index] / sizeof(int4); - smChannels[outputChannelIndexes[index]].write(offset + idx, tmp); + memoryChannels[outputChannelIndexes[index]].write(offset + idx, tmp); } } // handle rest of data @@ -420,7 +424,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T dst[idx] = tmp; for (int index = 0; index < nOutChannels; ++index) { size_t offset = outputOffsets[index] / sizeof(T); - smChannels[outputChannelIndexes[index]].write(offset + idx, tmp); + memoryChannels[outputChannelIndexes[index]].write(offset + idx, tmp); } } } @@ -492,8 +496,8 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu localPlan = (DeviceExecutionPlan*)sharedMem; int nOperations = localPlan->nOperations; Operation* operations = localPlan->operations; - DeviceHandle* smChannels = localPlan->channels.smChannels; - DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; + DeviceHandle* memoryChannels = localPlan->channels.memoryChannels; + DeviceHandle* portChannels = localPlan->channels.portChannels; [[maybe_unused]] DeviceHandle* nvlsChannels = localPlan->channels.nvlsChannels; @@ -534,22 +538,22 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu int syncStateIndex = op.deviceSyncerIndex; deviceSyncers[syncStateIndex].sync(nThreadBlocks); } else if (op.type == OperationType::SIGNAL) { - handleSignal(smChannels, proxyChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); + handleSignal(memoryChannels, portChannels, op.outputChannelIndexes, op.nOutputs, op.channelType); } else if (op.type == OperationType::WAIT) { - handleWait(smChannels, proxyChannels, op.inputChannelIndexes, op.nInputs, op.channelType); + handleWait(memoryChannels, portChannels, op.inputChannelIndexes, op.nInputs, op.channelType); } else if (op.type == OperationType::FLUSH) { - handleFlush(proxyChannels, op.outputChannelIndexes, op.nOutputs); + handleFlush(portChannels, op.outputChannelIndexes, op.nOutputs); } else if (op.type == OperationType::PUT) { - handlePut(smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, 
op.nOutputs, + handlePut(memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType); } else if (op.type == OperationType::PUT_WITH_SIGNAL) { - handlePut(smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, + handlePut(memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType); } else if (op.type == OperationType::PUT_WITH_SIGNAL_AND_FLUSH) { - handlePut(smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, + handlePut(memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType); } else if (op.type == OperationType::GET) { - handleGet(smChannels, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nInputs, op.size); + handleGet(memoryChannels, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nInputs, op.size); } else if (op.type == OperationType::COPY) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); @@ -557,30 +561,30 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu } else if (op.type == OperationType::READ_REDUCE_COPY_SEND) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, memoryChannels, op.outputChannelIndexes, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs, op.size); } else if (op.type == OperationType::READ_REDUCE_COPY) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); - handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, smChannels, op.outputChannelIndexes, + handleReadReduceCopySend(dst, op.dstOffset, src, op.srcOffset, memoryChannels, op.outputChannelIndexes, op.inputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.nInputs, op.size, false); } else if (op.type == OperationType::PUT_PACKET) { - handlePutPacket(scratchSize, smChannels, proxyChannels, op.outputChannelIndexes, op.outputOffsets, + handlePutPacket(scratchSize, memoryChannels, portChannels, op.outputChannelIndexes, op.outputOffsets, op.inputOffsets, op.nOutputs, op.size, op.channelType, flag); } else if (op.type == OperationType::REDUCE_SEND_PACKET) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, op.inputOffsets, - op.nInputs, smChannels, op.outputChannelIndexes, op.outputOffsets, + op.nInputs, memoryChannels, op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size, flag); } else if (op.type == OperationType::REDUCE_PACKET) { T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); handleReduceSendPacket(dst, op.dstOffset, src, op.srcOffset, scratch, scratchSize, - op.inputOffsets, op.nInputs, smChannels, op.outputChannelIndexes, + op.inputOffsets, op.nInputs, memoryChannels, op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size, flag); } else if (op.type == OperationType::COPY_PACKET) { T* dst = 
getBuffer(input, output, scratch, op.dstBufferType); @@ -594,8 +598,8 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu T* dst = getBuffer(input, output, scratch, op.dstBufferType); T* src = getBuffer(input, output, scratch, op.srcBufferType); T* tmp = getBuffer(input, output, scratch, op.inputBufferType); - handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, smChannels, op.outputChannelIndexes, - op.outputOffsets, op.nOutputs, op.size); + handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, memoryChannels, + op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size); } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 else if (op.type == OperationType::MULTI_LOAD_REDUCE_STORE) { diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp index 080a76883..66ed464d4 100644 --- a/src/include/execution_plan.hpp +++ b/src/include/execution_plan.hpp @@ -98,8 +98,8 @@ struct ExecutionPlan::Impl { // for nvls channels std::unordered_map> nvlsInfos; // threadblockChannelMap[rank][threadblock] = [channelIndex, channelKey] - std::unordered_map>>> threadblockSMChannelMap; - std::unordered_map>>> threadblockProxyChannelMap; + std::unordered_map>>> threadblockMemoryChannelMap; + std::unordered_map>>> threadblockPortChannelMap; std::unordered_map>>> threadblockNvlsChannelMap; std::unordered_map inputChunks; std::unordered_map outputChunks; diff --git a/src/sm_channel.cc b/src/memory_channel.cc similarity index 54% rename from src/sm_channel.cc rename to src/memory_channel.cc index a148595bf..3adce71fe 100644 --- a/src/sm_channel.cc +++ b/src/memory_channel.cc @@ -1,22 +1,22 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include +#include #include "api.h" #include "debug.h" namespace mscclpp { -MSCCLPP_API_CPP SmChannel::SmChannel(std::shared_ptr semaphore, RegisteredMemory dst, - void* src, void* getPacketBuffer) +MSCCLPP_API_CPP MemoryChannel::MemoryChannel(std::shared_ptr semaphore, + RegisteredMemory dst, void* src, void* getPacketBuffer) : semaphore_(semaphore), dst_(dst), src_(src), getPacketBuffer_(getPacketBuffer) { if (!dst.transports().has(Transport::CudaIpc)) { - throw Error("SmChannel: dst must be registered with CudaIpc", ErrorCode::InvalidUsage); + throw Error("MemoryChannel: dst must be registered with CudaIpc", ErrorCode::InvalidUsage); } } -MSCCLPP_API_CPP SmChannel::DeviceHandle SmChannel::deviceHandle() const { +MSCCLPP_API_CPP MemoryChannel::DeviceHandle MemoryChannel::deviceHandle() const { return DeviceHandle{.semaphore_ = semaphore_->deviceHandle(), .src_ = src_, .dst_ = dst_.data(), diff --git a/src/proxy_channel.cc b/src/port_channel.cc similarity index 68% rename from src/proxy_channel.cc rename to src/port_channel.cc index f2ca00674..e574af9fc 100644 --- a/src/proxy_channel.cc +++ b/src/port_channel.cc @@ -2,21 +2,21 @@ // Licensed under the MIT license. 
#include -#include +#include #include "api.h" #include "debug.h" namespace mscclpp { -MSCCLPP_API_CPP BaseProxyChannel::BaseProxyChannel(SemaphoreId semaphoreId, - std::shared_ptr semaphore, - std::shared_ptr proxy) +MSCCLPP_API_CPP BasePortChannel::BasePortChannel(SemaphoreId semaphoreId, + std::shared_ptr semaphore, + std::shared_ptr proxy) : semaphoreId_(semaphoreId), semaphore_(semaphore), proxy_(proxy) {} -MSCCLPP_API_CPP ProxyChannel::ProxyChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, - std::shared_ptr proxy, MemoryId dst, MemoryId src) - : BaseProxyChannel(semaphoreId, semaphore, proxy), dst_(dst), src_(src) {} +MSCCLPP_API_CPP PortChannel::PortChannel(SemaphoreId semaphoreId, std::shared_ptr semaphore, + std::shared_ptr proxy, MemoryId dst, MemoryId src) + : BasePortChannel(semaphoreId, semaphore, proxy), dst_(dst), src_(src) {} MSCCLPP_API_CPP ProxyService::ProxyService(size_t fifoSize) : proxy_(std::make_shared([&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }, @@ -46,12 +46,12 @@ MSCCLPP_API_CPP std::shared_ptr ProxyService::semaphore(Se return semaphores_[id]; } -MSCCLPP_API_CPP BaseProxyChannel ProxyService::baseProxyChannel(SemaphoreId id) { - return BaseProxyChannel(id, semaphores_[id], proxy_); +MSCCLPP_API_CPP BasePortChannel ProxyService::basePortChannel(SemaphoreId id) { + return BasePortChannel(id, semaphores_[id], proxy_); } -MSCCLPP_API_CPP ProxyChannel ProxyService::proxyChannel(SemaphoreId id, MemoryId dst, MemoryId src) { - return ProxyChannel(id, semaphores_[id], proxy_, dst, src); +MSCCLPP_API_CPP PortChannel ProxyService::portChannel(SemaphoreId id, MemoryId dst, MemoryId src) { + return PortChannel(id, semaphores_[id], proxy_, dst, src); } MSCCLPP_API_CPP void ProxyService::startProxy() { proxy_->start(); } @@ -95,13 +95,12 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger triggerRaw) { return result; } -MSCCLPP_API_CPP BaseProxyChannel::DeviceHandle BaseProxyChannel::deviceHandle() const { - return BaseProxyChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle()); +MSCCLPP_API_CPP BasePortChannel::DeviceHandle BasePortChannel::deviceHandle() const { + return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle()); } -MSCCLPP_API_CPP ProxyChannel::DeviceHandle ProxyChannel::deviceHandle() const { - return ProxyChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle(), dst_, - src_); +MSCCLPP_API_CPP PortChannel::DeviceHandle PortChannel::deviceHandle() const { + return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo().deviceHandle(), dst_, src_); } } // namespace mscclpp diff --git a/src/semaphore.cc b/src/semaphore.cc index c6238b532..b03ff6736 100644 --- a/src/semaphore.cc +++ b/src/semaphore.cc @@ -91,8 +91,8 @@ MSCCLPP_API_CPP void Host2HostSemaphore::wait(int64_t maxSpinCount) { } } -MSCCLPP_API_CPP SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& communicator, - std::shared_ptr connection) +MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::MemoryDevice2DeviceSemaphore(Communicator& communicator, + std::shared_ptr connection) : BaseSemaphore(createGpuSemaphoreId(), createGpuSemaphoreId(), createGpuSemaphoreId()) { INFO(MSCCLPP_INIT, "Creating a Device2Device semaphore for %s transport from %d to %d", connection->getTransportName().c_str(), communicator.bootstrap()->getRank(), @@ -107,8 +107,8 @@ MSCCLPP_API_CPP 
SmDevice2DeviceSemaphore::SmDevice2DeviceSemaphore(Communicator& } } -MSCCLPP_API_CPP SmDevice2DeviceSemaphore::DeviceHandle SmDevice2DeviceSemaphore::deviceHandle() const { - SmDevice2DeviceSemaphore::DeviceHandle device; +MSCCLPP_API_CPP MemoryDevice2DeviceSemaphore::DeviceHandle MemoryDevice2DeviceSemaphore::deviceHandle() const { + MemoryDevice2DeviceSemaphore::DeviceHandle device; device.remoteInboundSemaphoreId = isRemoteInboundSemaphoreIdSet_ ? reinterpret_cast(remoteInboundSemaphoreIdsRegMem_.get().data()) : nullptr; diff --git a/test/allgather_test_cpp.cu b/test/allgather_test_cpp.cu index 0f5d37759..836ee3e64 100644 --- a/test/allgather_test_cpp.cu +++ b/test/allgather_test_cpp.cu @@ -2,7 +2,7 @@ // Licensed under the MIT license. #include -#include +#include #ifdef MSCCLPP_USE_MPI_FOR_TESTS #include "mpi.h" @@ -40,25 +40,25 @@ static double getTime(void) { template using DeviceHandle = mscclpp::DeviceHandle; -__constant__ DeviceHandle constProxyChans[16]; +__constant__ DeviceHandle constPortChans[16]; -__device__ void allgather0(DeviceHandle proxyChan, int rank, size_t nelemsPerGPU) { +__device__ void allgather0(DeviceHandle portChan, int rank, size_t nelemsPerGPU) { // this allgather is really simple and implemented as an alltoall // this thread's role is a sender role // put your data asynchronously - if ((threadIdx.x % 32) == 0) proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + if ((threadIdx.x % 32) == 0) portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); // make sure everyone is put their data before some thread randomly blocks everyone else in signal __syncthreads(); // push with flag and sync to make sure the data is received - if ((threadIdx.x % 32) == 0) proxyChan.flush(); + if ((threadIdx.x % 32) == 0) portChan.flush(); // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + if ((threadIdx.x % 32) == 0) portChan.wait(); } -__device__ void localAllGather(DeviceHandle proxyChan, int rank, int nranksPerNode, - int remoteRank, uint64_t offset, uint64_t size) { +__device__ void localAllGather(DeviceHandle portChan, int rank, int nranksPerNode, int remoteRank, + uint64_t offset, uint64_t size) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode // and waits for data from GPU rank (i-1) % nranksPerNode @@ -68,11 +68,11 @@ __device__ void localAllGather(DeviceHandle proxyChan, in for (int i = 1; i < nranksPerNode; i++) { if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) { // put your data to GPU (rank+i) % nranksPerNode and signal in one call - if ((threadIdx.x % 32) == 0) proxyChan.putWithSignal(offset, size); + if ((threadIdx.x % 32) == 0) portChan.putWithSignal(offset, size); } // wait for the data from GPU (rank-i) % nranksPerNode to arrive if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) { - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + if ((threadIdx.x % 32) == 0) portChan.wait(); } #if defined(__HIP_PLATFORM_AMD__) // NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct. 
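For context on the renamed host-side path used throughout these test hunks: a kernel only ever sees a `DeviceHandle<mscclpp::PortChannel>`; the host builds the channel through `ProxyService` and copies the handle to the GPU. Below is a minimal sketch of that flow, assuming a `Communicator`, an established `Connection`, and registered local/remote memories already exist; the helper name `makePortChannelHandle` is illustrative, not part of the API.

```cpp
#include <memory>

#include <mscclpp/core.hpp>
#include <mscclpp/port_channel.hpp>

// Illustrative helper mirroring setupMscclppConnections() below: build one
// PortChannel over an existing connection and hand back its device handle.
mscclpp::DeviceHandle<mscclpp::PortChannel> makePortChannelHandle(
    mscclpp::Communicator& comm, mscclpp::ProxyService& proxyService,
    std::shared_ptr<mscclpp::Connection> conn, mscclpp::RegisteredMemory remoteMem,
    mscclpp::RegisteredMemory localMem) {
  // Host-side semaphore that the proxy signals/polls on behalf of the GPU.
  mscclpp::SemaphoreId sid = proxyService.buildAndAddSemaphore(comm, conn);
  // Register destination (remote) and source (local) memories with the proxy.
  mscclpp::MemoryId dst = proxyService.addMemory(remoteMem);
  mscclpp::MemoryId src = proxyService.addMemory(localMem);
  // The renamed API: ProxyService::portChannel() replaces proxyChannel().
  return proxyService.portChannel(sid, dst, src).deviceHandle();
}
```

The caller then copies the handle into device memory (e.g., `cudaMemcpyToSymbol` into a `__constant__` array, as these tests do) and calls `proxyService.startProxy()` before launching any kernel that uses it.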
@@ -83,15 +83,15 @@ __device__ void localAllGather(DeviceHandle proxyChan, in } } -__device__ void allgather1(DeviceHandle proxyChan, int rank, int nranksPerNode, int remoteRank, +__device__ void allgather1(DeviceHandle portChan, int rank, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); if (remoteRank / nranksPerNode == rank / nranksPerNode) - if ((threadIdx.x % 32) == 0) proxyChan.flush(); + if ((threadIdx.x % 32) == 0) portChan.flush(); } -__device__ void allgather2(DeviceHandle proxyChan, int rank, int world_size, int nranksPerNode, +__device__ void allgather2(DeviceHandle portChan, int rank, int world_size, int nranksPerNode, int remoteRank, size_t nelemsPerGPU) { // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: @@ -108,16 +108,16 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra // Step 1 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } // cross-node exchange if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), - (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + if ((threadIdx.x % 32) == 0) portChan.wait(); } __syncthreads(); @@ -126,7 +126,7 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra // local allgather int otherNghr = (rank + nranksPerNode) % world_size; if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); } @@ -134,9 +134,9 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra if (remoteRank % nranksPerNode == rank % nranksPerNode) { // opposite side if ((threadIdx.x % 32) == 0) - proxyChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), - nelemsPerGPU / pipelineSize * sizeof(int)); - if ((threadIdx.x % 32) == 0) proxyChan.wait(); + portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); + if ((threadIdx.x % 32) == 0) portChan.wait(); } __syncthreads(); @@ -144,29 +144,29 @@ __device__ void allgather2(DeviceHandle proxyChan, int ra // Step 3 // local allgather if (remoteRank / nranksPerNode == rank / nranksPerNode) { - localAllGather(proxyChan, rank, nranksPerNode, remoteRank, + localAllGather(portChan, rank, nranksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } if (remoteRank / nranksPerNode == rank / nranksPerNode || remoteRank % nranksPerNode == rank % nranksPerNode) { 
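// flush() returns only after the host proxy has completed every put
// previously issued on this channel, so the kernel cannot exit with
// transfers still in flight.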
- if ((threadIdx.x % 32) == 0) proxyChan.flush(); + if ((threadIdx.x % 32) == 0) portChan.flush(); } } __global__ void kernel(int rank, int world_size, int nranksPerNode, size_t nelemsPerGPU, int kernel) { - // find the mapping between remoteRank and proxyChans + // find the mapping between remoteRank and portChans int warpId = threadIdx.x / 32; int remoteRank = (warpId < rank) ? warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; if (kernel == 0) - allgather0(proxyChan, rank, nelemsPerGPU); + allgather0(portChan, rank, nelemsPerGPU); else if (kernel == 1) - allgather1(proxyChan, rank, nranksPerNode, remoteRank, nelemsPerGPU); + allgather1(portChan, rank, nranksPerNode, remoteRank, nelemsPerGPU); else if (kernel == 2) - allgather2(proxyChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); + allgather2(portChan, rank, world_size, nranksPerNode, remoteRank, nelemsPerGPU); } int rankToLocalRank(int rank) { return rank % nranksPerNode; } @@ -234,17 +234,17 @@ void setupMscclppConnections(int rank, int world_size, mscclpp::Communicator& co comm.setup(); - std::vector> proxyChannels; + std::vector> portChannels; for (size_t i = 0; i < semaphoreIds.size(); ++i) { - proxyChannels.push_back(mscclpp::deviceHandle(proxyService.proxyChannel( + portChannels.push_back(mscclpp::deviceHandle(proxyService.portChannel( semaphoreIds[i], proxyService.addMemory(remoteMemories[i].get()), proxyService.addMemory(localMemories[i])))); } - if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle)) { + if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle)) { std::runtime_error("unexpected error"); } - CUDACHECK(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(), - sizeof(DeviceHandle) * proxyChannels.size())); + CUDACHECK(cudaMemcpyToSymbol(constPortChans, portChannels.data(), + sizeof(DeviceHandle) * portChannels.size())); } void printUsage(const char* prog, bool isMpi) { diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt index 8e37d2405..007e3e6dd 100644 --- a/test/mp_unit/CMakeLists.txt +++ b/test/mp_unit/CMakeLists.txt @@ -6,7 +6,7 @@ target_sources(mp_unit_tests PRIVATE bootstrap_tests.cc ib_tests.cu communicator_tests.cu - proxy_channel_tests.cu - sm_channel_tests.cu + port_channel_tests.cu + memory_channel_tests.cu executor_tests.cc ) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/memory_channel_tests.cu similarity index 64% rename from test/mp_unit/sm_channel_tests.cu rename to test/mp_unit/memory_channel_tests.cu index af4aa2985..daa0423d6 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/memory_channel_tests.cu @@ -5,7 +5,7 @@ #include "mp_unit_tests.hpp" -void SmChannelOneToOneTest::SetUp() { +void MemoryChannelOneToOneTest::SetUp() { // Need at least two ranks within a node if (gEnv->nRanksPerNode < 2) { GTEST_SKIP(); @@ -15,10 +15,11 @@ void SmChannelOneToOneTest::SetUp() { CommunicatorTestBase::SetUp(); } -void SmChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } +void MemoryChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } -void SmChannelOneToOneTest::setupMeshConnections(std::vector& smChannels, void* inputBuff, - size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes) { +void MemoryChannelOneToOneTest::setupMeshConnections(std::vector& memoryChannels, + void* inputBuff, size_t inputBuffBytes, void* outputBuff, + size_t 
outputBuffBytes) { const int rank = communicator->bootstrap()->getRank(); const int worldSize = communicator->bootstrap()->getNranks(); const bool isInPlace = (outputBuff == nullptr); @@ -59,34 +60,35 @@ void SmChannelOneToOneTest::setupMeshConnections(std::vector } connections[r] = connectionFutures[r].get(); - smSemaphores[r] = std::make_shared(*communicator, connections[r]); + memorySemaphores[r] = std::make_shared(*communicator, connections[r]); - smChannels.emplace_back(smSemaphores[r], remoteMemFutures[r].get(), inputBufRegMem.data(), - (isInPlace ? nullptr : outputBufRegMem.data())); + memoryChannels.emplace_back(memorySemaphores[r], remoteMemFutures[r].get(), inputBufRegMem.data(), + (isInPlace ? nullptr : outputBufRegMem.data())); } communicator->setup(); } -__constant__ DeviceHandle gChannelOneToOneTestConstSmChans; +__constant__ DeviceHandle gChannelOneToOneTestConstMemChans; -void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, PacketPingPongKernelWrapper kernelWrapper) { +void MemoryChannelOneToOneTest::packetPingPongTest(const std::string testName, + PacketPingPongKernelWrapper kernelWrapper) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; const int defaultNTries = 1000; - std::vector smChannels; + std::vector memoryChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); std::shared_ptr intermBuff = mscclpp::GpuBuffer(nElem * 2).memory(); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + setupMeshConnections(memoryChannels, buff.get(), nElem * sizeof(int), intermBuff.get(), nElem * 2 * sizeof(int)); + std::vector> deviceHandles(memoryChannels.size()); + std::transform(memoryChannels.begin(), memoryChannels.end(), deviceHandles.begin(), + [](const mscclpp::MemoryChannel& memChan) { return mscclpp::deviceHandle(memChan); }); - ASSERT_EQ(smChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(memoryChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstMemChans, deviceHandles.data(), + sizeof(DeviceHandle))); std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); @@ -125,15 +127,15 @@ void SmChannelOneToOneTest::packetPingPongTest(const std::string testName, Packe } } -__global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; +__global__ void kernelMemPutPingPong(int* buff, int rank, int nElem, int* ret) { + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* sendBuff = (volatile int*)buff; int nTries = 1000; int rank1Offset = 10000000; for (int i = 0; i < nTries; i++) { if (rank == 0) { if (i > 0) { - if (threadIdx.x == 0) smChan.wait(); + if (threadIdx.x == 0) memChan.wait(); __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (sendBuff[j] != rank1Offset + i - 1 + j) { @@ -147,11 +149,11 @@ __global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { sendBuff[j] = i + j; } __syncthreads(); - smChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); - if (threadIdx.x == 0) smChan.signal(); + memChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); + if 
(threadIdx.x == 0) memChan.signal(); } if (rank == 1) { - if (threadIdx.x == 0) smChan.wait(); + if (threadIdx.x == 0) memChan.wait(); __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (sendBuff[j] != i + j) { @@ -165,59 +167,59 @@ __global__ void kernelSmPutPingPong(int* buff, int rank, int nElem, int* ret) { sendBuff[j] = rank1Offset + i + j; } __syncthreads(); - smChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); - if (threadIdx.x == 0) smChan.signal(); + memChan.put(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); + if (threadIdx.x == 0) memChan.signal(); } } } } -TEST_F(SmChannelOneToOneTest, PutPingPong) { +TEST_F(MemoryChannelOneToOneTest, PutPingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector smChannels; + std::vector memoryChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + setupMeshConnections(memoryChannels, buff.get(), nElem * sizeof(int)); + std::vector> deviceHandles(memoryChannels.size()); + std::transform(memoryChannels.begin(), memoryChannels.end(), deviceHandles.begin(), + [](const mscclpp::MemoryChannel& memChan) { return mscclpp::deviceHandle(memChan); }); - ASSERT_EQ(smChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(memoryChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstMemChans, deviceHandles.data(), + sizeof(DeviceHandle))); std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); + kernelMemPutPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); } -__global__ void kernelSmGetPingPong(int* buff, int rank, int nElem, int* ret) { +__global__ void kernelMemGetPingPong(int* buff, int rank, int nElem, int* ret) { if (rank > 1) return; - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* buffPtr = (volatile int*)buff; int offset0 = (rank == 0) ? 0 : 10000000; int offset1 = (rank == 0) ? 
10000000 : 0; @@ -231,14 +233,14 @@ __global__ void kernelSmGetPingPong(int* buff, int rank, int nElem, int* ret) { buffPtr[j] = offset0 + i + j; } if (threadIdx.x == 0) { - smChan.signal(); + memChan.signal(); } } else { if (threadIdx.x == 0) { - smChan.wait(); + memChan.wait(); } __syncthreads(); - smChan.get(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); + memChan.get(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x); __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { if (buffPtr[j] != offset1 + i + j) { @@ -251,52 +253,52 @@ __global__ void kernelSmGetPingPong(int* buff, int rank, int nElem, int* ret) { } } -TEST_F(SmChannelOneToOneTest, GetPingPong) { +TEST_F(MemoryChannelOneToOneTest, GetPingPong) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector smChannels; + std::vector memoryChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(smChannels, buff.get(), nElem * sizeof(int)); - std::vector> deviceHandles(smChannels.size()); - std::transform(smChannels.begin(), smChannels.end(), deviceHandles.begin(), - [](const mscclpp::SmChannel& smChan) { return mscclpp::deviceHandle(smChan); }); + setupMeshConnections(memoryChannels, buff.get(), nElem * sizeof(int)); + std::vector> deviceHandles(memoryChannels.size()); + std::transform(memoryChannels.begin(), memoryChannels.end(), deviceHandles.begin(), + [](const mscclpp::MemoryChannel& memChan) { return mscclpp::deviceHandle(memChan); }); ASSERT_EQ(deviceHandles.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstSmChans, deviceHandles.data(), - sizeof(DeviceHandle))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstMemChans, deviceHandles.data(), + sizeof(DeviceHandle))); std::shared_ptr ret = mscclpp::detail::gpuCallocHostShared(); - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); *ret = 0; - kernelSmGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); + kernelMemGetPingPong<<<1, 1024>>>(buff.get(), gEnv->rank, 4 * 1024 * 1024, ret.get()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); EXPECT_EQ(*ret, 0); } -__global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { +__global__ void kernelMemLL8PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* sendBuff = (volatile int*)buff; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 
10000000 : 0; @@ -312,9 +314,9 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { @@ -331,10 +333,10 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r } } -__global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { +__global__ void kernelMemLL16PacketPingPong(int* buff, int rank, int nElem, int* ret, int nTries) { if (rank > 1) return; - DeviceHandle& smChan = gChannelOneToOneTestConstSmChans; + DeviceHandle& memChan = gChannelOneToOneTestConstMemChans; volatile int* sendBuff = (volatile int*)buff; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 10000000 : 0; @@ -349,9 +351,9 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + memChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). 
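// (Each LL packet pairs its payload with a flag word; getPackets() spins on
// that flag per packet, and a fresh flag value per round keeps stale packets
// from the previous iteration from reading as valid.)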
// __syncthreads(); for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { @@ -374,16 +376,16 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* } } -TEST_F(SmChannelOneToOneTest, LL8PacketPingPong) { - auto kernelSmLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { - kernelSmLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); +TEST_F(MemoryChannelOneToOneTest, LL8PacketPingPong) { + auto kernelMemLL8PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelMemLL8PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; - packetPingPongTest("smLL8PacketPingPong", kernelSmLL8PacketPingPongWrapper); + packetPingPongTest("memoryLL8PacketPingPong", kernelMemLL8PacketPingPongWrapper); } -TEST_F(SmChannelOneToOneTest, LL16PacketPingPong) { - auto kernelSmLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { - kernelSmLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); +TEST_F(MemoryChannelOneToOneTest, LL16PacketPingPong) { + auto kernelMemLL16PacketPingPongWrapper = [](int* buff, int rank, int nElem, int* ret, int nTries) { + kernelMemLL16PacketPingPong<<<1, 1024>>>(buff, rank, nElem, ret, nTries); }; - packetPingPongTest("smLL16PacketPingPong", kernelSmLL16PacketPingPongWrapper); + packetPingPongTest("memoryLL16PacketPingPong", kernelMemLL16PacketPingPongWrapper); } diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index c00ecb6b6..a2d8ac74f 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -8,9 +8,9 @@ #include #include +#include #include -#include -#include +#include #include #include "ib.hpp" @@ -128,7 +128,7 @@ class CommunicatorTest : public CommunicatorTestBase { template using DeviceHandle = mscclpp::DeviceHandle; -class ProxyChannelOneToOneTest : public CommunicatorTestBase { +class PortChannelOneToOneTest : public CommunicatorTestBase { protected: struct PingPongTestParams { bool useIPC; @@ -140,9 +140,8 @@ class ProxyChannelOneToOneTest : public CommunicatorTestBase { void SetUp() override; void TearDown() override; - void setupMeshConnections(std::vector& proxyChannels, bool useIPC, bool useIb, - bool useEthernet, void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, - size_t recvBuffBytes = 0); + void setupMeshConnections(std::vector& portChannels, bool useIPC, bool useIb, bool useEthernet, + void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, size_t recvBuffBytes = 0); void testPingPong(PingPongTestParams params); void testPingPongPerf(PingPongTestParams params); void testPacketPingPong(bool useIbOnly); @@ -151,17 +150,17 @@ class ProxyChannelOneToOneTest : public CommunicatorTestBase { std::shared_ptr proxyService; }; -class SmChannelOneToOneTest : public CommunicatorTestBase { +class MemoryChannelOneToOneTest : public CommunicatorTestBase { protected: void SetUp() override; void TearDown() override; - void setupMeshConnections(std::vector& smChannels, void* inputBuff, size_t inputBuffBytes, + void setupMeshConnections(std::vector& memoryChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff = nullptr, size_t outputBuffBytes = 0); using PacketPingPongKernelWrapper = std::function; void packetPingPongTest(const std::string testName, PacketPingPongKernelWrapper kernelWrapper); - std::unordered_map> smSemaphores; + std::unordered_map> memorySemaphores; }; class ExecutorTest : public MultiProcessTest { diff --git 
a/test/mp_unit/proxy_channel_tests.cu b/test/mp_unit/port_channel_tests.cu similarity index 74% rename from test/mp_unit/proxy_channel_tests.cu rename to test/mp_unit/port_channel_tests.cu index 192985b47..f49c23306 100644 --- a/test/mp_unit/proxy_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -6,18 +6,18 @@ #include "mp_unit_tests.hpp" -void ProxyChannelOneToOneTest::SetUp() { +void PortChannelOneToOneTest::SetUp() { // Use only two ranks setNumRanksToUse(2); CommunicatorTestBase::SetUp(); proxyService = std::make_shared(); } -void ProxyChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } +void PortChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } -void ProxyChannelOneToOneTest::setupMeshConnections(std::vector& proxyChannels, bool useIPC, - bool useIb, bool useEthernet, void* sendBuff, size_t sendBuffBytes, - void* recvBuff, size_t recvBuffBytes) { +void PortChannelOneToOneTest::setupMeshConnections(std::vector& portChannels, bool useIPC, + bool useIb, bool useEthernet, void* sendBuff, size_t sendBuffBytes, + void* recvBuff, size_t recvBuffBytes) { const int rank = communicator->bootstrap()->getRank(); const int worldSize = communicator->bootstrap()->getNranks(); const bool isInPlace = (recvBuff == nullptr); @@ -64,17 +64,17 @@ void ProxyChannelOneToOneTest::setupMeshConnections(std::vectorbuildAndAddSemaphore(*communicator, connectionFutures[r].get()); - proxyChannels.emplace_back(proxyService->proxyChannel(cid, proxyService->addMemory(remoteMemFutures[r].get()), - proxyService->addMemory(sendBufRegMem))); + portChannels.emplace_back(proxyService->portChannel(cid, proxyService->addMemory(remoteMemFutures[r].get()), + proxyService->addMemory(sendBufRegMem))); } communicator->setup(); } -__constant__ DeviceHandle gChannelOneToOneTestConstProxyChans; +__constant__ DeviceHandle gChannelOneToOneTestConstPortChans; __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWithPoll, int nTries, int* ret) { - DeviceHandle& proxyChan = gChannelOneToOneTestConstProxyChans; + DeviceHandle& portChan = gChannelOneToOneTestConstPortChans; volatile int* sendBuff = (volatile int*)buff; int flusher = 0; int rank1Offset = 10000000; @@ -84,7 +84,7 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit if (threadIdx.x == 0) { if (waitWithPoll) { int spin = 1000000; - while (!proxyChan.poll() && spin > 0) { + while (!portChan.poll() && spin > 0) { spin--; } if (spin == 0) { @@ -92,7 +92,7 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit *ret = 1; } } else { - proxyChan.wait(); + portChan.wait(); } } __syncthreads(); @@ -109,13 +109,13 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit } __syncthreads(); // __threadfence_system(); // not necessary if we make sendBuff volatile - if (threadIdx.x == 0) proxyChan.putWithSignal(0, nElem * sizeof(int)); + if (threadIdx.x == 0) portChan.putWithSignal(0, nElem * sizeof(int)); } if (rank == 1) { if (threadIdx.x == 0) { if (waitWithPoll) { int spin = 1000000; - while (!proxyChan.poll() && spin > 0) { + while (!portChan.poll() && spin > 0) { spin--; } if (spin == 0) { @@ -123,7 +123,7 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit *ret = 1; } } else { - proxyChan.wait(); + portChan.wait(); } } __syncthreads(); @@ -140,32 +140,32 @@ __global__ void kernelProxyPingPong(int* buff, int rank, int nElem, bool waitWit } __syncthreads(); // __threadfence_system(); // not 
necessary if we make sendBuff volatile - if (threadIdx.x == 0) proxyChan.putWithSignal(0, nElem * sizeof(int)); + if (threadIdx.x == 0) portChan.putWithSignal(0, nElem * sizeof(int)); } } flusher++; if (flusher == 1) { - if (threadIdx.x == 0) proxyChan.flush(); + if (threadIdx.x == 0) portChan.flush(); flusher = 0; } } } -void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { +void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); - std::vector> proxyChannelHandles; - for (auto& ch : proxyChannels) proxyChannelHandles.push_back(ch.deviceHandle()); + std::vector> portChannelHandles; + for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); - ASSERT_EQ(proxyChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(portChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); proxyService->startProxy(); @@ -196,21 +196,21 @@ void ProxyChannelOneToOneTest::testPingPong(PingPongTestParams params) { proxyService->stopProxy(); } -void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { +void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(proxyChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); - std::vector> proxyChannelHandles; - for (auto& ch : proxyChannels) proxyChannelHandles.push_back(ch.deviceHandle()); + std::vector> portChannelHandles; + for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); - ASSERT_EQ(proxyChannels.size(), 1); - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + ASSERT_EQ(portChannels.size(), 1); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); proxyService->startProxy(); @@ -240,46 +240,46 @@ void ProxyChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { proxyService->stopProxy(); } -TEST_F(ProxyChannelOneToOneTest, PingPong) { +TEST_F(PortChannelOneToOneTest, PingPong) { testPingPong(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongIb) { +TEST_F(PortChannelOneToOneTest, PingPongIb) { testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongEthernet) { +TEST_F(PortChannelOneToOneTest, PingPongEthernet) { testPingPong(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = 
false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongWithPoll) { +TEST_F(PortChannelOneToOneTest, PingPongWithPoll) { testPingPong(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = true}); } -TEST_F(ProxyChannelOneToOneTest, PingPongIbWithPoll) { +TEST_F(PortChannelOneToOneTest, PingPongIbWithPoll) { testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true}); } -TEST_F(ProxyChannelOneToOneTest, PingPongPerf) { +TEST_F(PortChannelOneToOneTest, PingPongPerf) { testPingPongPerf(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongPerfIb) { +TEST_F(PortChannelOneToOneTest, PingPongPerfIb) { testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); } -TEST_F(ProxyChannelOneToOneTest, PingPongPerfEthernet) { +TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) { testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false}); } -__device__ mscclpp::DeviceSyncer gChannelOneToOneTestProxyChansSyncer; +__device__ mscclpp::DeviceSyncer gChannelOneToOneTestPortChansSyncer; template __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, mscclpp::LLPacket* getPktBuf, int rank, int nElem, int nTries, int* ret) { if (rank > 1) return; - DeviceHandle& proxyChan = gChannelOneToOneTestConstProxyChans; + DeviceHandle& portChan = gChannelOneToOneTestConstPortChans; volatile int* buffPtr = (volatile int*)buff; int putOffset = (rank == 0) ? 0 : 10000000; int getOffset = (rank == 0) ? 10000000 : 0; @@ -302,14 +302,14 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m // __syncthreads(); } mscclpp::putPackets(putPktBuf, 0, buff, 0, nElem * sizeof(int), threadId, numThreads, flag); - gChannelOneToOneTestProxyChansSyncer.sync(gridDim.x); + gChannelOneToOneTestPortChansSyncer.sync(gridDim.x); if (threadId == 0) { // Send data from the local putPacketBuffer to the remote getPacketBuffer - proxyChan.put(0, nPkt * sizeof(mscclpp::LLPacket)); + portChan.put(0, nPkt * sizeof(mscclpp::LLPacket)); } flusher++; if (flusher == 64) { - if (threadId == 0) proxyChan.flush(); + if (threadId == 0) portChan.flush(); flusher = 0; } } else { @@ -333,38 +333,38 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m } } // Make sure all threads are done in this iteration - gChannelOneToOneTestProxyChansSyncer.sync(gridDim.x); + gChannelOneToOneTestPortChansSyncer.sync(gridDim.x); } } } -void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { +void PortChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); auto putPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); - setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), + setupMeshConnections(portChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); - ASSERT_EQ(proxyChannels.size(), 1); + ASSERT_EQ(portChannels.size(), 1); - std::vector> 
proxyChannelHandles; - for (auto& proxyChannel : proxyChannels) { - proxyChannelHandles.push_back(proxyChannel.deviceHandle()); + std::vector> portChannelHandles; + for (auto& portChannel : portChannels) { + portChannelHandles.push_back(portChannel.deviceHandle()); } - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); mscclpp::DeviceSyncer syncer = {}; - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestProxyChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestPortChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); proxyService->startProxy(); @@ -405,33 +405,33 @@ void ProxyChannelOneToOneTest::testPacketPingPong(bool useIbOnly) { proxyService->stopProxy(); } -void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { +void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; - std::vector proxyChannels; + std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); const size_t nPacket = (nElem * sizeof(int) + sizeof(uint64_t) - 1) / sizeof(uint64_t); auto putPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); - setupMeshConnections(proxyChannels, !useIbOnly, true, false, putPacketBuffer.get(), + setupMeshConnections(portChannels, !useIbOnly, true, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); - ASSERT_EQ(proxyChannels.size(), 1); + ASSERT_EQ(portChannels.size(), 1); - std::vector> proxyChannelHandles; - for (auto& proxyChannel : proxyChannels) { - proxyChannelHandles.push_back(proxyChannel.deviceHandle()); + std::vector> portChannelHandles; + for (auto& portChannel : portChannels) { + portChannelHandles.push_back(portChannel.deviceHandle()); } - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstProxyChans, proxyChannelHandles.data(), - sizeof(DeviceHandle))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestConstPortChans, portChannelHandles.data(), + sizeof(DeviceHandle))); mscclpp::DeviceSyncer syncer = {}; - MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestProxyChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gChannelOneToOneTestPortChansSyncer, &syncer, sizeof(mscclpp::DeviceSyncer))); proxyService->startProxy(); @@ -461,10 +461,10 @@ void ProxyChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) { proxyService->stopProxy(); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); } +TEST_F(PortChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } -TEST_F(ProxyChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); } +TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); } diff --git a/test/mscclpp-test/allgather_test.cu 
b/test/mscclpp-test/allgather_test.cu index 27506f340..17319bc9e 100644 --- a/test/mscclpp-test/allgather_test.cu +++ b/test/mscclpp-test/allgather_test.cu @@ -21,37 +21,37 @@ constexpr uint64_t MAGIC = 0xdeadbeef; template using DeviceHandle = mscclpp::DeviceHandle; -__constant__ DeviceHandle constProxyChans[16]; -__constant__ DeviceHandle constRawProxyChan[16]; +__constant__ DeviceHandle constPortChans[16]; +__constant__ DeviceHandle constRawPortChan[16]; -__constant__ DeviceHandle constSmChans[512]; -__constant__ DeviceHandle constSmOutOfPlaceChans[16]; +__constant__ DeviceHandle constMemChans[512]; +__constant__ DeviceHandle constMemOutOfPlaceChans[16]; __device__ uint64_t globalFlag; __global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) { int warpId = threadIdx.x / WARP_SIZE; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; // this allgather is really simple and implemented as an alltoall // this thread's role is a sender role // put your data asynchronously if (threadIdx.x % WARP_SIZE == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } // make sure everyone is put their data before some thread randomly blocks everyone else in signal __syncthreads(); // push with flag and sync to make sure the data is received - if (threadIdx.x % WARP_SIZE == 0) proxyChan.flush(); + if (threadIdx.x % WARP_SIZE == 0) portChan.flush(); // this thread's role is a receiver role. wait on the semaphore to make sure the data is ready - if (threadIdx.x % WARP_SIZE == 0) proxyChan.wait(); + if (threadIdx.x % WARP_SIZE == 0) portChan.wait(); } -__device__ void localAllGather(DeviceHandle proxyChan, int rank, int nRanksPerNode, - int remoteRank, uint64_t offset, uint64_t size, bool flushAfterSignal = true) { +__device__ void localAllGather(DeviceHandle portChan, int rank, int nRanksPerNode, int remoteRank, + uint64_t offset, uint64_t size, bool flushAfterSignal = true) { // this allgather algorithm works as follows: // Step 1: GPU rank i sends data to GPU rank (i+1) % nRanksPerNode // and waits for data from GPU rank (i-1) % nRanksPerNode @@ -61,12 +61,12 @@ __device__ void localAllGather(DeviceHandle proxyChan, in for (int i = 1; i < nRanksPerNode; i++) { if ((remoteRank % nRanksPerNode) == ((rank + i) % nRanksPerNode)) { // put your data to GPU (rank+i) % nRanksPerNode and signal in one call - if (flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) proxyChan.putWithSignalAndFlush(offset, size); - if (!flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) proxyChan.putWithSignal(offset, size); + if (flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) portChan.putWithSignalAndFlush(offset, size); + if (!flushAfterSignal && (threadIdx.x % WARP_SIZE) == 0) portChan.putWithSignal(offset, size); } // wait for the data from GPU (rank-i) % nRanksPerNode to arrive if ((remoteRank % nRanksPerNode) == ((rank - i + nRanksPerNode) % nRanksPerNode)) { - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.wait(); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait(); } #if defined(__HIP_PLATFORM_AMD__) // NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct. 
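The putWithSignal/wait/flush handshake that allgather0 through allgather2 build on can be read in isolation. Here is a minimal sketch, assuming `constPortChans` has been populated from the host as in the setup code for these tests; `WARP_SIZE` is assumed to be 32 and the kernel name `pingPeer` is illustrative.

```cpp
#include <mscclpp/port_channel.hpp>

#define WARP_SIZE 32

// Assumed populated via cudaMemcpyToSymbol from the host, as in the setup
// code for these tests.
__constant__ mscclpp::DeviceHandle<mscclpp::PortChannel> constPortChans[16];

// Illustrative kernel: one warp per peer, and a single lane per warp issues
// the port-channel calls, mirroring allgather0() above.
__global__ void pingPeer(int rank, size_t bytesPerRank) {
  mscclpp::DeviceHandle<mscclpp::PortChannel>& portChan = constPortChans[threadIdx.x / WARP_SIZE];
  if (threadIdx.x % WARP_SIZE == 0) {
    // Ask the proxy to copy this rank's chunk to the peer and raise the
    // peer's semaphore once the copy completes.
    portChan.putWithSignal(rank * bytesPerRank, bytesPerRank);
    // Block until the proxy has drained the request, keeping outstanding
    // work bounded.
    portChan.flush();
    // Wait on the local semaphore for the peer's symmetric putWithSignal().
    portChan.wait();
  }
}
```

Under the hood each of these calls is a push onto the proxy FIFO; allgather3() below makes that explicit by driving `portChan.fifo_` directly.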
@@ -80,8 +80,8 @@ __device__ void localAllGather(DeviceHandle proxyChan, in __device__ mscclpp::DeviceSyncer deviceSyncer; // This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1). -__device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk, - uint64_t rankChunkSize, uint64_t size, size_t nBlocks) { +__device__ void localAllGatherMem(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk, + uint64_t rankChunkSize, uint64_t size, size_t nBlocks) { if (nRanksPerNode == 1) return; if (blockIdx.x >= nBlocks) return; const size_t nPeer = nRanksPerNode - 1; @@ -117,12 +117,12 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk sizeForThisBlock += lastChunkSize; } if (threadIdx.x == 0 && peerLocalBlockIdx == 0) { - constSmChans[peerIdx].signal(); - constSmChans[peerIdx].wait(); + constMemChans[peerIdx].signal(); + constMemChans[peerIdx].wait(); } deviceSyncer.sync(nBlocks); size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk; - constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); + constMemChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x); } __global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) { @@ -130,9 +130,9 @@ __global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, int remoteRank = (warpId < rank) ? warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int)); } @@ -141,7 +141,7 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int int remoteRank = (warpId < rank) ? 
warpId : warpId + 1; // Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constProxyChans[warpId]; + DeviceHandle portChan = constPortChans[warpId]; // this allgather is a pipelined and hierarchical one and only works for two nodes // it is implemented as follows: @@ -158,16 +158,16 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int // Step 1 // local allgather if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nRanksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int), false); } // cross-node exchange if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { // opposite side if ((threadIdx.x % WARP_SIZE) == 0) - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), - (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.wait(); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int)); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait(); } // sync here to make sure IB flush dose not block the CUDA IPC traffic @@ -175,7 +175,7 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int // need to flush ib channel here to avoid cq overflow. since we won't change send suffer after send, we don't need // to flush for IPC channel. if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.flush(); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush(); } __syncthreads(); @@ -183,7 +183,7 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int // local allgather int otherNghr = (rank + nRanksPerNode) % worldSize; if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), + localAllGather(portChan, rank, nRanksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int), (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int), false); } @@ -191,21 +191,21 @@ __global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { // opposite side if ((threadIdx.x % WARP_SIZE) == 0) - proxyChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), - nelemsPerGPU / pipelineSize * sizeof(int)); - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.wait(); + portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), + nelemsPerGPU / pipelineSize * sizeof(int)); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.wait(); } __syncthreads(); if (remoteRank % nRanksPerNode == rank % nRanksPerNode) { - if ((threadIdx.x % WARP_SIZE) == 0) proxyChan.flush(); + if ((threadIdx.x % WARP_SIZE) == 0) portChan.flush(); } __syncthreads(); // Step 3 // local allgather if (remoteRank / nRanksPerNode == rank / nRanksPerNode) { - localAllGather(proxyChan, rank, nRanksPerNode, remoteRank, + localAllGather(portChan, rank, nRanksPerNode, remoteRank, (otherNghr * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int), nelemsPerGPU / pipelineSize * sizeof(int)); } @@ -215,7 +215,7 @@ __global__ void __launch_bounds__(1024) allgather3() { int warpId = threadIdx.x / WARP_SIZE; // 
Each warp is responsible for one of the remote ranks - DeviceHandle proxyChan = constRawProxyChan[warpId]; + DeviceHandle portChan = constRawPortChan[warpId]; int tid = threadIdx.x; __syncthreads(); @@ -224,12 +224,12 @@ __global__ void __launch_bounds__(1024) allgather3() { trigger.fst = MAGIC; trigger.snd = 0; // offload all the work to the proxy - uint64_t currentFifoHead = proxyChan.fifo_.push(trigger); + uint64_t currentFifoHead = portChan.fifo_.push(trigger); // wait for the work to be done in cpu side - proxyChan.fifo_.sync(currentFifoHead); + portChan.fifo_.sync(currentFifoHead); } if (tid % WARP_SIZE == 0) { - proxyChan.wait(); + portChan.wait(); } } @@ -248,14 +248,14 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int int peerRank = (rank + nRanksPerNode) % worldSize; int peerNodeId = peerRank / nRanksPerNode; int peer = (peerRank < rank) ? peerRank : peerRank - 1; - DeviceHandle& proxyChan = constProxyChans[peer]; + DeviceHandle& portChan = constPortChans[peer]; const size_t nBlocksForLocalAllGather = gridDim.x; const size_t rankChunkSize = nelemsPerGPU * sizeof(int); const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode; const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode; if (peerNodeId == rank / nRanksPerNode) { - localAllGatherSm(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather); return; } @@ -266,29 +266,29 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int // Step 1 if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes); } - localAllGatherSm(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, - nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(nBlocksForLocalAllGather); // Step 2 if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); + portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes); } - localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes, + nBlocksForLocalAllGather); if (threadIdx.x == 0 && blockIdx.x == 0) { - proxyChan.wait(); - proxyChan.flush(); + portChan.wait(); + portChan.flush(); } deviceSyncer.sync(nBlocksForLocalAllGather); // Step 3 - localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, - nBlocksForLocalAllGather); + localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes, + nBlocksForLocalAllGather); } __global__ void __launch_bounds__(1024, 1) @@ -304,11 +304,11 @@ __global__ void __launch_bounds__(1024, 1) const size_t nWarp = nThread / WARP_SIZE; const size_t nPeer = nRanksPerNode - 1; const size_t chanOffset = nPeer * blockIdx.x; - auto smChans = constSmChans + chanOffset; + auto memChans = constMemChans + chanOffset; if (wid < nPeer && 
@@ -248,14 +248,14 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int
   int peerRank = (rank + nRanksPerNode) % worldSize;
   int peerNodeId = peerRank / nRanksPerNode;
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constProxyChans[peer];
+  DeviceHandle& portChan = constPortChans[peer];
   const size_t nBlocksForLocalAllGather = gridDim.x;
   const size_t rankChunkSize = nelemsPerGPU * sizeof(int);
   const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode;
   const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode;
   if (peerNodeId == rank / nRanksPerNode) {
-    localAllGatherSm(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather);
+    localAllGatherMem(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, nBlocksForLocalAllGather);
     return;
   }
@@ -266,29 +266,29 @@ __global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int

   // Step 1
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(nBlocksForLocalAllGather);
   // Step 2
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(nBlocksForLocalAllGather);
   // Step 3
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
+                    nBlocksForLocalAllGather);
 }

 __global__ void __launch_bounds__(1024, 1)
@@ -304,11 +304,11 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
-  auto smChans = constSmChans + chanOffset;
+  auto memChans = constMemChans + chanOffset;

   if (wid < nPeer && lid == 0) {
-    smChans[wid].relaxedSignal();
-    smChans[wid].wait();
+    memChans[wid].relaxedSignal();
+    memChans[wid].wait();
   }
   __syncthreads();
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -328,7 +328,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = wid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   for (size_t i = 1; i < nLoop; ++i) {
@@ -336,7 +336,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = gWid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   if (bytes % unitBytes > 0) {
@@ -349,7 +349,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE);
+      memChans[peerIdx].get<16, true>(offset, remainBytes, lid, WARP_SIZE);
     }
   }
 }
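The get-based kernel above distributes work by assigning each warp a (peer, tile) pair. A condensed sketch of that indexing follows; the names `memChans` and `bytesPerGPU` are assumed for illustration, and the template arguments on `get<>()` select 16-byte vectorized copies with or without the remainder path, as in the hunks above.

```cpp
// Sketch: global warp w copies tile (w / nPeer) from peer (w % nPeer).
__device__ void warpTiledGet(mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memChans,
                             size_t nPeer, size_t bytesPerGPU, size_t rank) {
  const size_t lid = threadIdx.x % WARP_SIZE;  // lane within the warp
  const size_t wid = (threadIdx.x + blockIdx.x * blockDim.x) / WARP_SIZE;  // global warp id
  const size_t unitBytesPerWarp = 16 * WARP_SIZE;  // one 16-byte element per lane
  const size_t peerIdx = wid % nPeer;
  // Peers below my rank keep their index; peers above it are shifted by one.
  const size_t remoteRankLocalIndex = (peerIdx < rank) ? peerIdx : peerIdx + 1;
  const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp;
  // 16-byte vectorized copy from the peer's memory into the local buffer.
  memChans[peerIdx].get<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
}
```

Striding `wid` by the total warp count (the `gWid = wid + i * nWarp` loops) then tiles the whole buffer without any warp touching two peers in the same iteration.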
@@ -367,11 +367,11 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
   const size_t chanOffset = nPeer * blockIdx.x;
-  auto smChans = constSmChans + chanOffset;
+  auto memChans = constMemChans + chanOffset;

   if (wid < nPeer && lid == 0) {
-    smChans[wid].relaxedSignal();
-    smChans[wid].wait();
+    memChans[wid].relaxedSignal();
+    memChans[wid].wait();
   }
   __syncthreads();
   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -390,14 +390,14 @@ __global__ void __launch_bounds__(1024, 1)
     // First loop unrolling
     const size_t peerIdx = wid % nPeer;
     const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   for (size_t i = 1; i < nLoop; ++i) {
     const size_t gWid = wid + i * nWarp;
     const size_t peerIdx = gWid % nPeer;
     const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
+    memChans[peerIdx].put<16, false>(offset, unitBytesPerWarp, lid, WARP_SIZE);
   }

   if (bytes % unitBytes > 0) {
@@ -409,7 +409,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE);
+      memChans[peerIdx].put<16, true>(offset, remainBytes, lid, WARP_SIZE);
     }
   }
 }
@@ -426,7 +426,7 @@ __global__ void __launch_bounds__(1024, 1)
   const size_t nThread = blockDim.x * nBlock;
   const size_t nWarp = nThread / WARP_SIZE;
   const size_t nPeer = nRanksPerNode - 1;
-  auto smChans = constSmOutOfPlaceChans;
+  auto memChans = constMemOutOfPlaceChans;
   const uint32_t flag = (uint32_t)globalFlag;

   const size_t bytesPerGPU = nelemsPerGPU * sizeof(int);
@@ -443,7 +443,7 @@ __global__ void __launch_bounds__(1024, 1)
     // First loop unrolling
     const size_t peerIdx = wid % nPeer;
     const size_t offset = bytesPerGPU * rank + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   if (nLoop > 0) {
@@ -451,14 +451,14 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = wid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (wid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   for (size_t i = 1; i < nLoop; ++i) {
     const size_t gWid = wid + i * nWarp;
     const size_t peerIdx = gWid % nPeer;
     const size_t offset = bytesPerGPU * rank + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   for (size_t i = 1; i < nLoop; ++i) {
@@ -466,7 +466,7 @@ __global__ void __launch_bounds__(1024, 1)
     const size_t peerIdx = gWid % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     const size_t offset = bytesPerGPU * remoteRankLocalIndex + (gWid / nPeer) * unitBytesPerWarp;
-    smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
+    memChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, unitBytesPerWarp, lid, WARP_SIZE, flag);
   }

   if (bytes % unitBytes > 0) {
@@ -478,7 +478,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
+      memChans[peerIdx].putPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
     }
   }
   if (bytes % unitBytes > 0) {
@@ -491,7 +491,7 @@ __global__ void __launch_bounds__(1024, 1)
                            ? ((bytesPerGPU > offsetWithinRank) ? (bytesPerGPU - offsetWithinRank) : 0)
                            : unitBytesPerWarp;
     if (remainBytes > 0) {
-      smChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
+      memChans[peerIdx].getPackets(scratchOffset + offset * 2, offset, remainBytes, lid, WARP_SIZE, flag);
     }
   }
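The `putPackets()`/`getPackets()` calls above move data as LL (low-latency) packets: each 8 bytes of payload travels inside a 16-byte `mscclpp::LLPacket` holding two value/flag pairs, which is why the scratch offset is `offset * 2`. A minimal sketch of the packet primitive itself, assuming the `write`/`read` device methods of `mscclpp::LLPacket` and illustrative values:

```cpp
// A reader spins until the stored flags match the expected value, so the
// packet itself carries the synchronization; no separate signal/wait needed.
__device__ void writeAndReadOnePacket(mscclpp::LLPacket* scratch, uint32_t v0, uint32_t v1,
                                      uint32_t flag) {
  scratch->write(v0, v1, flag);      // publishes data + flag atomically (16 bytes)
  uint2 data = scratch->read(flag);  // spins until the flag matches, then returns data
  (void)data;
}
```

Flipping `flag` (via `globalFlag`) on every invocation means stale packets from the previous round can never satisfy the current read, so the scratch buffer never has to be cleared between launches.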
@@ -513,10 +513,10 @@ class AllGatherProxyService : public mscclpp::BaseProxyService {
     semaphores_.push_back(std::make_shared(communicator, connection));
     return semaphores_.size() - 1;
   }
-  std::vector> proxyChannels() {
-    std::vector> result;
+  std::vector> portChannels() {
+    std::vector> result;
     for (auto& semaphore : semaphores_) {
-      result.push_back(mscclpp::deviceHandle(mscclpp::BaseProxyChannel(0, semaphore, proxy_)));
+      result.push_back(mscclpp::deviceHandle(mscclpp::BasePortChannel(0, semaphore, proxy_)));
     }
     return result;
   }
@@ -705,8 +705,8 @@ class AllGatherTestEngine : public BaseTestEngine {
   std::shared_ptr sendBuff_;
   std::shared_ptr expectedBuff_;
   std::shared_ptr scratchPacketBuff_;
-  std::vector smChannels_;
-  std::vector smOutOfPlaceChannels_;
+  std::vector memoryChannels_;
+  std::vector memoryOutOfPlaceChannels_;
 };

 AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {}
@@ -723,42 +723,46 @@ void AllGatherTestEngine::allocateBuffer() {
 }

 void AllGatherTestEngine::setupConnections() {
-  std::vector> devProxyChannels;
+  std::vector> devPortChannels;
   if (!isUsingHostOffload(args_.kernelNum)) {
-    setupMeshConnections(devProxyChannels, sendBuff_.get(), args_.maxBytes);
-    if (devProxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(devPortChannels, sendBuff_.get(), args_.maxBytes);
+    if (devPortChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle)) {
       std::runtime_error("unexpected error");
     }
-    CUDATHROW(cudaMemcpyToSymbol(constProxyChans, devProxyChannels.data(),
-                                 sizeof(DeviceHandle) * devProxyChannels.size()));
+    CUDATHROW(cudaMemcpyToSymbol(constPortChans, devPortChannels.data(),
+                                 sizeof(DeviceHandle) * devPortChannels.size()));

-    setupMeshConnections(smChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 64);
-    std::vector> smChannelHandles(smChannels_.size());
-    if (smChannels_.size() > sizeof(constSmChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(memoryChannels_, sendBuff_.get(), args_.maxBytes, nullptr, 0, ChannelSemantic::PUT, 64);
+    std::vector> memoryChannelHandles(memoryChannels_.size());
+    if (memoryChannels_.size() > sizeof(constMemChans) / sizeof(DeviceHandle)) {
       std::runtime_error("unexpected error");
     }
-    std::transform(smChannels_.begin(), smChannels_.end(), smChannelHandles.begin(),
-                   [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
-    CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelHandles.size()));
+    std::transform(memoryChannels_.begin(), memoryChannels_.end(), memoryChannelHandles.begin(),
+                   [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); });
+    CUDATHROW(cudaMemcpyToSymbol(constMemChans, memoryChannelHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelHandles.size()));

     if (args_.kernelNum == 7) {
       const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
       const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket);
-      setupMeshConnections(smOutOfPlaceChannels_, sendBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
+      setupMeshConnections(memoryOutOfPlaceChannels_, sendBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
                            scratchPacketBuffBytes);
-      std::vector> smOutOfPlaceChannelHandles(smOutOfPlaceChannels_.size());
-      if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) {
+      std::vector> memoryOutOfPlaceChannelHandles(
+          memoryOutOfPlaceChannels_.size());
+      if (memoryOutOfPlaceChannels_.size() >
+          sizeof(constMemOutOfPlaceChans) / sizeof(DeviceHandle)) {
         std::runtime_error("unexpected error");
       }
-      std::transform(smOutOfPlaceChannels_.begin(), smOutOfPlaceChannels_.end(), smOutOfPlaceChannelHandles.begin(),
-                     [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
-      CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smOutOfPlaceChannelHandles.data(),
-                                   sizeof(DeviceHandle) * smOutOfPlaceChannelHandles.size()));
+      std::transform(memoryOutOfPlaceChannels_.begin(), memoryOutOfPlaceChannels_.end(),
+                     memoryOutOfPlaceChannelHandles.begin(),
+                     [](const mscclpp::MemoryChannel& memoryChannel) { return mscclpp::deviceHandle(memoryChannel); });
+      CUDATHROW(
+          cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryOutOfPlaceChannelHandles.data(),
                              sizeof(DeviceHandle) * memoryOutOfPlaceChannelHandles.size()));
     }
   } else {
     auto service = std::dynamic_pointer_cast(chanService_);
-    setupMeshConnections(devProxyChannels, sendBuff_.get(), args_.maxBytes, nullptr, 0,
+    setupMeshConnections(devPortChannels, sendBuff_.get(), args_.maxBytes, nullptr, 0,
                          [&](std::vector> conns,
                              std::vector>& remoteMemories,
                              const mscclpp::RegisteredMemory& localMemory) {
@@ -770,12 +774,12 @@ void AllGatherTestEngine::setupConnections() {
                            service->setLocalMemory(localMemory);
                            comm_->setup();
                          });
-    auto proxyChannels = service->proxyChannels();
-    if (proxyChannels.size() > sizeof(constRawProxyChan) / sizeof(DeviceHandle)) {
+    auto portChannels = service->portChannels();
+    if (portChannels.size() > sizeof(constRawPortChan) / sizeof(DeviceHandle)) {
       std::runtime_error("unexpected error");
     }
-    CUDATHROW(cudaMemcpyToSymbol(constRawProxyChan, proxyChannels.data(),
-                                 sizeof(DeviceHandle) * proxyChannels.size()));
+    CUDATHROW(cudaMemcpyToSymbol(constRawPortChan, portChannels.data(),
+                                 sizeof(DeviceHandle) * portChannels.size()));
   }
 }
diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu
index b7632a83d..34f1430db 100644
--- a/test/mscclpp-test/allreduce_test.cu
+++ b/test/mscclpp-test/allreduce_test.cu
@@ -13,12 +13,12 @@
 template
 using DeviceHandle = mscclpp::DeviceHandle;

-__constant__ DeviceHandle constDevFstRoundChans[16];
-__constant__ DeviceHandle constDevSndRoundChans[16];
+__constant__ DeviceHandle constDevFstRoundChans[16];
+__constant__ DeviceHandle constDevSndRoundChans[16];

-__constant__ DeviceHandle constSmInPlaceChans[8];
-__constant__ DeviceHandle constSmOutOfPlaceChans[8];
-__constant__ DeviceHandle constSmOutOfPlaceGetChans[8];
+__constant__ DeviceHandle constMemInPlaceChans[8];
+__constant__ DeviceHandle constMemOutOfPlaceChans[8];
+__constant__ DeviceHandle constMemOutOfPlaceGetChans[8];

 __device__ uint64_t globalFlag;

 // TODO(chhwang): need an interface for this.
@@ -94,8 +94,8 @@ __device__ void localReduceScatter(int* buff, int* scratch, int rank, int nRanks
   int peerSendId = (remoteSendToRank < rank) ? remoteSendToRank : remoteSendToRank - 1;
   int peerRecvId = (remoteRecvFromRank < rank) ? remoteRecvFromRank : remoteRecvFromRank - 1;
-  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
-  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
+  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
+  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
   size_t srcOffset =
       (((rankIndexInNode + i) % nRanksPerNode + startChunkIndex) * chunkSize + offsetInChunk) * sizeof(int);
   size_t dstOffset = rank * chunkSize * sizeof(int);
@@ -110,7 +110,7 @@ __device__ void localReduceScatter(int* buff, int* scratch, int rank, int nRanks
     int prePeerRecvId = (preRemoteRecvFromRank < rank) ? preRemoteRecvFromRank : preRemoteRecvFromRank - 1;
     // overlap communication and computation
-    DeviceHandle& preDevFstRecvChan = constDevFstRoundChans[prePeerRecvId];
+    DeviceHandle& preDevFstRecvChan = constDevFstRoundChans[prePeerRecvId];
     if (isComm) {
       preDevFstRecvChan.wait();
       devFstSendChan.putWithSignal(dstOffset, srcOffset, nelems * sizeof(int));
@@ -157,7 +157,7 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   int peerNodeId = peerRank / nRanksPerNode;
   int isComm = (threadIdx.x == 0) && (blockIdx.x == 0);
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constDevFstRoundChans[peer];
+  DeviceHandle& portChan = constDevFstRoundChans[peer];
   if (peerNodeId == rank / nRanksPerNode) {
     localReduceScatter(buff, scratch, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize);
     return;
@@ -172,12 +172,12 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   if (isComm) {
     size_t offset = (peerRank * chunkSize) * sizeof(int);
     // opposite side
-    proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
+    portChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
   }
   localReduceScatter(buff, scratch, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
                      2 * chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   deviceSyncer.sync(gridDim.x);
   // reduce data received from peer to related rank
@@ -186,7 +186,7 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   int* src = (int*)((char*)scratch + offset);
   vectorSum(dst, src, chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);

@@ -194,11 +194,11 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode;
   if (isComm) {
     size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int);
-    proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
+    portChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
   }
   localReduceScatter(buff, scratch, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   deviceSyncer.sync(gridDim.x);
   // reduce to related rank
@@ -207,7 +207,7 @@ __device__ void reduceScatter(int* buff, int* scratch, int rank, int nRanksPerNo
   src = (int*)((char*)scratch + offset);
   vectorSum(dst, src, 2 * chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
 }
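The `putWithSignal`/`wait`/`flush` triple used throughout `reduceScatter` is the core PortChannel handshake. A minimal sketch of its semantics, with the channel handle passed in and `deviceSyncer` assumed from the surrounding test file:

```cpp
// putWithSignal() asks the proxy to copy `bytes` at `offset` and then signal
// the peer; wait() blocks until the *peer's* signal arrives; flush() blocks
// until my own outstanding requests are on the wire, so the source buffer can
// be safely reused. Names are illustrative.
__device__ void exchangeChunk(mscclpp::DeviceHandle<mscclpp::PortChannel>& chan,
                              size_t offset, size_t bytes) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    chan.putWithSignal(offset, bytes);  // send my chunk and signal the peer
    chan.wait();                        // the peer's symmetric chunk has arrived
    chan.flush();                       // my put has completed locally
  }
  deviceSyncer.sync(gridDim.x);         // publish the arrival to all blocks
}
```

Because only one thread issues the channel calls, the kernels above gate them on an `isComm` predicate and let the remaining threads do the local reduction in parallel.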
@@ -228,8 +228,8 @@ __device__ void localAllGather(int rank, int nRanksPerNode, uint64_t offset, uin
   int peerSendId = (remoteSendToRank < rank) ? remoteSendToRank : remoteSendToRank - 1;
   int peerRecvId = (remoteRecvFromRank < rank) ? remoteRecvFromRank : remoteRecvFromRank - 1;
-  DeviceHandle& devSendChan = constDevSndRoundChans[peerSendId];
-  DeviceHandle& devRecvChan = constDevSndRoundChans[peerRecvId];
+  DeviceHandle& devSendChan = constDevSndRoundChans[peerSendId];
+  DeviceHandle& devRecvChan = constDevSndRoundChans[peerRecvId];
   // wait for the data from GPU (rank-i) % nranksPerNode to arrive
   devSendChan.putWithSignal(offset, size);
   devRecvChan.wait();
@@ -252,7 +252,7 @@ __device__ void allGather(int rank, int worldSize, int nRanksPerNode, size_t nel
   int peerRank = (rank + nRanksPerNode) % worldSize;
   int peerNodeId = peerRank / nRanksPerNode;
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constDevSndRoundChans[peer];
+  DeviceHandle& portChan = constDevSndRoundChans[peer];

   if (peerNodeId == rank / nRanksPerNode) {
     localAllGather(rank, nRanksPerNode, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
@@ -260,30 +260,30 @@ __device__ void allGather(int rank, int worldSize, int nRanksPerNode, size_t nel
   }

   // Step 1
-  proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int),
-                          (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
+  portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int),
+                         (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
   localAllGather(rank, nRanksPerNode, rank * nelemsPerGPU * sizeof(int), nelemsPerGPU * sizeof(int));
-  proxyChan.wait();
-  proxyChan.flush();
+  portChan.wait();
+  portChan.flush();
   // Step 2
-  proxyChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
-                          nelemsPerGPU / pipelineSize * sizeof(int));
+  portChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
+                         nelemsPerGPU / pipelineSize * sizeof(int));
   localAllGather(rank, nRanksPerNode, peerRank * nelemsPerGPU * sizeof(int),
                  (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
-  proxyChan.wait();
-  proxyChan.flush();
+  portChan.wait();
+  portChan.flush();
   // Step 3
   localAllGather(rank, nRanksPerNode,
                  (peerRank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
                  nelemsPerGPU / pipelineSize * sizeof(int));
 }

-__device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int startChunkIndex, size_t offsetInChunk,
-                                     size_t chunkSize, size_t nelems, int nBlocks) {
+__device__ void localReduceScatterMem(int* buff, int rank, int nRanksPerNode, int startChunkIndex, size_t offsetInChunk,
+                                      size_t chunkSize, size_t nelems, int nBlocks) {
   if (nRanksPerNode == 1) return;
   if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
-  DeviceHandle* smChans = constSmOutOfPlaceGetChans;
+  DeviceHandle* memChans = constMemOutOfPlaceGetChans;

   const size_t localRankIndexInNode = rank % nRanksPerNode;
   const size_t indexOffset = ((localRankIndexInNode + startChunkIndex) * chunkSize + offsetInChunk);
@@ -292,10 +292,10 @@ __device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int
   int4* buff4 = (int4*)buff;

   for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) {
-    smChans[peerIdx].signal();
+    memChans[peerIdx].signal();
   }
   for (int peerIdx = threadIdx.x + blockIdx.x * blockDim.x; peerIdx < nPeer; peerIdx += blockDim.x * nBlocks) {
-    smChans[peerIdx].wait();
+    memChans[peerIdx].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);

@@ -304,7 +304,7 @@ __device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int
     int4 sum = make_int4(0, 0, 0, 0);

     for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-      int4 val = smChans[peerIdx].read(indexOffset4 + idx);
+      int4 val = memChans[peerIdx].read(indexOffset4 + idx);
       sum.w += val.w;
       sum.x += val.x;
       sum.y += val.y;
@@ -320,19 +320,19 @@ __device__ void localReduceScatterSm(int* buff, int rank, int nRanksPerNode, int
   for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
     int sum = 0;
     for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
-      int val = smChans[peerIdx].read(indexOffset + nInt4 * 4 + idx);
+      int val = memChans[peerIdx].read(indexOffset + nInt4 * 4 + idx);
       sum += val;
     }
     buff[indexOffset + nInt4 * 4 + idx] += sum;
   }
 }

-__device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
-                                      int nBlocks) {
+__device__ void localReduceScatterMem2(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
+                                       int nBlocks) {
   if (nRanksPerNode == 1) return;
   if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
-  DeviceHandle* smChans = constSmOutOfPlaceGetChans;
+  DeviceHandle* memChans = constMemOutOfPlaceGetChans;

   const size_t localRankIndexInNode = rank % nRanksPerNode;
   const size_t indexOffset = localRankIndexInNode * chunkSize;
@@ -342,11 +342,11 @@ __device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, si

   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < nPeer) {
-    smChans[tid].signal();
+    memChans[tid].signal();
   }
   const int waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
-    smChans[tid - waitStart].wait();
+    memChans[tid - waitStart].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);

@@ -355,7 +355,7 @@ __device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, si
     int4 val;
     int peerIdx = (index + localRankIndexInNode) % nPeer;
     for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
-      val = smChans[peerIdx].read(indexOffset4 + idx);
+      val = memChans[peerIdx].read(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
       buff4[indexOffset4 + idx].y += val.y;
@@ -366,18 +366,18 @@ __device__ void localReduceScatterSm2(int* buff, int rank, int nRanksPerNode, si
   const size_t nLastInts = nelems % 4;
   for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
     for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
-      int val = smChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
+      int val = memChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
       buff[indexOffset + nInt4 * 4 + idx] += val;
     }
   }
 }
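The memory-channel reduce-scatter variants above all share one idiom: a lightweight signal/wait barrier, then each thread reads the peer's copy of an element and accumulates it locally. A condensed sketch, assuming the templated `read<T>()` device method (the template argument is elided in the diff text) and illustrative parameter names:

```cpp
// After the signal/wait handshake guarantees all peers' data is published,
// each thread pulls one int4 per peer and folds it into the local buffer.
__device__ void readReduceInt4(mscclpp::DeviceHandle<mscclpp::MemoryChannel>* memChans,
                               int nPeer, int4* buff4, size_t indexOffset4, size_t nInt4) {
  for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4;
       idx += blockDim.x * gridDim.x) {
    int4 sum = buff4[indexOffset4 + idx];
    for (int p = 0; p < nPeer; ++p) {
      int4 val = memChans[p].read<int4>(indexOffset4 + idx);  // element index, not bytes
      sum.x += val.x; sum.y += val.y; sum.z += val.z; sum.w += val.w;
    }
    buff4[indexOffset4 + idx] = sum;
  }
}
```

Offsetting the starting peer by `localRankIndexInNode`, as `localReduceScatterMem2` does, staggers which remote GPU each rank reads first and avoids all ranks hammering the same peer simultaneously.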
-__device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
-                                      int nBlocks) {
+__device__ void localReduceScatterMem3(int* buff, int rank, int nRanksPerNode, size_t chunkSize, size_t nelems,
+                                       int nBlocks) {
   if (nRanksPerNode == 1) return;
   if ((int)blockIdx.x >= nBlocks) return;
   const int nPeer = nRanksPerNode - 1;
-  DeviceHandle* smChans = constSmOutOfPlaceGetChans;
+  DeviceHandle* memChans = constMemOutOfPlaceGetChans;

   const size_t localRankIndexInNode = rank % nRanksPerNode;
   const size_t indexOffset = localRankIndexInNode * chunkSize;
@@ -387,11 +387,11 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si

   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < nPeer) {
-    smChans[tid].signal();
+    memChans[tid].signal();
   }
   const int waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
-    smChans[tid - waitStart].wait();
+    memChans[tid - waitStart].wait();
   }
   reduceScatterDeviceSyncer.sync(nBlocks);

@@ -405,7 +405,7 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si
     int peerIdx = (index + localRankIndexInNode) % nPeer;
     for (size_t idx = base + threadIdx.x + blockIdx.x * blockDim.x; idx < base + unitNInt4;
          idx += blockDim.x * nBlocks) {
-      val = smChans[peerIdx].read(indexOffset4 + idx);
+      val = memChans[peerIdx].read(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
       buff4[indexOffset4 + idx].y += val.y;
@@ -417,7 +417,7 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si
     int4 val;
     int peerIdx = (index + localRankIndexInNode) % nPeer;
     for (size_t idx = base + threadIdx.x + blockIdx.x * blockDim.x; idx < nInt4; idx += blockDim.x * nBlocks) {
-      val = smChans[peerIdx].read(indexOffset4 + idx);
+      val = memChans[peerIdx].read(indexOffset4 + idx);
       buff4[indexOffset4 + idx].w += val.w;
       buff4[indexOffset4 + idx].x += val.x;
       buff4[indexOffset4 + idx].y += val.y;
@@ -428,14 +428,14 @@ __device__ void localReduceScatterSm3(int* buff, int rank, int nRanksPerNode, si
   const size_t nLastInts = nelems % 4;
   for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) {
     for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nLastInts; idx += blockDim.x * nBlocks) {
-      int val = smChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
+      int val = memChans[(localRankIndexInNode + peerIdx) % nPeer].read(indexOffset + nInt4 * 4 + idx);
       buff[indexOffset + nInt4 * 4 + idx] += val;
     }
   }
 }

-__device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize,
-                                size_t nelems  // must be divisible by 3
+__device__ void reduceScatterMem(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize,
+                                 size_t nelems  // must be divisible by 3
 ) {
   // this reduce-scatter algorithm works as follows:
   // Step 1: each node does a local reduce-scatter on peer node data chunks with 1/pipeline portion of chunk data. For
@@ -457,28 +457,28 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   int isComm = (threadIdx.x == 0) && ((int)blockIdx.x == nBlocksForReduceScatter);
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
   int nBlocksRemain = gridDim.x - nBlocksForReduceScatter;
-  DeviceHandle& proxyChan = constDevFstRoundChans[peer];
+  DeviceHandle& portChan = constDevFstRoundChans[peer];
   if (peerNodeId == rank / nRanksPerNode) {
-    localReduceScatterSm(buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x);
+    localReduceScatterMem(buff, rank, nRanksPerNode, 0, 0, chunkSize, chunkSize, gridDim.x);
     return;
   }

   // step 1: local reduce
   int startChunkIndex = peerNodeId * nRanksPerNode;
-  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize,
-                       nBlocksForReduceScatter);
+  localReduceScatterMem(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize / pipelineSize,
+                        nBlocksForReduceScatter);
   deviceSyncer.sync(gridDim.x);

   // step 2: local reduce and exchange data with neighbor
   if (isComm) {
     size_t offset = (peerRank * chunkSize) * sizeof(int);
     // opposite side
-    proxyChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
+    portChan.putWithSignal(offset, (chunkSize / pipelineSize * sizeof(int)));
   }
-  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
-                       2 * chunkSize / pipelineSize, nBlocksForReduceScatter);
+  localReduceScatterMem(buff, rank, nRanksPerNode, startChunkIndex, chunkSize / pipelineSize, chunkSize,
+                        2 * chunkSize / pipelineSize, nBlocksForReduceScatter);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   if ((int)blockIdx.x >= nBlocksForReduceScatter) {
     ibDeviceSyncer.sync(nBlocksRemain);
@@ -489,7 +489,7 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
     vectorSum(dst, src, chunkSize / pipelineSize, blockIdx.x - nBlocksForReduceScatter, nBlocksRemain);
   }
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);

@@ -497,11 +497,11 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   startChunkIndex = (rank / nRanksPerNode) * nRanksPerNode;
   if (isComm) {
     size_t offset = (peerRank * chunkSize + chunkSize / pipelineSize) * sizeof(int);
-    proxyChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
+    portChan.putWithSignal(offset, (pipelineSize - 1) * chunkSize / pipelineSize * sizeof(int));
   }
-  localReduceScatterSm(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, nBlocksForReduceScatter);
+  localReduceScatterMem(buff, rank, nRanksPerNode, startChunkIndex, 0, chunkSize, chunkSize, nBlocksForReduceScatter);
   if (isComm) {
-    proxyChan.wait();
+    portChan.wait();
   }
   deviceSyncer.sync(gridDim.x);
   // reduce to related rank, cannot overlap since localReduceScatter also calculates the sum
@@ -510,13 +510,13 @@ __device__ void reduceScatterSm(int* buff, int* scratch, int rank, int nRanksPer
   int* src = (int*)((char*)scratch + offset);
   vectorSum(dst, src, 2 * chunkSize / pipelineSize);
   if (isComm) {
-    proxyChan.flush();
+    portChan.flush();
   }
 }
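To make the pipelined split in `reduceScatterMem` concrete, here is a worked example of the chunk arithmetic under assumed sizes (worldSize, nelems, and pipelineSize are illustrative; the divisibility requirement comes from the `// must be divisible by 3` comment above):

```cpp
// Step 2 sends 1/3 of the peer chunk while the local reduce runs; step 3
// sends the remaining 2/3. The static_assert checks the split is exact.
constexpr size_t nelems = 48 * 1024 * 1024;
constexpr size_t worldSize = 16;
constexpr size_t pipelineSize = 3;
constexpr size_t chunkSize = nelems / worldSize;            // one rank's share
constexpr size_t firstSend = chunkSize / pipelineSize;      // overlapped with local reduce
constexpr size_t secondSend = (pipelineSize - 1) * chunkSize / pipelineSize;
static_assert(firstSend + secondSend == chunkSize, "the two sends cover the peer chunk");
```

This is the same 1/3 + 2/3 split that appears in the two `putWithSignal` offsets above, just evaluated at compile time.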
 // This kernel is the most performant when the number of blocks is a multiple of (nRanksPerNode - 1).
-__device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk,
-                                 uint64_t rankChunkSize, uint64_t size, size_t nBlocks) {
+__device__ void localAllGatherMem(int rank, int nRanksPerNode, int startRankChunkIndex, uint64_t offsetInRankChunk,
+                                  uint64_t rankChunkSize, uint64_t size, size_t nBlocks) {
   if (nRanksPerNode == 1) return;
   if (blockIdx.x >= nBlocks) return;
   const size_t nPeer = nRanksPerNode - 1;
@@ -552,15 +552,15 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
     sizeForThisBlock += lastChunkSize;
   }
   if (threadIdx.x == 0 && peerLocalBlockIdx == 0) {
-    constSmInPlaceChans[peerIdx].signal();
-    constSmInPlaceChans[peerIdx].wait();
+    constMemInPlaceChans[peerIdx].signal();
+    constMemInPlaceChans[peerIdx].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
   size_t offset = rankChunkSize * (startRankChunkIndex + remoteRankLocalIndex) + offsetInRankChunk;
-  constSmInPlaceChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
+  constMemInPlaceChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
 }

-__device__ void localRingAllGatherSm(int rank, int nRanksPerNode, uint64_t size, size_t nBlocks) {
+__device__ void localRingAllGatherMem(int rank, int nRanksPerNode, uint64_t size, size_t nBlocks) {
   if (nRanksPerNode == 1) return;
   if (blockIdx.x >= nBlocks) return;

@@ -568,22 +568,22 @@ __device__ void localRingAllGatherSm(int rank, int nRanksPerNode, uint64_t size,
   const int nPeer = nRanksPerNode - 1;

   if (tid < nPeer) {
-    constSmInPlaceChans[tid].signal();
+    constMemInPlaceChans[tid].signal();
   }
   int waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < (int)(nBlocks * blockDim.x)) {
-    constSmInPlaceChans[tid - waitStart].wait();
+    constMemInPlaceChans[tid - waitStart].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
   for (int i = 0; i < nPeer; ++i) {
     int peerIdx = (i + rank) % nPeer;
     const int remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     size_t offset = size * remoteRankLocalIndex;
-    constSmInPlaceChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks);
+    constMemInPlaceChans[peerIdx].get(offset, size, tid, blockDim.x * nBlocks);
   }
 }

-__device__ void localRingAllGatherSm2(size_t rank, size_t nRanksPerNode, size_t size, size_t nBlocks) {
+__device__ void localRingAllGatherMem2(size_t rank, size_t nRanksPerNode, size_t size, size_t nBlocks) {
   if (nRanksPerNode == 1) return;
   if (blockIdx.x >= nBlocks) return;

@@ -591,11 +591,11 @@ __device__ void localRingAllGatherSm2(size_t rank, size_t nRanksPerNode, size_t
   const size_t nPeer = nRanksPerNode - 1;

   if (tid < nPeer) {
-    constSmInPlaceChans[tid].signal();
+    constMemInPlaceChans[tid].signal();
   }
   size_t waitStart = nBlocks * blockDim.x - nPeer;
   if (tid >= waitStart && tid < nBlocks * blockDim.x) {
-    constSmInPlaceChans[tid - waitStart].wait();
+    constMemInPlaceChans[tid - waitStart].wait();
   }
   allGatherDeviceSyncer.sync(nBlocks);
   const size_t unitSize = 16 * blockDim.x * nBlocks;
@@ -605,19 +605,19 @@ __device__ void localRingAllGatherSm2(size_t rank, size_t nRanksPerNode, size_t
       size_t peerIdx = (i + rank) % nPeer;
       const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
       size_t offset = size * remoteRankLocalIndex + base;
-      constSmInPlaceChans[peerIdx].get(offset, unitSize, tid, blockDim.x * nBlocks);
+      constMemInPlaceChans[peerIdx].get(offset, unitSize, tid, blockDim.x * nBlocks);
     }
   }
   for (size_t i = 0; i < nPeer; ++i) {
     size_t peerIdx = (i + rank) % nPeer;
     const size_t remoteRankLocalIndex = (peerIdx < rank ? peerIdx : peerIdx + 1);
     size_t offset = size * remoteRankLocalIndex + base;
-    constSmInPlaceChans[peerIdx].get(offset, size - base, tid, blockDim.x * nBlocks);
+    constMemInPlaceChans[peerIdx].get(offset, size - base, tid, blockDim.x * nBlocks);
   }
 }

 // This is an allgather4 equivalent
-__device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__device__ void allGatherMem(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   // this allgather is a pipelined and hierarchical one and only works for two nodes
   // it is implemented as follows:
   // Step 1: each node does a local allgather and concurrently,
@@ -632,14 +632,14 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
   int peerRank = (rank + nRanksPerNode) % worldSize;
   int peerNodeId = peerRank / nRanksPerNode;
   int peer = (peerRank < rank) ? peerRank : peerRank - 1;
-  DeviceHandle& proxyChan = constDevSndRoundChans[peer];
+  DeviceHandle& portChan = constDevSndRoundChans[peer];
   const size_t nBlocksForLocalAllGather = gridDim.x / (nRanksPerNode - 1) * (nRanksPerNode - 1);
   const size_t rankChunkSize = nelemsPerGPU * sizeof(int);
   const int startRankIndexInLocalNode = (rank / nRanksPerNode) * nRanksPerNode;
   const int startRankIndexInPeerNode = (peerRank / nRanksPerNode) * nRanksPerNode;

   if (peerNodeId == rank / nRanksPerNode) {
-    localAllGatherSm(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x);
+    localAllGatherMem(rank, nRanksPerNode, 0, 0, rankChunkSize, rankChunkSize, gridDim.x);
     return;
   }
@@ -650,29 +650,29 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n

   // Step 1
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int), step1Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInLocalNode, 0, rankChunkSize, rankChunkSize,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);
   // Step 2
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
+    portChan.putWithSignal(rank * nelemsPerGPU * sizeof(int) + step1Bytes, step2Bytes);
   }
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, 0, rankChunkSize, step1Bytes,
+                    nBlocksForLocalAllGather);
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.wait();
-    proxyChan.flush();
+    portChan.wait();
+    portChan.flush();
   }
   deviceSyncer.sync(gridDim.x);
   // Step 3
-  localAllGatherSm(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
-                   nBlocksForLocalAllGather);
+  localAllGatherMem(rank, nRanksPerNode, startRankIndexInPeerNode, step1Bytes, rankChunkSize, step2Bytes,
+                    nBlocksForLocalAllGather);
 }

 __global__ void __launch_bounds__(1024)
@@ -682,7 +682,7 @@ __global__ void __launch_bounds__(1024)
   int remoteRank = (peerId < rank) ? peerId : peerId + 1;

   // 1st communication phase: send data to the scratch buffer of the peer associated with this block
-  DeviceHandle& devFstRoundChan = constDevFstRoundChans[peerId];
+  DeviceHandle& devFstRoundChan = constDevFstRoundChans[peerId];
   Chunk toPeerChunk = getChunk(nelems, worldSize, remoteRank);
   // Now we need to figure out the offset of this chunk in the scratch buffer of the destination.
   // The destination will have allocated a scratch buffer of size numPeers() * toPeerChunk.size and
@@ -700,7 +700,7 @@ __global__ void __launch_bounds__(1024)
   deviceSyncer.sync(gridDim.x);

   // Local reduction: every block reduces a slice of each chunk in the scratch buffer into the user buffer
-  DeviceHandle& devSndRoundChan = constDevSndRoundChans[peerId];
+  DeviceHandle& devSndRoundChan = constDevSndRoundChans[peerId];
   Chunk rankChunk = getChunk(nelems, worldSize, rank);
   int* chunk = buff + rankChunk.offset;
   int numPeers = gridDim.x / BLOCKS_PER_PEER;
@@ -734,10 +734,10 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
   int peerSendId = (remoteSendRank < rank) ? remoteSendRank : remoteSendRank - 1;
   int peerRecvId = (remoteRecvRank < rank) ? remoteRecvRank : remoteRecvRank - 1;

-  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
-  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
-  DeviceHandle& devSndSendChan = constDevSndRoundChans[peerSendId];
-  DeviceHandle& devSndRecvChan = constDevSndRoundChans[peerRecvId];
+  DeviceHandle& devFstSendChan = constDevFstRoundChans[peerSendId];
+  DeviceHandle& devFstRecvChan = constDevFstRoundChans[peerRecvId];
+  DeviceHandle& devSndSendChan = constDevSndRoundChans[peerSendId];
+  DeviceHandle& devSndRecvChan = constDevSndRoundChans[peerRecvId];

   // Step 1
   size_t chunkIndex = (rank + worldSize - 1) % worldSize;
@@ -846,12 +846,12 @@ __global__ void __launch_bounds__(1024)
   size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);

   // Channel to a local peer
-  int smChanIdx = blockIdx.x / BLOCKS_PER_PEER;
-  DeviceHandle smChan = constSmOutOfPlaceChans[smChanIdx];
+  int memChanIdx = blockIdx.x / BLOCKS_PER_PEER;
+  DeviceHandle memChan = constMemOutOfPlaceChans[memChanIdx];

   // Channel to a remote peer that has the same local rank as me
   int localRank = rank % nRanksPerNode;
-  DeviceHandle proxyChan = constDevFstRoundChans[localRank];
+  DeviceHandle portChan = constDevFstRoundChans[localRank];

   // Flag for packets. Initially 1
   uint32_t flag = (uint32_t)globalFlag;
@@ -876,11 +876,11 @@ __global__ void __launch_bounds__(1024)
   size_t srcOffset =
       ((blockIdx.x % BLOCKS_PER_PEER) * nelems * sizeof(int) / BLOCKS_PER_PEER);  // offset for this block
   // Offset of the peer's scratch buffer (scratch) to write on
-  size_t dstOffset = (scratchOffset) +                                                   // double buffering
-                     ((smChanIdx < localRank ? localRank - 1 : localRank) * pktBytes) +  // offset for this rank
+  size_t dstOffset = (scratchOffset) +                                                    // double buffering
+                     ((memChanIdx < localRank ? localRank - 1 : localRank) * pktBytes) +  // offset for this rank
                      (srcOffset * 2);  // offset for this block: twice of srcOffset because 2 elems per packet
   // Write data to the peer's scratch
-  smChan.putPackets(dstOffset, srcOffset, nelems / BLOCKS_PER_PEER * sizeof(int), threadIdx.x, blockDim.x, flag);
+  memChan.putPackets(dstOffset, srcOffset, nelems / BLOCKS_PER_PEER * sizeof(int), threadIdx.x, blockDim.x, flag);
   // Read data from my scratch, reduce data with my buff, and write the result to my putPktBuf or to result
   const bool isSingleNode = (worldSize == nRanksPerNode);
   for (size_t idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPkts; idx += blockDim.x * gridDim.x) {
@@ -924,9 +924,9 @@ __global__ void __launch_bounds__(1024)

   // Write my putPktBuf to the remote peer's getPktBuf
   if (threadIdx.x == 0 && blockIdx.x == 0) {
-    proxyChan.put(pktBufOffset, pktBytes);
+    portChan.put(pktBufOffset, pktBytes);
     if ((flag & 63) == 0) {
-      proxyChan.flush();
+      portChan.flush();
     }
   }
@@ -954,21 +954,21 @@ __global__ void __launch_bounds__(1024)

 __global__ void __launch_bounds__(1024) allreduce4(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize,
                                                    size_t nelems) {
-  reduceScatterSm(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
+  reduceScatterMem(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
   deviceSyncer.sync(gridDim.x);
-  allGatherSm(rank, worldSize, nRanksPerNode, nelems / worldSize);
+  allGatherMem(rank, worldSize, nRanksPerNode, nelems / worldSize);
 }

 __global__ void __launch_bounds__(1024)
     allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
 #if defined(__HIP_PLATFORM_AMD__)
-  localReduceScatterSm3(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
+  localReduceScatterMem3(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
   deviceSyncer.sync(gridDim.x);
-  localRingAllGatherSm2(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
+  localRingAllGatherMem2(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
 #else
-  localReduceScatterSm2(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
+  localReduceScatterMem2(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
   deviceSyncer.sync(gridDim.x);
-  localRingAllGatherSm(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
+  localRingAllGatherMem(rank, nRanksPerNode, nelems / worldSize * sizeof(int), gridDim.x);
 #endif
 }
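The `// double buffering` comment in `allreduce2` refers to alternating scratch halves between rounds, keyed off `globalFlag`. One simple way to derive the scratch half from the flag, written as an assumption for illustration rather than the exact expression the tests use:

```cpp
// Odd rounds target the upper half of the packet scratch, even rounds the
// lower half. Stale packets from the previous round carry the old flag and
// sit in the other half, so they can never satisfy the current read.
// `scratchHalfBytes` is an assumed name for half the scratch size.
__device__ size_t packetScratchOffset(uint32_t flag, size_t scratchHalfBytes) {
  return (flag & 1) ? scratchHalfBytes : 0;
}
```

The periodic `flush()` on `(flag & 63) == 0` bounds how many proxy puts can be outstanding without paying a flush on every round.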
@@ -998,8 +998,8 @@ __global__ void __launch_bounds__(1024)
   uint2* dst = (uint2*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));

   // step 1: write to scratch buffer
-  constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
-                                             blockDim.x * nBlocksPerPeer, flag);
+  constMemOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
+                                              blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     uint2 data = make_uint2(0, 0);
@@ -1021,7 +1021,7 @@ __global__ void __launch_bounds__(1024)
     packet.flag2 = flag;
     size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
-      constSmOutOfPlaceChans[index].write(offset, packet);
+      constMemOutOfPlaceChans[index].write(offset, packet);
     }
   }
   // step 3: get data result from scratch buffer
@@ -1064,8 +1064,8 @@ __global__ void __launch_bounds__(1024)
   uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));

   // step 1: write to scratch buffer
-  constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int),
-                                             tid, blockDim.x * nBlocksPerPeer, flag);
+  constMemOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int),
+                                              tid, blockDim.x * nBlocksPerPeer, flag);
   // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
   for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
     uint32_t data = 0;
@@ -1083,7 +1083,7 @@ __global__ void __launch_bounds__(1024)
     packet.flag = flag;
     size_t offset = scratchResultOffset / sizeof(mscclpp::LL8Packet) + (idx + rank * nPktsPerRank);
     for (int index = 0; index < nPeers; index++) {
-      constSmOutOfPlaceChans[index].write(offset, packet);
+      constMemOutOfPlaceChans[index].write(offset, packet);
     }
   }
   // step 3: get data result from scratch buffer
@@ -1255,9 +1255,9 @@ class AllReduceTestEngine : public BaseTestEngine {
   std::shared_ptr putPacketBuff_;
   std::shared_ptr getPacketBuff_;
   std::shared_ptr expectedBuff_;
-  std::vector smOutOfPlaceChannels_;
-  std::vector smInPlaceChannels_;
-  std::vector smOutOfPlaceGetChannels_;
+  std::vector memoryOutOfPlaceChannels_;
+  std::vector memoryInPlaceChannels_;
+  std::vector memoryOutOfPlaceGetChannels_;
 };

 AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args, "allreduce") {
@@ -1304,94 +1304,98 @@ void AllReduceTestEngine::allocateBuffer() {
 }

 void AllReduceTestEngine::setupConnections() {
-  auto getChannelDeviceHandle = [](const std::vector& in,
-                                   std::vector>& out) {
-    return std::transform(in.begin(), in.end(), out.begin(),
-                          [](const mscclpp::SmChannel& smChannel) { return mscclpp::deviceHandle(smChannel); });
+  auto getChannelDeviceHandle = [](const std::vector& in,
+                                   std::vector>& out) {
+    return std::transform(in.begin(), in.end(), out.begin(), [](const mscclpp::MemoryChannel& memoryChannel) {
+      return mscclpp::deviceHandle(memoryChannel);
+    });
   };
   if (isUsePacket()) {
-    std::vector> proxyChannels;
+    std::vector> portChannels;

     const size_t nPacket = (args_.maxBytes + sizeof(uint64_t) - 1) / sizeof(uint64_t);
     if (args_.kernelNum == 6 || args_.kernelNum == 7) {
       const size_t scratchPacketBuffBytes = nPacket * 2 * 2 * sizeof(mscclpp::LLPacket);
-      setupMeshConnections(smOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
+      setupMeshConnections(memoryOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchPacketBuff_.get(),
                            scratchPacketBuffBytes);
-      std::vector> smChannelDeviceHandles(smOutOfPlaceChannels_.size());
-      getChannelDeviceHandle(smOutOfPlaceChannels_, smChannelDeviceHandles);
-      CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smChannelDeviceHandles.data(),
-                                   sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
+      std::vector> memoryChannelDeviceHandles(memoryOutOfPlaceChannels_.size());
+      getChannelDeviceHandle(memoryOutOfPlaceChannels_, memoryChannelDeviceHandles);
+      CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryChannelDeviceHandles.data(),
+                                   sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
     }
     if (args_.kernelNum == 2) {
       const size_t scratchPacketBuffBytes =
           nPacket * std::max(args_.nRanksPerNode - 1, 1) * 2 * sizeof(mscclpp::LLPacket);
       const size_t packetBuffBytes = nPacket * 2 * sizeof(mscclpp::LLPacket);
-      setupMeshConnections(smOutOfPlaceChannels_, proxyChannels, inputBuff_.get(), args_.maxBytes, putPacketBuff_.get(),
-                           packetBuffBytes, getPacketBuff_.get(), packetBuffBytes, scratchPacketBuff_.get(),
-                           scratchPacketBuffBytes);
+      setupMeshConnections(memoryOutOfPlaceChannels_, portChannels, inputBuff_.get(), args_.maxBytes,
                           putPacketBuff_.get(), packetBuffBytes, getPacketBuff_.get(), packetBuffBytes,
                           scratchPacketBuff_.get(), scratchPacketBuffBytes);

-      if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) {
+      if (memoryOutOfPlaceChannels_.size() >
+          sizeof(constMemOutOfPlaceChans) / sizeof(DeviceHandle)) {
        std::runtime_error("unexpected error");
       }
-      if (proxyChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
+      if (portChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
        std::runtime_error("unexpected error");
       }
-      std::vector> smChannelDeviceHandles(smOutOfPlaceChannels_.size());
-      getChannelDeviceHandle(smOutOfPlaceChannels_, smChannelDeviceHandles);
-      CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smChannelDeviceHandles.data(),
-                                   sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
-      CUDATHROW(cudaMemcpyToSymbol(constDevFstRoundChans, proxyChannels.data(),
-                                   sizeof(DeviceHandle) * proxyChannels.size()));
+      std::vector> memoryChannelDeviceHandles(memoryOutOfPlaceChannels_.size());
+      getChannelDeviceHandle(memoryOutOfPlaceChannels_, memoryChannelDeviceHandles);
+      CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryChannelDeviceHandles.data(),
+                                   sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
+      CUDATHROW(cudaMemcpyToSymbol(constDevFstRoundChans, portChannels.data(),
+                                   sizeof(DeviceHandle) * portChannels.size()));
     }
   } else {
-    std::vector> fstRoundChannels;
-    std::vector> sndRoundChannels;
+    std::vector> fstRoundChannels;
+    std::vector> sndRoundChannels;

     // Send data from local inputBuff to remote scratchBuff (out-of-place)
     setupMeshConnections(fstRoundChannels, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes);
-    if (fstRoundChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
+    if (fstRoundChannels.size() > sizeof(constDevFstRoundChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
     CUDATHROW(cudaMemcpyToSymbol(constDevFstRoundChans, fstRoundChannels.data(),
-                                 sizeof(DeviceHandle) * fstRoundChannels.size()));
+                                 sizeof(DeviceHandle) * fstRoundChannels.size()));

     // Send data from local inputBuff to remote inputBuff (in-place)
     setupMeshConnections(sndRoundChannels, inputBuff_.get(), args_.maxBytes);
-    if (sndRoundChannels.size() > sizeof(constDevSndRoundChans) / sizeof(DeviceHandle)) {
+    if (sndRoundChannels.size() > sizeof(constDevSndRoundChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
     CUDATHROW(cudaMemcpyToSymbol(constDevSndRoundChans, sndRoundChannels.data(),
-                                 sizeof(DeviceHandle) * sndRoundChannels.size()));
+                                 sizeof(DeviceHandle) * sndRoundChannels.size()));

-    setupMeshConnections(smOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes);
-    if (smOutOfPlaceChannels_.size() > sizeof(constSmOutOfPlaceChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(memoryOutOfPlaceChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(),
+                         args_.maxBytes);
+    if (memoryOutOfPlaceChannels_.size() >
+        sizeof(constMemOutOfPlaceChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
-    std::vector> smChannelDeviceHandles(smOutOfPlaceChannels_.size());
-    getChannelDeviceHandle(smOutOfPlaceChannels_, smChannelDeviceHandles);
-    CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceChans, smChannelDeviceHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
+    std::vector> memoryChannelDeviceHandles(memoryOutOfPlaceChannels_.size());
+    getChannelDeviceHandle(memoryOutOfPlaceChannels_, memoryChannelDeviceHandles);
+    CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceChans, memoryChannelDeviceHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));

-    setupMeshConnections(smInPlaceChannels_, inputBuff_.get(), args_.maxBytes);
-    if (smInPlaceChannels_.size() > sizeof(constSmInPlaceChans) / sizeof(DeviceHandle)) {
+    setupMeshConnections(memoryInPlaceChannels_, inputBuff_.get(), args_.maxBytes);
+    if (memoryInPlaceChannels_.size() > sizeof(constMemInPlaceChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
-    smChannelDeviceHandles.resize(smInPlaceChannels_.size());
-    getChannelDeviceHandle(smInPlaceChannels_, smChannelDeviceHandles);
-    CUDATHROW(cudaMemcpyToSymbol(constSmInPlaceChans, smChannelDeviceHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
-
-    setupMeshConnections(smOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(), args_.maxBytes,
-                         ChannelSemantic::GET);
-    if (smOutOfPlaceGetChannels_.size() >
-        sizeof(constSmOutOfPlaceGetChans) / sizeof(DeviceHandle)) {
+    memoryChannelDeviceHandles.resize(memoryInPlaceChannels_.size());
+    getChannelDeviceHandle(memoryInPlaceChannels_, memoryChannelDeviceHandles);
+    CUDATHROW(cudaMemcpyToSymbol(constMemInPlaceChans, memoryChannelDeviceHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
+
+    setupMeshConnections(memoryOutOfPlaceGetChannels_, inputBuff_.get(), args_.maxBytes, scratchBuff_.get(),
+                         args_.maxBytes, ChannelSemantic::GET);
+    if (memoryOutOfPlaceGetChannels_.size() >
+        sizeof(constMemOutOfPlaceGetChans) / sizeof(DeviceHandle)) {
      std::runtime_error("unexpected error");
     }
-    smChannelDeviceHandles.resize(smOutOfPlaceGetChannels_.size());
-    getChannelDeviceHandle(smOutOfPlaceGetChannels_, smChannelDeviceHandles);
-    CUDATHROW(cudaMemcpyToSymbol(constSmOutOfPlaceGetChans, smChannelDeviceHandles.data(),
-                                 sizeof(DeviceHandle) * smChannelDeviceHandles.size()));
+    memoryChannelDeviceHandles.resize(memoryOutOfPlaceGetChannels_.size());
+    getChannelDeviceHandle(memoryOutOfPlaceGetChannels_, memoryChannelDeviceHandles);
+    CUDATHROW(cudaMemcpyToSymbol(constMemOutOfPlaceGetChans, memoryChannelDeviceHandles.data(),
+                                 sizeof(DeviceHandle) * memoryChannelDeviceHandles.size()));
   }
 }
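Every engine above repeats the same host-side pattern: flatten host channels into device handles, bound-check against a fixed-size `__constant__` array, and upload with `cudaMemcpyToSymbol`. The sketch below condenses it with illustrative names; note that the tests construct `std::runtime_error("unexpected error")` without `throw`, so a real guard should throw it, as done here. `CUDATHROW` is the error-checking macro from the tests' common header.

```cpp
#include <algorithm>
#include <stdexcept>
#include <vector>

__constant__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> gChans[8];

void uploadChannels(const std::vector<mscclpp::MemoryChannel>& chans) {
  std::vector<mscclpp::DeviceHandle<mscclpp::MemoryChannel>> handles(chans.size());
  std::transform(chans.begin(), chans.end(), handles.begin(),
                 [](const mscclpp::MemoryChannel& c) { return mscclpp::deviceHandle(c); });
  if (handles.size() > sizeof(gChans) / sizeof(gChans[0])) {
    throw std::runtime_error("too many channels for the constant array");
  }
  CUDATHROW(cudaMemcpyToSymbol(gChans, handles.data(), sizeof(handles[0]) * handles.size()));
}
```

Constant memory is a good fit here because every thread reads the same handle, and the upload happens once before any kernel launch.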
diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu
index 6d39e9f5f..57e45e450 100644
--- a/test/mscclpp-test/alltoall_test.cu
+++ b/test/mscclpp-test/alltoall_test.cu
@@ -9,7 +9,7 @@
 template
 using DeviceHandle = mscclpp::DeviceHandle;

-__constant__ DeviceHandle constProxyChans[16];
+__constant__ DeviceHandle constPortChans[16];
 __device__ mscclpp::DeviceSyncer deviceSyncer;
 void* localRecvBuff;
 void* localSendBuff;
@@ -17,14 +17,14 @@ void* localSendBuff;
 __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   for (int i = 1; i < nRanksPerNode; i++) {
-    DeviceHandle proxyChan = constProxyChans[blockIdx.x];
+    DeviceHandle portChan = constPortChans[blockIdx.x];
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank + i) % nRanksPerNode) {
-      proxyChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
-                                      nElements * sizeof(int));
+      portChan.putWithSignalAndFlush(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+                                     nElements * sizeof(int));
     }
     // wait for the data from GPU (rank-i) % nranksPerNode to arrive
     if (threadIdx.x == 0 && remoteRank % nRanksPerNode == (rank - i + nRanksPerNode) % nRanksPerNode) {
-      proxyChan.wait();
+      portChan.wait();
     }
     deviceSyncer.sync(nRanksPerNode - 1);
   }
@@ -32,16 +32,16 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {

 __global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
-  DeviceHandle proxyChan = constProxyChans[blockIdx.x];
+  DeviceHandle portChan = constPortChans[blockIdx.x];
   if (threadIdx.x == 0) {
-    proxyChan.putWithSignal(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
-                            nElements * sizeof(int));
+    portChan.putWithSignal(rank * nElements * sizeof(int), remoteRank * nElements * sizeof(int),
+                           nElements * sizeof(int));
   }

   deviceSyncer.sync(gridDim.x);
   if (threadIdx.x == 0) {
-    proxyChan.flush();
-    proxyChan.wait();
+    portChan.flush();
+    portChan.wait();
   }
 }

@@ -149,14 +149,14 @@ void AllToAllTestEngine::allocateBuffer() {
 }

 void AllToAllTestEngine::setupConnections() {
-  std::vector> proxyChannels;
-  setupMeshConnections(proxyChannels, sendBuff_.get(), args_.maxBytes, recvBuff_.get(), args_.maxBytes);
+  std::vector> portChannels;
+  setupMeshConnections(portChannels, sendBuff_.get(), args_.maxBytes, recvBuff_.get(), args_.maxBytes);

-  if (proxyChannels.size() > sizeof(constProxyChans) / sizeof(DeviceHandle)) {
+  if (portChannels.size() > sizeof(constPortChans) / sizeof(DeviceHandle)) {
     std::runtime_error("unexpected error");
   }
-  CUDATHROW(cudaMemcpyToSymbol(constProxyChans, proxyChannels.data(),
-                               sizeof(DeviceHandle) * proxyChannels.size()));
+  CUDATHROW(cudaMemcpyToSymbol(constPortChans, portChannels.data(),
+                               sizeof(DeviceHandle) * portChannels.size()));
 }

 std::vector AllToAllTestEngine::getSendBuff() { return {sendBuff_.get()}; }
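The round-robin schedule in `localAlltoall` deserves a closer look: in round i, each rank sends to `(rank + i) % n` and expects data from `(rank - i + n) % n`, so every round forms a perfect matching and no link is oversubscribed. A tiny host-side illustration of just the pairing, under an assumed `nRanksPerNode` of 4:

```cpp
#include <cstdio>

int main() {
  const int n = 4;  // assumed nRanksPerNode
  for (int i = 1; i < n; ++i) {
    for (int rank = 0; rank < n; ++rank) {
      int sendTo = (rank + i) % n;
      int recvFrom = (rank - i + n) % n;
      std::printf("round %d: rank %d -> %d, <- %d\n", i, rank, sendTo, recvFrom);
    }
  }
  return 0;
}
```

Each rank appears exactly once as a sender and once as a receiver per round, which is what lets the kernel synchronize each round with a single `deviceSyncer.sync(nRanksPerNode - 1)`.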
// TODO(saemal): retrun the actual vector instead of void -void BaseTestEngine::setupMeshConnections(std::vector>& proxyChannels, +void BaseTestEngine::setupMeshConnections(std::vector>& portChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes, SetupChannelFunc setupChannel) { mscclpp::TransportFlags allTransports = mscclpp::Transport::CudaIpc; @@ -419,16 +419,16 @@ void BaseTestEngine::setupMeshConnections(std::vector(chanService_); for (size_t i = 0; i < connections.size(); ++i) { - proxyChannels.push_back(mscclpp::deviceHandle( - service->proxyChannel(service->buildAndAddSemaphore(*comm_, connections[i]), - service->addMemory(remoteRegMemories[i].get()), service->addMemory(inputBufRegMem)))); + portChannels.push_back(mscclpp::deviceHandle( + service->portChannel(service->buildAndAddSemaphore(*comm_, connections[i]), + service->addMemory(remoteRegMemories[i].get()), service->addMemory(inputBufRegMem)))); } } comm_->setup(); } -void BaseTestEngine::setupMeshConnections(std::vector& smChannels, void* inputBuff, +void BaseTestEngine::setupMeshConnections(std::vector& memoryChannels, void* inputBuff, size_t inputBuffBytes, void* outputBuff, size_t outputBuffBytes, ChannelSemantic semantic, size_t nChannelPerConnection) { mscclpp::TransportFlags allTransports = mscclpp::Transport::CudaIpc; @@ -446,11 +446,12 @@ void BaseTestEngine::setupMeshConnections(std::vector& smCha (outputBuff && semantic == ChannelSemantic::PUT) ? outputBufRegMem : inputBufRegMem; setupMeshConnectionsInternal(connections, localRegMemory, remoteRegMemories); - std::unordered_map>> smSemaphores; + std::unordered_map>> memorySemaphores; for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { for (size_t i = 0; i < nChannelPerConnection; ++i) { - smSemaphores[cid].emplace_back(std::make_shared(*comm_, connections[cid])); + memorySemaphores[cid].emplace_back( + std::make_shared(*comm_, connections[cid])); } } } @@ -459,16 +460,16 @@ void BaseTestEngine::setupMeshConnections(std::vector& smCha for (size_t i = 0; i < nChannelPerConnection; ++i) { for (size_t cid = 0; cid < connections.size(); ++cid) { if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) { - smChannels.emplace_back(smSemaphores[cid][i], remoteRegMemories[cid].get(), - (outputBuff && semantic == ChannelSemantic::GET) ? outputBuff : inputBufRegMem.data(), - outputBuff); + memoryChannels.emplace_back( + memorySemaphores[cid][i], remoteRegMemories[cid].get(), + (outputBuff && semantic == ChannelSemantic::GET) ? 
       }
     }
   }
 }
 
-void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::SmChannel>& smChannels,
-                                          std::vector<DeviceHandle<mscclpp::ProxyChannel>>& proxyChannels,
+void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::MemoryChannel>& memoryChannels,
+                                          std::vector<DeviceHandle<mscclpp::PortChannel>>& portChannels,
                                           void* inputBuff, size_t inputBuffBytes, void* putPacketBuff,
                                           size_t putPacketBuffBytes, void* getPacketBuff, size_t getPacketBuffBytes,
                                           void* outputBuff, size_t outputBuffBytes) {
@@ -500,13 +501,13 @@ void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::SmChannel>& smCha
     setupMeshConnectionsInternal(connections, outputBufRegMem, remoteRegMemoriesOutput, false);
   }
 
-  std::unordered_map<size_t, std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
+  std::unordered_map<size_t, std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
   std::unordered_map<size_t, mscclpp::SemaphoreId> connIdToSemId;
   auto service = std::dynamic_pointer_cast<mscclpp::ProxyService>(chanService_);
 
   for (size_t cid = 0; cid < connections.size(); ++cid) {
     if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
-      smSemaphores.emplace(cid, std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, connections[cid]));
+      memorySemaphores.emplace(cid, std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, connections[cid]));
     } else {
       connIdToSemId[cid] = service->buildAndAddSemaphore(*comm_, connections[cid]);
     }
@@ -515,16 +516,16 @@ void BaseTestEngine::setupMeshConnections(std::vector<mscclpp::SmChannel>& smCha
   for (size_t cid = 0; cid < connections.size(); ++cid) {
     if (connections[cid]->transport() == mscclpp::Transport::CudaIpc) {
-      smChannels.emplace_back(smSemaphores[cid],
-                              (outputBuff) ? remoteRegMemoriesOutput[cid].get() : remoteRegMemories[cid].get(),
-                              inputBufRegMem.data(), (outputBuff) ? outputBufRegMem.data() : nullptr);
+      memoryChannels.emplace_back(memorySemaphores[cid],
+                                  (outputBuff) ? remoteRegMemoriesOutput[cid].get() : remoteRegMemories[cid].get(),
+                                  inputBufRegMem.data(), (outputBuff) ? outputBufRegMem.data() : nullptr);
     } else {
       if (putPacketBuff == nullptr || getPacketBuff == nullptr) {
         throw std::runtime_error("IB transport requires putPacketBuff and getPacketBuff");
       }
-      proxyChannels.emplace_back(mscclpp::deviceHandle(
-          service->proxyChannel(connIdToSemId[cid], service->addMemory(remoteRegMemories[cid].get()),
-                                service->addMemory(putPacketBufRegMem))));
+      portChannels.emplace_back(mscclpp::deviceHandle(
+          service->portChannel(connIdToSemId[cid], service->addMemory(remoteRegMemories[cid].get()),
+                               service->addMemory(putPacketBufRegMem))));
     }
   }
 }
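The memory-mapped path changes the same way. Here is a comparable sketch for constructing a `MemoryChannel`; `makeMemoryChannel` and its parameters are placeholders, while the semaphore and channel constructors match `BaseTestEngine::setupMeshConnections` above:

```cpp
#include <memory>

#include <mscclpp/core.hpp>
#include <mscclpp/memory_channel.hpp>
#include <mscclpp/semaphore.hpp>

// Illustrative sketch (not part of this patch): a MemoryChannel pairs a
// device-to-device semaphore (MemoryDevice2DeviceSemaphore, formerly
// SmDevice2DeviceSemaphore) with the peer's registered memory and a local
// source pointer.
mscclpp::MemoryChannel makeMemoryChannel(mscclpp::Communicator& comm,
                                         std::shared_ptr<mscclpp::Connection> conn,
                                         mscclpp::RegisteredMemory remoteMem, void* localPtr) {
  auto semaphore = std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(comm, conn);
  return mscclpp::MemoryChannel(semaphore, remoteMem, localPtr);
}
```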
diff --git a/test/mscclpp-test/common.hpp b/test/mscclpp-test/common.hpp
index d7408cc29..0267713da 100644
--- a/test/mscclpp-test/common.hpp
+++ b/test/mscclpp-test/common.hpp
@@ -5,8 +5,8 @@
 #define MSCCLPP_TESTS_COMMON_H_
 
 #include <mscclpp/core.hpp>
-#include <mscclpp/proxy_channel.hpp>
-#include <mscclpp/sm_channel.hpp>
+#include <mscclpp/memory_channel.hpp>
+#include <mscclpp/port_channel.hpp>
 #include <mscclpp/utils.hpp>
 
 #define CUDATHROW(cmd) \
@@ -113,14 +113,14 @@ class BaseTestEngine {
                                          const mscclpp::RegisteredMemory&)>;
   template <class T>
   using DeviceHandle = mscclpp::DeviceHandle<T>;
-  void setupMeshConnections(std::vector<DeviceHandle<mscclpp::ProxyChannel>>& proxyChannels, void* inputBuff,
+  void setupMeshConnections(std::vector<DeviceHandle<mscclpp::PortChannel>>& portChannels, void* inputBuff,
                             size_t inputBuffBytes, void* outputBuff = nullptr, size_t outputBuffBytes = 0,
                             SetupChannelFunc setupChannel = nullptr);
-  void setupMeshConnections(std::vector<mscclpp::SmChannel>& smChannels, void* inputBuff, size_t inputBuffBytes,
+  void setupMeshConnections(std::vector<mscclpp::MemoryChannel>& memoryChannels, void* inputBuff, size_t inputBuffBytes,
                             void* outputBuff = nullptr, size_t outputBuffBytes = 0,
                             ChannelSemantic semantic = ChannelSemantic::PUT, size_t nChannelPerConnection = 1);
-  void setupMeshConnections(std::vector<mscclpp::SmChannel>& smChannels,
-                            std::vector<DeviceHandle<mscclpp::ProxyChannel>>& proxyChannels, void* inputBuff,
+  void setupMeshConnections(std::vector<mscclpp::MemoryChannel>& memoryChannels,
+                            std::vector<DeviceHandle<mscclpp::PortChannel>>& portChannels, void* inputBuff,
                             size_t inputBuffBytes, void* putPacketBuff = nullptr, size_t putPacketBuffBytes = 0,
                             void* getPacketBuff = nullptr, size_t getPacketBuffBytes = 0, void* outputBuff = nullptr,
                             size_t outputBuffBytes = 0);
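The header renames in this hunk are mechanical; downstream includes migrate one-to-one (a sketch, not part of the patch):

```cpp
// Before this patch:
//   #include <mscclpp/proxy_channel.hpp>  // ProxyChannel
//   #include <mscclpp/sm_channel.hpp>     // SmChannel
// After this patch:
#include <mscclpp/memory_channel.hpp>  // MemoryChannel (was SmChannel)
#include <mscclpp/port_channel.hpp>    // PortChannel (was ProxyChannel)
```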
diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu
index 0bd13e02c..99d7bd2f9 100644
--- a/test/mscclpp-test/sendrecv_test.cu
+++ b/test/mscclpp-test/sendrecv_test.cu
@@ -7,8 +7,8 @@
 #include <mscclpp/concurrency_device.hpp>
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
+#include <mscclpp/memory_channel.hpp>
 #include <mscclpp/semaphore.hpp>
-#include <mscclpp/sm_channel.hpp>
 #include <mscclpp/utils.hpp>
 #include <vector>
@@ -24,7 +24,7 @@ constexpr size_t MAX_BLOCKS_NUM = 32;
 template <class T>
 using DeviceHandle = mscclpp::DeviceHandle<T>;
 
-__constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[2];
+__constant__ DeviceHandle<mscclpp::MemoryChannel> constMemChans[2];
 
 inline int getBlockNum(size_t count) {
   return std::min((count + THRES_BYTES_PER_BLOCK - 1) / THRES_BYTES_PER_BLOCK, MAX_BLOCKS_NUM);
@@ -41,8 +41,8 @@ __global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBl
   size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
   int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
 
-  DeviceHandle<mscclpp::SmChannel> sendConn = constSmChans[0];
-  DeviceHandle<mscclpp::SmChannel> recvConn = constSmChans[1];
+  DeviceHandle<mscclpp::MemoryChannel> sendConn = constMemChans[0];
+  DeviceHandle<mscclpp::MemoryChannel> recvConn = constMemChans[1];
 
   sendConn.put(startIndex, startIndex, blockDataSize, threadIdx.x, blockDim.x);
   deviceSyncer.sync(gridDim.x);
@@ -131,7 +131,7 @@ class SendRecvTestEngine : public BaseTestEngine {
 
   std::vector<std::shared_ptr<int>> devicePtrs_;
   std::shared_ptr<int[]> expectedBuff_;
-  std::vector<mscclpp::SmChannel> smChannels_;
+  std::vector<mscclpp::MemoryChannel> memoryChannels_;
 };
 
 SendRecvTestEngine::SendRecvTestEngine(const TestArgs& args) : BaseTestEngine(args, "sendrecv") { inPlace_ = false; }
@@ -153,7 +153,7 @@ void SendRecvTestEngine::setupConnections() {
   std::array<int, 2> ranks = {sendToRank, recvFromRank};
 
   auto service = std::dynamic_pointer_cast<mscclpp::ProxyService>(chanService_);
-  std::vector<std::shared_ptr<mscclpp::SmDevice2DeviceSemaphore>> smSemaphores;
+  std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores;
 
   auto sendConnFuture =
       comm_->connectOnSetup(sendToRank, 0, getTransport(args_.rank, sendToRank, args_.nRanksPerNode, ibDevice));
@@ -161,12 +161,12 @@ void SendRecvTestEngine::setupConnections() {
     auto recvConnFuture =
         comm_->connectOnSetup(recvFromRank, 0, getTransport(args_.rank, recvFromRank, args_.nRanksPerNode, ibDevice));
     comm_->setup();
-    smSemaphores.push_back(std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
-    smSemaphores.push_back(std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, recvConnFuture.get()));
+    memorySemaphores.push_back(std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
+    memorySemaphores.push_back(std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, recvConnFuture.get()));
   } else {
     comm_->setup();
-    smSemaphores.push_back(std::make_shared<mscclpp::SmDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
-    smSemaphores.push_back(smSemaphores[0]);  // reuse the send channel if worldSize is 2
+    memorySemaphores.push_back(std::make_shared<mscclpp::MemoryDevice2DeviceSemaphore>(*comm_, sendConnFuture.get()));
+    memorySemaphores.push_back(memorySemaphores[0]);  // reuse the send channel if worldSize is 2
   }
   comm_->setup();
@@ -183,15 +183,15 @@ void SendRecvTestEngine::setupConnections() {
 
   // swap to make sure devicePtrs_[0] in local rank write to devicePtrs_[1] in remote rank
   std::swap(futureRemoteMemory[0], futureRemoteMemory[1]);
 
-  std::vector<DeviceHandle<mscclpp::SmChannel>> smChannelHandles(2);
+  std::vector<DeviceHandle<mscclpp::MemoryChannel>> memoryChannelHandles(2);
   for (int i : {0, 1}) {  // We assume ranks in the same node
-    smChannels_.emplace_back(smSemaphores[i], futureRemoteMemory[i].get(), (void*)localMemories[i].data());
+    memoryChannels_.emplace_back(memorySemaphores[i], futureRemoteMemory[i].get(), (void*)localMemories[i].data());
   }
-  std::transform(smChannels_.begin(), smChannels_.end(), smChannelHandles.begin(),
-                 [](const mscclpp::SmChannel& smChannel) { return smChannel.deviceHandle(); });
-  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(constSmChans, smChannelHandles.data(),
-                                       sizeof(DeviceHandle<mscclpp::SmChannel>) * smChannelHandles.size()));
+  std::transform(memoryChannels_.begin(), memoryChannels_.end(), memoryChannelHandles.begin(),
+                 [](const mscclpp::MemoryChannel& memoryChannel) { return memoryChannel.deviceHandle(); });
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(constMemChans, memoryChannelHandles.data(),
+                                       sizeof(DeviceHandle<mscclpp::MemoryChannel>) * memoryChannelHandles.size()));
 }
 
 std::vector<void*> SendRecvTestEngine::getSendBuff() { return {devicePtrs_[0].get()}; }
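Finally, a device-side sketch of the renamed handle in use, modeled on the sendrecv kernel above; `exampleKernel`, the single-channel layout, and the explicit `signal()`/`wait()` pairing are illustrative assumptions rather than code from this patch:

```cpp
#include <mscclpp/memory_channel.hpp>

__constant__ mscclpp::DeviceHandle<mscclpp::MemoryChannel> constChan;

// Illustrative sketch (not part of this patch): every thread of the block
// cooperates on the copy, since put() strides elements by threadIdx.x and
// blockDim.x; thread 0 then tells the peer the data has landed and waits
// for the peer's matching signal.
__global__ void exampleKernel(size_t bytes) {
  constChan.put(/*dstOffset=*/0, /*srcOffset=*/0, bytes, threadIdx.x, blockDim.x);
  __syncthreads();
  if (threadIdx.x == 0) {
    constChan.signal();  // mark our put() as complete for the peer
    constChan.wait();    // block until the peer signals back
  }
}
```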