2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
rev: v2.1.0
hooks:
- id: codespell
- args: ["--skip=docs/imgs/*,csrc/json.hpp,csrc/python/pybind_json/pybind_json.hpp"]
+ args: ["--skip=docs/imgs/*,csrc/jring.h,csrc/json.hpp,csrc/python/pybind_json/pybind_json.hpp"]

# - repo: https://github.com/myint/docformatter
# rev: v1.4
12 changes: 9 additions & 3 deletions CMakeLists.txt
@@ -11,8 +11,9 @@ slime_option(BUILD_NVSHMEM "Build NVSHMEM" OFF)
slime_option(BUILD_ASCEND_DIRECT "Build Ascend direct transport" OFF)

# Slime options for ops
- slime_option(BUILD_INTRA_OPS "Build intra LL collective ops" OFF)
- slime_option(BUILD_INTER_OPS "Build inter LL collective ops" OFF)
+ slime_option(BUILD_IBVERBS_OPS "Build ibverbs collective ops" OFF)
+ slime_option(BUILD_INTRA_OPS "Build intra collective ops" OFF)
+ slime_option(BUILD_INTER_OPS "Build inter collective ops" OFF)

# Slime options for custom python wrapper
slime_option(BUILD_PYTHON "Build python wrapper" OFF)
@@ -25,6 +26,11 @@ slime_option(BUILD_TORCH_PLUGIN "Build torch plugin" OFF)
slime_option(BUILD_BENCH "Build transfer engine benchmark" OFF)
slime_option(BUILD_TEST "Build test" OFF)

+ if(BUILD_IBVERBS_OPS AND NOT BUILD_RDMA)
+ message(STATUS "BUILD_IBVERBS_OPS requires BUILD_RDMA, enabling RDMA...")
+ set(BUILD_RDMA ON CACHE BOOL "Build RDMA" FORCE)
+ endif()

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@@ -47,7 +53,7 @@ else()
set(DLSLIME_INSTALL_PATH "lib")
endif()

- if (BUILD_TORCH_PLUGIN OR BUILD_INTRA_OPS OR BUILD_INTER_OPS)
+ if (BUILD_TORCH_PLUGIN OR BUILD_IBVERBS_OPS OR BUILD_INTRA_OPS OR BUILD_INTER_OPS)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/torch.cmake)
endif()

25 changes: 13 additions & 12 deletions README.md
@@ -124,7 +124,7 @@ torchrun --nnodes 2 --master-addr 10.130.8.143 --node-rank 1 --nproc-per-node 8
```

> \[!Note\]
- > The intra- and inter- examples example above enables CUDA Graph by default. --eager-mode falls back to eager mode.
+ > The `intra-` and `inter-` examples above enable CUDA Graph by default. `--eager-mode` falls back to eager mode.

## Install

@@ -157,17 +157,18 @@ mkdir -p DLSlime/build && cmake -DFLAG=<ON|OFF> ..

The `FLAG` can be

| Flag | Description | Platform | default |
| :----------------------- | :------------------------------------ | :------- | ------: |
| `BUILD_RDMA` | Build RDMA Transfer Engine | Hetero | ON |
| `BUILD_PYTHON` | Build Python wrapper | Hetero | ON |
| `BUILD_NVLINK` | Build NVLINK Transfer Engine | GPGPU | OFF |
| `BUILD_NVSHMEM` | Build NVShmem Transfer Engine | NVIDIA | OFF |
| `BUILD_ASCEND_DIRECT` | Build Ascend direct transport | ASCEND | OFF |
| `BUILD_TORCH_PLUGIN` | Build DLSlime as a torch backend | Hetero | OFF |
| `USE_GLOO_BACKEND` | Use GLOO RDMA Send/Recv torch backend | Hetero | OFF |
| `BUILD_INTRA_OPS` | Use INTRA Collective OPS | GPGPU | OFF |
| `BUILD_INTER_OPS` | Use INTER Collective OPS (NVSHMEM) | NVIDIA | OFF |
| Flag | Description | Platform | default |
| :-------------------- | :------------------------------------ | :------- | ------: |
| `BUILD_RDMA` | Build RDMA Transfer Engine | Hetero | ON |
| `BUILD_PYTHON` | Build Python wrapper | Hetero | ON |
| `BUILD_NVLINK` | Build NVLINK Transfer Engine | GPGPU | OFF |
| `BUILD_NVSHMEM` | Build NVShmem Transfer Engine | NVIDIA | OFF |
| `BUILD_ASCEND_DIRECT` | Build Ascend direct transport | ASCEND | OFF |
| `BUILD_TORCH_PLUGIN` | Build DLSlime as a torch backend | Hetero | OFF |
| `USE_GLOO_BACKEND` | Use GLOO RDMA Send/Recv torch backend | Hetero | OFF |
| `BUILD_IBVERBS_OPS` | Build IBVERBS Collective OPS | Hetero | OFF |
| `BUILD_INTRA_OPS` | Build INTRA Collective OPS | GPGPU | OFF |
| `BUILD_INTER_OPS` | Build INTER Collective OPS (NVSHMEM) | NVIDIA | OFF |

> \[!Note\]
> Please enable `USE_MECA` when using DLSlime as a torch backend in Metax platform.
4 changes: 1 addition & 3 deletions csrc/CMakeLists.txt
@@ -1,8 +1,6 @@
add_subdirectory(engine)

- if (BUILD_INTRA_OPS OR BUILD_INTER_OPS)
- add_subdirectory(ops)
- endif()
+ add_subdirectory(ops)

if (BUILD_PYTHON)
add_subdirectory(python)
20 changes: 20 additions & 0 deletions csrc/engine/rdma/rdma_assignment.h
@@ -127,8 +127,28 @@ class RDMASchedulerAssignment {
rdma_assignment_batch_(std::move(rdma_assignment_batch))
{
}

~RDMASchedulerAssignment();

+ int merge(std::shared_ptr<RDMASchedulerAssignment> assign) {
+ if (!assign) {
+ return 0;
+ }
+
+ int original_size = rdma_assignment_batch_.size();
+
+ rdma_assignment_batch_.reserve(original_size + assign->rdma_assignment_batch_.size());
+ rdma_assignment_batch_.insert(
+ rdma_assignment_batch_.end(),
+ assign->rdma_assignment_batch_.begin(),
+ assign->rdma_assignment_batch_.end()
+ );
+
+ assign->rdma_assignment_batch_.clear();
+
+ return rdma_assignment_batch_.size() - original_size; // number of assignments merged in
+ }

void query();
void wait();

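The `merge()` method added above folds another `RDMASchedulerAssignment`'s batch into this one, so a caller can combine several submissions and then track them through a single object. A minimal usage sketch, assuming an `RDMAContext` named `ctx` and two already-prepared `AssignmentBatch` objects; the variable names and the defaulted `submit()` arguments are illustrative, not part of this PR:

```cpp
// Hypothetical caller: ctx, opcode, batch_a and batch_b are set up elsewhere.
auto first  = ctx.submit(opcode, batch_a);   // std::shared_ptr<RDMASchedulerAssignment>
auto second = ctx.submit(opcode, batch_b);

int moved = first->merge(second);  // second's assignments now live in first
first->wait();                     // a single wait covers both submissions
```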
17 changes: 14 additions & 3 deletions csrc/engine/rdma/rdma_context.cpp
@@ -383,7 +383,7 @@ void RDMAContext::stop_future()
cq_thread_.join();
}
}

+ namespace {
void split_assign_by_max_length(OpCode opcode,
AssignmentBatch& batch,
AssignmentBatch& batch_split_after_max_length,
@@ -426,6 +426,7 @@ void nsplit_assign_by_step(OpCode opcode,
int step = (bsize + nstep - 1) / nstep;
split_assign_by_step(opcode, batch, batch_nsplit, step);
}
+ } // namespace

std::shared_ptr<RDMASchedulerAssignment>
RDMAContext::submit(OpCode opcode, AssignmentBatch& batch, callback_fn_t callback, int qpi, int32_t imm_data)
@@ -528,8 +529,18 @@ int64_t RDMAContext::post_recv_batch(int qpi, RDMAAssignmentSharedPtr assign)
int64_t ret = 0;
size_t batch_size = assign->batch_size();
struct ibv_recv_wr* bad_wr = nullptr;
- struct ibv_recv_wr* wr = new ibv_recv_wr[batch_size];
- struct ibv_sge* sge = new ibv_sge[batch_size];
+ struct ibv_recv_wr* wr;
+ struct ibv_sge* sge = nullptr;  // nullptr so the empty-batch branch never leaves it uninitialised
+ if (assign->batch_size() == 0) {
+ // Empty batch: post a single receive WR with no scatter/gather entries; wr_id
+ // still carries the callback info and target QP index for completion handling.
+ wr = new ibv_recv_wr{.wr_id = (uintptr_t)(new callback_info_with_qpi_t{assign->callback_info_, qpi}),
+ .next = nullptr,
+ .sg_list = nullptr,
+ .num_sge = 0};
+ }
+ else {
+ wr = new ibv_recv_wr[batch_size];
+ sge = new ibv_sge[batch_size];
+ }
for (size_t i = 0; i < batch_size; ++i) {

Assignment& subassign = assign->batch_[i];
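For context on the empty-batch branch above: posting a receive work request with `num_sge = 0` is the usual ibverbs idiom for completing an incoming message that carries no payload, for example a zero-length send or an RDMA write-with-immediate where only the immediate value matters. A standalone sketch of that idiom, independent of DLSlime's wrappers (the helper name and `wr_id` choice are illustrative):

```cpp
#include <infiniband/verbs.h>

#include <cstdint>
#include <cstring>

// Post one "empty" receive: no scatter/gather list, so it can complete a
// zero-length SEND or an RDMA WRITE_WITH_IMM on this queue pair. Returns
// ibv_post_recv()'s error code (0 on success).
int post_empty_recv(ibv_qp* qp, uint64_t wr_id)
{
    ibv_recv_wr wr;
    std::memset(&wr, 0, sizeof(wr));
    wr.wr_id   = wr_id;    // handed back in the completion entry
    wr.sg_list = nullptr;  // no data buffers
    wr.num_sge = 0;

    ibv_recv_wr* bad_wr = nullptr;
    return ibv_post_recv(qp, &wr, &bad_wr);
}
```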