From a5b240f9b5133cad2693351ebd0ef34cda0a9f3f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 6 Aug 2024 09:24:38 -0700 Subject: [PATCH] Add more communication kernels (#221) - Packet related kernels - Fused kernel for recv-reduce-send --------- Co-authored-by: Changho Hwang --- ark/include/ark/model.hpp | 39 +- ark/include/kernels/comm.h | 636 ++++++++---------- ark/include/kernels/reduce.h | 2 +- ark/model/model_op.cpp | 5 + ark/model/model_op_arg.cpp | 4 + ark/model/model_op_arg.hpp | 1 + ark/ops/ops_all_reduce.cpp | 106 +-- ark/ops/ops_all_reduce_test.cpp | 274 ++++++-- ark/ops/ops_communication.cpp | 535 ++++++++++++++- ark/ops/ops_communication.hpp | 71 ++ ark/ops/ops_communication_test.cpp | 164 ++++- ark/ops/ops_tensor.cpp | 16 +- ark/ops/ops_test_common.cpp | 7 +- ark/ops/ops_test_common.hpp | 4 +- ark/ops_old/ops_all_reduce.cc | 140 ---- .../tutorial/allreduce-packet/plan_gpu0.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu1.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu2.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu3.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu4.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu5.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu6.json | 569 ++++++++++++++++ .../tutorial/allreduce-packet/plan_gpu7.json | 569 ++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu0.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu1.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu2.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu3.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu4.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu5.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu6.json | 625 +++++++++++++++++ examples/tutorial/allreduce-sm/plan_gpu7.json | 625 +++++++++++++++++ examples/tutorial/multi_gpu_plan.py | 86 +++ python/ark/__init__.py | 41 +- python/ark/ops.py | 94 ++- python/ark/runtime.py | 9 + python/model_py.cpp | 16 +- third_party/CMakeLists.txt | 2 +- 37 files changed, 11042 insertions(+), 762 deletions(-) delete mode 100644 ark/ops_old/ops_all_reduce.cc create mode 100644 examples/tutorial/allreduce-packet/plan_gpu0.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu1.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu2.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu3.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu4.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu5.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu6.json create mode 100644 examples/tutorial/allreduce-packet/plan_gpu7.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu0.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu1.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu2.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu3.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu4.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu5.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu6.json create mode 100644 examples/tutorial/allreduce-sm/plan_gpu7.json create mode 100644 examples/tutorial/multi_gpu_plan.py diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index 9766b023e..3c4f22e22 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -67,12 +67,14 @@ class Model : public ModelGraph { /// @p strides should be greater than or equal to the padded shape. If the /// @p strides are not provided, they are set to the padded shape. If the /// padded shape is not provided, it is set to the @p shape. + /// @param rank Rank of the tensor. -1 means the rank of this model. /// @param name Name of the tensor. /// @return Pointer to a tensor object. /// Tensor tensor(const Dims &shape, const DataType &data_type, const Dims &strides = {}, const Dims &offsets = {}, - const Dims &padded_shape = {}, const std::string &name = ""); + const Dims &padded_shape = {}, int rank = -1, + const std::string &name = ""); Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {}, const Dims &offsets = {}, const Dims &padded_shape = {}, @@ -196,10 +198,23 @@ class Model : public ModelGraph { // operator is completed. Tensor recv(Tensor output, int remote_rank, int tag, const std::string &name = ""); - // - Tensor put_packet(Tensor input, Tensor local_tmp_buf, Tensor recv_buf, - int id, int rank, int dst_rank, size_t dst_offset, - int flag, const std::string &name = ""); + Tensor send_packet(Tensor input, int remote_rank, int tag, int flag, + Tensor output = NullTensor, + const std::string &name = ""); + Tensor recv_packet(Tensor output, int remote_rank, int tag, int flag, + Tensor scratch = NullTensor, + const std::string &name = ""); + Tensor recv_reduce_send_packet( + Tensor input, const std::vector &remote_ranks, int recv_tag, + int output_tag, unsigned int flag, Tensor output = NullTensor, + std::vector peer_outputs = {}, Tensor scratch = NullTensor, + const std::string &name = ""); + Tensor recv_reduce_send(Tensor input, const std::vector &remote_ranks, + int recv_tag, int output_tag, + Tensor output = NullTensor, + std::vector peer_outputs = {}, + Tensor scratch = NullTensor, + const std::string &name = ""); // Performs an all-reduce operator across all ranks, aggregating the input // tensors. Takes the `input` tensor, the current GPU's rank, and the // total number of ranks `rank_num`. @@ -220,7 +235,8 @@ class Model : public ModelGraph { Tensor output = NullTensor, const std::string &name = ""); // sync across multi devices - Tensor device_sync(Tensor input, int npeers, const std::string &name = ""); + Tensor device_sync(Tensor input, int rank, int rank_num, + const std::string &name = ""); // local reduce scatter Tensor local_reduce_scatter(Tensor input, int gpu_id, int ngpus_per_node, @@ -238,18 +254,7 @@ class Model : public ModelGraph { Tensor local_all_reduce(Tensor input, int gpu_id, int gpu_num, const std::string &name = ""); - Tensor local_all_reduce_packet(Tensor input, int gpu_id, int gpu_num, - const std::string &name = ""); - Tensor reduce_and_write_packet(Tensor input, Tensor scratch, Tensor output, - const std::vector &remote_peer_bufs, - int id, int rank, int npeers, - size_t elems_per_rank, size_t scratch_offset, - size_t remote_dst_offset, int flag, - const std::string &name = ""); - Tensor get_packet(Tensor input, Tensor output, size_t src_offset, - size_t dst_offset, size_t npackets, int flag, - const std::string &name = ""); }; } // namespace ark diff --git a/ark/include/kernels/comm.h b/ark/include/kernels/comm.h index e298cab4e..76fdfe27b 100644 --- a/ark/include/kernels/comm.h +++ b/ark/include/kernels/comm.h @@ -4,6 +4,7 @@ #ifndef ARK_KERNELS_COMM_H_ #define ARK_KERNELS_COMM_H_ +#include #include #include @@ -11,6 +12,7 @@ #include "common/broadcast.h" #include "common/fp16.h" #include "common/unit_op.h" +#include "reduce.h" extern __constant__ mscclpp::SimpleProxyChannelDeviceHandle ARK_PROXY_CHANS[]; extern __constant__ mscclpp::SimpleProxyChannelDeviceHandle @@ -20,6 +22,161 @@ extern __constant__ mscclpp::SmChannelDeviceHandle ARK_SM_CHANS[]; namespace ark { namespace comm { +template +struct PacketIntrinsic { + using InputType = InDataType; + using OutputType = OutDataType; + using Payload = typename PacketType::Payload; + + // Each thread deal with one packet at a time + static constexpr int NelemPerThread = 1; + static_assert( + !WritePacket || std::is_same::value, + "InputType must be the same as Payload when WritePacket is true"); + static_assert( + !ReadPacket || std::is_same::value, + "OutputType must be the same as Payload when ReadPacket is true"); + + static DEVICE void compute(OutputType *out, const InputType *in) { + if constexpr (WritePacket) { + InputType stage; + ark::load(&stage, in); + out->write(stage, Flag); + } + if constexpr (ReadPacket) { + OutDataType result = in->read(Flag, -1); + ark::store(out, &result); + } + } +}; + +template +struct PacketReduce { + using UnitOp = UnitOp; + using DataType = typename CompType::DataType; + static const int NelemPerThread = CompType::NelemPerThread; + + static_assert(NelemPerThread > 0, "NelemPerThread must be positive"); + static_assert(UnitOutDims::W % NelemPerThread == 0, + "UnitOutDims::W must be divisible by NelemPerThread"); + + static DEVICE void run(DataType *out, DataType *in, PacketType *scratch, + void *args, int uop_idx) { + int un = UnitOp::uop_idx_n(uop_idx); + int uc = UnitOp::uop_idx_c(uop_idx); + int uh = UnitOp::uop_idx_h(uop_idx); + int uw = UnitOp::uop_idx_w(uop_idx); + + for (int tid = UnitOp::thread_id();; tid += UnitOp::NumThreads) { + int tid_w = (tid * NelemPerThread) % UnitOutDims::W; + int tid_h = + ((tid * NelemPerThread) / UnitOutDims::W) % UnitOutDims::H; + int tid_c = + ((tid * NelemPerThread) / UnitOutDims::HW) % UnitOutDims::C; + int tid_n = (tid * NelemPerThread) / UnitOutDims::CHW; + + if (tid_n >= UnitOutDims::N) { + break; + } + + int idx_n = tid_n + un * UnitOutDims::N; + int idx_c = tid_c + uc * UnitOutDims::C; + int idx_h = tid_h + uh * UnitOutDims::H; + int idx_w = tid_w + uw * UnitOutDims::W; + + CompType::compute(out, in, scratch, args, idx_n, idx_c, idx_h, + idx_w); + } + } +}; + +template +struct PacketReduceCompType { + using DataType = _DataType; + using Payload = typename PacketType::Payload; + static const int NelemPerThread = sizeof(Payload) / sizeof(DataType); + + static DEVICE void compute(DataType *out, DataType *in, PacketType *scratch, + void *args, int idx_n, int idx_c, int idx_h, + int idx_w) { + int idx = idx_n * InShape::CHW + idx_c * InShape::HW + + idx_h * InShape::W + idx_w; + int idx_out = idx_n * OutDims::CHW + idx_c * OutDims::HW + + idx_h * OutDims::W + idx_w; + int idx_in = idx_n * InDims::CHW + idx_c * InDims::HW + + idx_h * InDims::W + idx_w; + uint32_t *output_offset = reinterpret_cast(args); + + DataType reduced[NelemPerThread]; + ark::load(reduced, in + idx_in); +#pragma unroll + for (int i = 0; i < NPeers; ++i) { + PacketType *pkg = + scratch + (idx + i * NElemsPerRank) / NelemPerThread; + Payload payload = pkg->read(Flag, -1); + ReduceType::template reduce( + reduced, reduced, reinterpret_cast(&payload)); + } + ark::store(out + idx_out, reduced); +#pragma unroll + for (int i = 0; i < NPeers; ++i) { + int remote_rank = i < Rank ? i : i + 1; + Payload *payload = reinterpret_cast(reduced); + char *output = + reinterpret_cast(ARK_SM_CHANS[remote_rank].dst_) + + output_offset[i]; + PacketType *pkg = + reinterpret_cast(output) + idx / NelemPerThread; + pkg->write(*payload, Flag); + } + } +}; + +template +struct ReduceCompType { + using DataType = _DataType; + static const int NelemPerThread = _NelemPerThread; + + static DEVICE void compute(DataType *out, DataType *in, DataType *scratch, + void *args, int idx_n, int idx_c, int idx_h, + int idx_w) { + int idx = idx_n * InShape::CHW + idx_c * InShape::HW + + idx_h * InShape::W + idx_w; + int idx_out = idx_n * OutDims::CHW + idx_c * OutDims::HW + + idx_h * OutDims::W + idx_w; + int idx_in = idx_n * InDims::CHW + idx_c * InDims::HW + + idx_h * InDims::W + idx_w; + uint32_t *output_offset = reinterpret_cast(args); + + DataType reduced[NelemPerThread]; + ark::load(reduced, + in + idx_in); +#pragma unroll + for (int i = 0; i < NPeers; ++i) { + DataType *data = scratch + (idx + i * NElemsPerRank); + ReduceType::template reduce(reduced, reduced, data); + } + ark::store(out + idx_out, + reduced); +#pragma unroll + for (int i = 0; i < NPeers; ++i) { + int remote_rank = i < Rank ? i : i + 1; + char *output = + reinterpret_cast(ARK_SM_CHANS[remote_rank].dst_) + + output_offset[i]; + DataType *remote_out = reinterpret_cast(output) + idx; + ark::store(remote_out, + reduced); + } + } +}; + enum class ChannelType { Proxy, SecondaryProxy, @@ -100,7 +257,7 @@ DEVICE void read(int ChanId, size_t remote_offset, size_t local_offset, DataType *local_data = reinterpret_cast(local); DataType *remote_data = reinterpret_cast(remote); DefaultBroadcast1::run(local_data, remote_data, uop_idx); } @@ -115,384 +272,43 @@ DEVICE void write(int ChanId, size_t remote_offset, size_t local_offset, DataType *local_data = reinterpret_cast(local); DataType *remote_data = reinterpret_cast(remote); DefaultBroadcast1::run(remote_data, local_data, uop_idx); } -template -union BytesPack {}; - -template <> -union BytesPack<16> { - uint16_t u16[8]; - uint32_t u32[4]; - uint64_t u64[2]; - ulonglong2 u128; -}; - -template <> -union BytesPack<8> { - uint16_t u16[4]; - uint32_t u32[2]; - uint64_t u64; -}; - -DEVICE void store(ulonglong2 *p, const BytesPack<16> &v) { -#if defined(ARK_TARGET_CUDA_ARCH) - asm volatile("st.volatile.global.v2.b64 [%0], {%1,%2};" - : - : "l"(p), "l"(v.u64[0]), "l"(v.u64[1]) - : "memory"); -#else // !defined(ARK_TARGET_CUDA_ARCH) - atomicStoreRelaxed(reinterpret_cast(&(p->x)), v.u64[0]); - atomicStoreRelaxed(reinterpret_cast(&(p->y)), v.u64[1]); -#endif // !defined(ARK_TARGET_CUDA_ARCH) -} - -DEVICE void store(uint64_t *p, const BytesPack<8> &v) { - atomicStoreRelaxed(p, v.u64); -} - -DEVICE void add_half8(BytesPack<16> &dst, BytesPack<16> &src) { - __half2 *pd = reinterpret_cast<__half2 *>(dst.u32); - __half2 *ps = reinterpret_cast<__half2 *>(src.u32); -#pragma unroll - for (int i = 0; i < 4; ++i) { - union { - __half2 h2; - uint32_t u32; - } d, s; - d.h2 = pd[i]; - s.h2 = ps[i]; - pd[i] = __hadd2(d.h2, s.h2); - } -} - -DEVICE void add_half4(BytesPack<8> &dst, BytesPack<8> &src) { - __half *pd = reinterpret_cast<__half *>(dst.u16); - __half *ps = reinterpret_cast<__half *>(src.u16); -#pragma unroll - for (int i = 0; i < 2; ++i) { - __half2 d, s; - d.x = pd[i * 2]; - d.y = pd[i * 2 + 1]; - s.x = ps[i * 2]; - s.y = ps[i * 2 + 1]; - d = __hadd2(d, s); - pd[i * 2] = d.x; - pd[i * 2 + 1] = d.y; - } -} - -template -DEVICE void device_sync(int, int) { - using UnitOp = UnitOp, ark::Vec<>, ark::Vec<>, 1, 0>; - if (UnitOp::thread_id() != 0) { - return; - } - for (int i = 0; i < NRanks - 1; ++i) { - ARK_SM_CHANS[i].signal(); - } - for (int i = 0; i < NRanks - 1; ++i) { - ARK_SM_CHANS[i].wait(-1); - } -} - -// Do reduce scatter in a single node -template -DEVICE void ring_read_and_reduce(size_t src_offset_0, size_t src_offset_1, - size_t src_offset_2, size_t src_offset_3, - size_t src_offset_4, size_t src_offset_5, - size_t src_offset_6, ark::fp16 *src, - int uop_idx, int) { - // treat channel dst as src since we read from it, and reduce to local - // memory - using UnitOp = UnitOp; - constexpr int total_tiles = - math::div_up::value; - constexpr int total_threads = total_tiles * UnitOp::NumThreads; - constexpr size_t nInt4 = Length / sizeof(int4); - const int tid = uop_idx * UnitOp::NumThreads + UnitOp::thread_id(); - BytesPack<16> *dst = - reinterpret_cast *>((uint8_t *)src + Offset); - size_t peer_offsets[] = {src_offset_0, src_offset_1, src_offset_2, - src_offset_3, src_offset_4, src_offset_5, - src_offset_6}; - for (int i = 0; i < NPeers; ++i) { - int chan_idx = (Rank + i) % NPeers; - const size_t index_offset4 = - (peer_offsets[chan_idx] + Offset) / sizeof(int4); - union { - BytesPack<16> data; - int4 val; - } ret; - for (int idx = tid; idx < nInt4; idx += total_threads) { - BytesPack<16> tmp = dst[idx]; - ret.val = ARK_SM_CHANS[chan_idx].read(index_offset4 + idx); - add_half8(tmp, ret.data); - store((ulonglong2 *)&dst[idx], tmp); - } - } -} - -// Do reduce scatter in a single node with AMD -template -DEVICE void parallel_read_and_reduce(size_t src_offset_0, size_t src_offset_1, - size_t src_offset_2, size_t src_offset_3, - size_t src_offset_4, size_t src_offset_5, - size_t src_offset_6, ark::fp16 *src, - int uop_idx, int) { - // treat channel dst as src since we read from it, and reduce to local - // memory - using UnitOp = UnitOp; - constexpr int total_tiles = - math::div_up::value; - constexpr int total_threads = total_tiles * UnitOp::NumThreads; - constexpr size_t nInt4 = Length / sizeof(int4); - const int tid = uop_idx * UnitOp::NumThreads + UnitOp::thread_id(); - BytesPack<16> *dst = - reinterpret_cast *>((uint8_t *)src + Offset); - size_t peer_offsets[] = {src_offset_0, src_offset_1, src_offset_2, - src_offset_3, src_offset_4, src_offset_5, - src_offset_6}; - for (int idx = tid; idx < nInt4; idx += total_threads) { - BytesPack<16> tmp = dst[idx]; - for (int i = 0; i < NPeers; ++i) { - int chan_idx = (Rank + i) % NPeers; - const size_t index_offset4 = - (peer_offsets[chan_idx] + Offset) / sizeof(int4); - union { - BytesPack<16> data; - int4 val; - } ret; - ret.val = ARK_SM_CHANS[chan_idx].read(index_offset4 + idx); - add_half8(tmp, ret.data); - } - store((ulonglong2 *)&dst[idx], tmp); - } -} - -template -DEVICE void read_and_reduce(size_t src_offset_0, size_t src_offset_1, - size_t src_offset_2, size_t src_offset_3, - size_t src_offset_4, size_t src_offset_5, - size_t src_offset_6, ark::fp16 *src, int uop_idx, - int) { - // TODO: support length not multiple of 16 - static_assert(Length % sizeof(int4) == 0, "Length must be multiple of 16"); -#if defined(ARK_TARGET_CUDA_ARCH) - return ring_read_and_reduce( - src_offset_0, src_offset_1, src_offset_2, src_offset_3, src_offset_4, - src_offset_5, src_offset_6, src, uop_idx, 0); -#else // !defined(ARK_TARGET_CUDA_ARCH) - return parallel_read_and_reduce( - src_offset_0, src_offset_1, src_offset_2, src_offset_3, src_offset_4, - src_offset_5, src_offset_6, src, uop_idx, 0); -#endif // !defined(ARK_TARGET_CUDA_ARCH) -} - -template -DEVICE void ring_gather_from_peers( - size_t ori_offset, size_t target_offset_0, size_t target_offset_1, - size_t target_offset_2, size_t target_offset_3, size_t target_offset_4, - size_t target_offset_5, size_t target_offset_6, ark::fp16 *, int uop_idx, - int) { - using UnitOp = UnitOp; - constexpr size_t shape_width = Shape::W * sizeof(ark::fp16); - constexpr size_t output_width = UnitOutDims::W * sizeof(ark::fp16); - constexpr size_t stride = Dims::W * sizeof(ark::fp16); - const int tid = UnitOp::thread_id(); - const int tile_hid = UnitOp::uop_idx_h(uop_idx); - const int tile_wid = UnitOp::uop_idx_w(uop_idx); - const size_t offset_in_width = - tile_wid * UnitOutDims::W * sizeof(ark::fp16); - size_t bytes_per_width = UnitOutDims::W * sizeof(ark::fp16); - if (offset_in_width + output_width > shape_width) { - bytes_per_width = shape_width - offset_in_width; - } - size_t peer_offsets[] = {target_offset_0, target_offset_1, target_offset_2, - target_offset_3, target_offset_4, target_offset_5, - target_offset_6}; -#pragma unroll - for (int i = 0; i < NPeers; ++i) { - int chan_idx = (Rank + i) % NPeers; - int remote_rank = chan_idx < Rank ? chan_idx : chan_idx + 1; - for (int j = tile_hid * UnitOutDims::H; - j < tile_hid * UnitOutDims::H + UnitOutDims::H; ++j) { - size_t offset = - shape_width * remote_rank + j * stride + offset_in_width; - ARK_SM_CHANS[chan_idx].get(peer_offsets[chan_idx] + offset, - ori_offset + offset, bytes_per_width, - tid, UnitOp::NumThreads); - } - } -} - -template -DEVICE void parallel_gather_from_peers( - size_t ori_offset, size_t target_offset_0, size_t target_offset_1, - size_t target_offset_2, size_t target_offset_3, size_t target_offset_4, - size_t target_offset_5, size_t target_offset_6, ark::fp16 *, int uop_idx, - int) { - using UnitOp = UnitOp; - constexpr size_t shape_width = Shape::W * sizeof(ark::fp16); - constexpr size_t output_width = UnitOutDims::W * sizeof(ark::fp16); - constexpr size_t stride = Dims::W * sizeof(ark::fp16); - const int tid = UnitOp::thread_id(); - const int tile_hid = UnitOp::uop_idx_h(uop_idx); - const int tile_wid = UnitOp::uop_idx_w(uop_idx); - const size_t offset_in_width = - tile_wid * UnitOutDims::W * sizeof(ark::fp16); - size_t bytes_per_width = UnitOutDims::W * sizeof(ark::fp16); - if (offset_in_width + output_width > shape_width) { - bytes_per_width = shape_width - offset_in_width; - } - size_t peer_offsets[] = {target_offset_0, target_offset_1, target_offset_2, - target_offset_3, target_offset_4, target_offset_5, - target_offset_6}; - const size_t unit_size = bytes_per_width >= (16 * UnitOp::NumThreads) - ? 16 * UnitOp::NumThreads - : bytes_per_width; - for (int i = tile_hid * UnitOutDims::H; - i < tile_hid * UnitOutDims::H + UnitOutDims::H; ++i) { - int base = 0; - for (; base < bytes_per_width; base += unit_size) { -#pragma unroll - for (int j = 0; j < NPeers; ++j) { - int chan_idx = (Rank + j) % NPeers; - int remote_rank = chan_idx < Rank ? chan_idx : chan_idx + 1; - size_t offset = shape_width * remote_rank + i * stride + - offset_in_width + base; - ARK_SM_CHANS[chan_idx].get(peer_offsets[chan_idx] + offset, - ori_offset + offset, unit_size, tid, - UnitOp::NumThreads); - } - } - if (base < bytes_per_width) { -#pragma unroll - for (int j = 0; j < NPeers; ++j) { - int chan_idx = (Rank + j) % NPeers; - int remote_rank = chan_idx < Rank ? chan_idx : chan_idx + 1; - size_t offset = shape_width * remote_rank + i * stride + - offset_in_width + base; - ARK_SM_CHANS[chan_idx].get( - peer_offsets[chan_idx] + offset, ori_offset + offset, - bytes_per_width - base, tid, UnitOp::NumThreads); - } - } - } -} - -template -DEVICE void gather_from_peers(size_t ori_offset, size_t target_offset_0, - size_t target_offset_1, size_t target_offset_2, - size_t target_offset_3, size_t target_offset_4, - size_t target_offset_5, size_t target_offset_6, - ark::fp16 *, int uop_idx, int) { -#if defined(ARK_TARGET_CUDA_ARCH) - return ring_gather_from_peers( - ori_offset, target_offset_0, target_offset_1, target_offset_2, - target_offset_3, target_offset_4, target_offset_5, target_offset_6, - nullptr, uop_idx, 0); -#else // !defined(ARK_TARGET_CUDA_ARCH) - return parallel_gather_from_peers( - ori_offset, target_offset_0, target_offset_1, target_offset_2, - target_offset_3, target_offset_4, target_offset_5, target_offset_6, - nullptr, uop_idx, 0); -#endif // !defined(ARK_TARGET_CUDA_ARCH) -} - -template -DEVICE void put_packet(size_t dst_offset, size_t src_offset, int uop_idx, int) { - using UnitOp = UnitOp; - constexpr int total_tiles = - math::div_up::value; - constexpr int total_threads = total_tiles * UnitOp::NumThreads; - constexpr int chan_idx = DstRank < Rank ? DstRank : DstRank - 1; - const int tid = uop_idx * UnitOp::NumThreads + UnitOp::thread_id(); - ARK_SM_CHANS[chan_idx].putPackets(dst_offset + DstOffset, src_offset, - Length, tid, total_threads, Flag); -} - -template -DEVICE void reduce_and_write_packet(ark::fp16 *dst, ark::fp16 *src, - void *scratch, size_t peer_offset_0, - size_t peer_offset_1, size_t peer_offset_2, - size_t peer_offset_3, size_t peer_offset_4, - size_t peer_offset_5, size_t peer_offset_6, - int uop_idx, int) { - // All channels have the same src_, so we can use any channel to get dst - using UnitOp = UnitOp; - constexpr int total_tiles = - math::div_up::value; - constexpr int total_threads = total_tiles * UnitOp::NumThreads; - constexpr int npackets_per_rank = - NElemsPerRank * sizeof(ark::fp16) / (sizeof(mscclpp::LLPacket) / 2); - uint8_t *scratch_base = (uint8_t *)scratch + ScratchOffset; - const int tid = uop_idx * UnitOp::NumThreads + UnitOp::thread_id(); - size_t peer_offsets[] = {peer_offset_0, peer_offset_1, peer_offset_2, - peer_offset_3, peer_offset_4, peer_offset_5, - peer_offset_6}; - for (int idx = tid; idx < npackets_per_rank; idx += total_threads) { - BytesPack<8> data; - data.u64 = *((uint64_t *)src + idx); - for (int index = 0; index < NPeers; index++) { - const int remote_rank = index < Rank ? index : index + 1; - mscclpp::LLPacket *pkt = (mscclpp::LLPacket *)(scratch_base) + - remote_rank * npackets_per_rank; - uint2 val = pkt[idx].read(Flag); - BytesPack<8> packet; - packet.u64 = *reinterpret_cast(&val); - add_half4(data, packet); - } - store((uint64_t *)dst + idx, data); - for (int index = 0; index < NPeers; index++) { - mscclpp::LLPacket *dst_pkt = - (mscclpp::LLPacket *)((char *)ARK_SM_CHANS[index].dst_ + - peer_offsets[index] + RemoteDstOffset); - dst_pkt[idx + Rank * npackets_per_rank].write(data.u32[0], - data.u32[1], Flag); - } - } +template +DEVICE void writePacket(int chan_id, size_t remote_offset, size_t local_offset, + int uop_idx, [[maybe_unused]] int smem_per_warp) { + using Payload = typename PacketType::Payload; + const mscclpp::SmChannelDeviceHandle &chan = ARK_SM_CHANS[chan_id]; + char *local = reinterpret_cast(chan.src_) + local_offset; + char *remote = reinterpret_cast(chan.dst_) + remote_offset; + Payload *local_data = reinterpret_cast(local); + PacketType *remote_data = reinterpret_cast(remote); + Broadcast1>::run(remote_data, local_data, uop_idx); } -template -DEVICE void get_from_packet(void *dst, void *src, int uop_idx, int) { - using UnitOp = UnitOp; - constexpr int total_tiles = - math::div_up::value; - constexpr int total_threads = total_tiles * UnitOp::NumThreads; - const int tid = uop_idx * UnitOp::NumThreads + UnitOp::thread_id(); - mscclpp::LLPacket *dst_pkt = (mscclpp::LLPacket *)((char *)src + SrcOffset); - BytesPack<8> packet; - uint64_t *dst_pkt_base = (uint64_t *)((char *)dst + DstOffset); - for (int idx = tid; idx < NPacket; idx += total_threads) { - uint2 data = dst_pkt[idx].read(Flag); - packet.u64 = *reinterpret_cast(&data); - store(dst_pkt_base + idx, packet); - } +template +DEVICE void readPacket(int chan_id, size_t output_offset, size_t scratch_offset, + int uop_idx, [[maybe_unused]] int smem_per_warp) { + using Payload = typename PacketType::Payload; + char *base_addr = reinterpret_cast(ARK_SM_CHANS[chan_id].src_); + char *scratch = base_addr + scratch_offset; + char *output = base_addr + output_offset; + PacketType *scratch_data = reinterpret_cast(scratch); + Payload *output_data = reinterpret_cast(output); + Broadcast1>::run(output_data, scratch_data, uop_idx); } - } // namespace comm template +template DEVICE void wait(int, int) { + if constexpr (!Wait) { + return; + } using UnitOp = UnitOp, ark::Vec<>, ark::Vec<>, 1, 0>; if (UnitOp::thread_id() == 0) { comm::wait(RemoteRank); } } +template +DEVICE void write_packet(size_t dst_offset, size_t src_offset, int uop_idx, + int) { + comm::writePacket(RemoteRank, dst_offset, + src_offset, uop_idx, 0); +} + +template +DEVICE void read_packet(size_t dst_offset, size_t src_offset, int uop_idx, + int) { + comm::readPacket(RemoteRank, dst_offset, + src_offset, uop_idx, 0); +} + +// TODO: add reduce type in future +template +DEVICE void read_reduce_and_write( + DataType *dst, DataType *src, void *scratch_base, uint32_t peer_offset_0, + uint32_t peer_offset_1, uint32_t peer_offset_2, uint32_t peer_offset_3, + uint32_t peer_offset_4, uint32_t peer_offset_5, uint32_t peer_offset_6, + int uop_idx, int) { + constexpr unsigned int nelems_per_rank = InShape::NCHW; + uint32_t peer_offsets[] = {peer_offset_0, peer_offset_1, peer_offset_2, + peer_offset_3, peer_offset_4, peer_offset_5, + peer_offset_6}; + if constexpr (std::is_same_v) { + DataType *scratch = reinterpret_cast(scratch_base); + constexpr int NelemPerThread = + DefaultNelemPerThread::value; + comm::PacketReduce< + OutDims, OutShape, UnitOutDims, NumWarps, SmemBytes, PacketType, + comm::ReduceCompType>::run(dst, src, scratch, + peer_offsets, uop_idx); + } + else { + PacketType *scratch = reinterpret_cast(scratch_base); + comm::PacketReduce< + OutDims, OutShape, UnitOutDims, NumWarps, SmemBytes, PacketType, + comm::PacketReduceCompType< + InDims, InShape, OutDims, PacketType, ReduceTypeSum, DataType, + Rank, NPeers, nelems_per_rank, Flag>>::run(dst, src, scratch, + peer_offsets, + uop_idx); + } +} + +template +DEVICE void device_sync(int, int) { + using UnitOp = UnitOp, ark::Vec<>, ark::Vec<>, 1, 0>; + int tid = UnitOp::thread_id(); + if (tid < NPeers) { + int remote_rank = tid < Rank ? tid : tid + 1; + comm::signal(remote_rank); + comm::wait(remote_rank); + } +} + } // namespace ark #endif // ARK_KERNELS_COMM_H_ diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 2dd79d2c3..9ebe6555c 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -291,7 +291,7 @@ struct EwiseReduceCompType(reduced); #pragma unroll - for (int i = 0; i < InShape::W; ++i) { + for (int i = 0; i < InShape::W; i += NelemPerThread) { ReduceType::template reduce(reduced, reduced, &in[idx_in + i]); } diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index 173d1a92f..5db8576e8 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -79,6 +79,11 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) { MODEL_OP_TYPE_REGISTER(Sub); MODEL_OP_TYPE_REGISTER(Tensor); MODEL_OP_TYPE_REGISTER(Transpose); + MODEL_OP_TYPE_REGISTER(SendPacket); + MODEL_OP_TYPE_REGISTER(RecvPacket); + MODEL_OP_TYPE_REGISTER(RecvReduceSendPacket); + MODEL_OP_TYPE_REGISTER(RecvReduceSend); + MODEL_OP_TYPE_REGISTER(DeviceSync); } auto it = instances.find(type_name); if (it == instances.end()) { diff --git a/ark/model/model_op_arg.cpp b/ark/model/model_op_arg.cpp index 4a8a7eea4..57b9ac7aa 100644 --- a/ark/model/model_op_arg.cpp +++ b/ark/model/model_op_arg.cpp @@ -21,6 +21,8 @@ Json ModelOpArg::serialize() const { j[type_name] = this->value().vector(); } else if (type_name == "INT") { j[type_name] = this->value(); + } else if (type_name == "UINT32") { + j[type_name] = this->value(); } else if (type_name == "INT64") { j[type_name] = this->value(); } else if (type_name == "UINT64") { @@ -48,6 +50,8 @@ ModelOpArg ModelOpArg::deserialize(const Json &serialized) { return ModelOpArg(Dims(value.get>())); } else if (type_name == "INT") { return ModelOpArg(value.get()); + } else if (type_name == "UINT32") { + return ModelOpArg(value.get()); } else if (type_name == "INT64") { return ModelOpArg(value.get()); } else if (type_name == "UINT64") { diff --git a/ark/model/model_op_arg.hpp b/ark/model/model_op_arg.hpp index c5e86bfe6..12ff7397c 100644 --- a/ark/model/model_op_arg.hpp +++ b/ark/model/model_op_arg.hpp @@ -55,6 +55,7 @@ class ModelOpArg : public ModelNamedT { }; REGISTER_MODEL_OP_ARG_TYPE(INT, int) +REGISTER_MODEL_OP_ARG_TYPE(UINT32, uint32_t) REGISTER_MODEL_OP_ARG_TYPE(INT64, int64_t) REGISTER_MODEL_OP_ARG_TYPE(UINT64, uint64_t) REGISTER_MODEL_OP_ARG_TYPE(BOOL, bool) diff --git a/ark/ops/ops_all_reduce.cpp b/ark/ops/ops_all_reduce.cpp index 82868374b..08f06257a 100644 --- a/ark/ops/ops_all_reduce.cpp +++ b/ark/ops/ops_all_reduce.cpp @@ -5,14 +5,17 @@ namespace ark { -Tensor Model::all_reduce(Tensor input, int gpu_id, int gpu_num, - [[maybe_unused]] Tensor output, const std::string &) { +Tensor Model::all_reduce(Tensor input, int gpu_id, int gpu_num, Tensor output, + const std::string &) { std::vector tags(gpu_num); for (int i = 0; i < gpu_num; i++) { tags[i] = this->unique_tag(); } + if (output.is_null()) { + output = this->copy(input); + } Tensor prev_recv = NullTensor; - Tensor cumulate = input; + Tensor cumulate = output; for (int i = 1; i < gpu_num; i++) { int gpu_dst = (gpu_id + i) % gpu_num; int gpu_src = (gpu_id + gpu_num - i) % gpu_num; @@ -24,106 +27,13 @@ Tensor Model::all_reduce(Tensor input, int gpu_id, int gpu_num, } send_data = this->send(send_data, gpu_dst, tags[gpu_id]); Tensor send_done_tensor = this->send_done(send_data); - Tensor recv_buf = this->tensor(input.shape(), input.data_type()); + Tensor recv_buf = this->tensor(output.shape(), output.data_type()); Tensor recv = this->identity(recv_buf, {send_done_tensor}); recv = this->recv(recv_buf, gpu_src, tags[gpu_src]); prev_recv = recv; - cumulate = this->add(cumulate, recv); + cumulate = this->add(cumulate, recv, cumulate); } return cumulate; } -// Tensor *Model::local_all_reduce(Tensor *input, int gpu_id, int gpu_num, -// const std::string &) { -// assert(input != nullptr); -// if (!input->is_sequential()) { -// LOG(WARN, -// "all_reduce may not work correctly if the input tensor is " -// "not contiguous"); -// } -// ark::Dims ori_shape = input->shape; -// Tensor *input_reshaped = this->reshape(input, {input->shape.size()}); -// Tensor *out = this->local_reduce_scatter(input_reshaped, gpu_id, -// gpu_num); Tensor *res = this->local_all_gather(out, gpu_id, gpu_num); -// return this->reshape(res, ori_shape); -// } - -// Tensor *Model::local_all_reduce_packet(Tensor *input, int gpu_id, int -// gpu_num, -// const std::string &) { -// assert(input != nullptr); -// // We only support out-of-place all_reduce -// if (input->ndims() > 1) { -// ERR(InvalidUsageError, "supports only 1D input"); -// } -// if (!input->is_sequential()) { -// LOG(WARN, -// "all_reduce may not work correctly if the input tensor is " -// "not contiguous"); -// } -// Tensor *out = this->tensor(input->shape, input->type); -// // only half of the packets are used to store data -// const int num_packets = input->shape_bytes() / (MSCCLPP_PACKET_SIZE / 2); -// const int scratch_nelems = num_packets * -// 2 /*oringinal data & reduced result*/ * -// 2 /*double buffer*/; -// Dims scratch_shape = { -// static_cast(scratch_nelems * MSCCLPP_PACKET_SIZE)}; -// Tensor *scratch = this->tensor(scratch_shape, UINT8); -// int npeer = gpu_num - 1; -// std::vector outputs; -// std::vector remote_scratches; -// size_t nelems_per_rank = -// input->shape_bytes() / input->type_bytes() / gpu_num; -// size_t npackets_per_rank = num_packets / gpu_num; -// int flag = this->impl->reduce_packet_flag; -// size_t scratch_base_offset = -// (flag & 1) ? 0 : num_packets * MSCCLPP_PACKET_SIZE; -// size_t scratch_result_offset = (flag & 1) -// ? 2 * num_packets * -// MSCCLPP_PACKET_SIZE : 3 * num_packets -// * MSCCLPP_PACKET_SIZE; -// int id = this->impl->next_eid; -// std::vector sharded_inputs = -// this->sharding(input, 0, nelems_per_rank); -// std::vector sharded_outputs = -// this->sharding(out, 0, nelems_per_rank); -// for (int i = 0; i < npeer; ++i) { -// int remote_rank = i < gpu_id ? i : i + 1; -// Tensor *remote_scratch = this->tensor(scratch_shape, UINT8); -// remote_scratches.push_back(remote_scratch); -// Tensor *out = -// this->put_packet(sharded_inputs[remote_rank], scratch, -// remote_scratch, id, gpu_id, remote_rank, -// scratch_base_offset + npackets_per_rank * gpu_id -// * -// MSCCLPP_PACKET_SIZE, -// flag); -// outputs.push_back(out); -// } -// Tensor *input_sharded = this->identity(sharded_inputs[gpu_id], outputs); -// // This op should reduce from the scratch buffer and write to the remote. -// Tensor *out_stage2 = this->reduce_and_write_packet( -// input_sharded, scratch, sharded_outputs[gpu_id], remote_scratches, -// id, gpu_id, npeer, nelems_per_rank, scratch_base_offset, -// scratch_result_offset, flag); -// // Get the result from the scratch buffer. -// Tensor *scratch_stage3 = this->identity(scratch, {out_stage2}); -// outputs.clear(); -// for (int i = 0; i < npeer; ++i) { -// int remote_rank = i < gpu_id ? i : i + 1; -// size_t dst_offset = nelems_per_rank * remote_rank * -// input->type_bytes(); size_t src_offset = scratch_result_offset + -// npackets_per_rank * -// remote_rank * -// MSCCLPP_PACKET_SIZE; -// Tensor *res = this->get_packet(scratch_stage3, out, src_offset, -// dst_offset, npackets_per_rank, flag); -// outputs.push_back(res); -// } -// this->impl->next_eid += 1; -// this->impl->reduce_packet_flag += 1; -// return this->identity(out, outputs); -// } - } // namespace ark diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index c73426f24..a37d211f7 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include "model/model_buffer.hpp" #include "model/model_node.hpp" #include "model/model_op.hpp" #include "ops_test_common.hpp" @@ -46,54 +47,203 @@ void test_all_reduce_internal(ark::DimType nelem) { ark::unittest::wait_all_processes(); } -// void test_local_all_reduce_8gpus_internel(size_t nelem, int iter) { -// constexpr int num_gpus = 8; -// for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { -// ark::unittest::spawn_process([gpu_id, nelem, num_gpus, iter]() { -// // Each GPU's data is equal to its GPU ID + 1. -// ark::Model m(gpu_id, num_gpus); -// ark::Tensor data = m.tensor({nelem}, ark::FP16); -// std::vector data_buf(nelem); -// for (size_t i = 0; i < nelem; ++i) { -// data_buf[i] = ark::half_t(gpu_id + 1); -// } -// ark::Tensor *output = m.local_all_reduce(data, gpu_id, num_gpus); -// auto result = -// ark::op_test("all_reduce", m, {data}, {output}, -// baseline_all_reduce, -// {data_buf.data()}, true, gpu_id, num_gpus, 16); -// UNITTEST_LOG(result); -// UNITTEST_EQ(result.max_diff[0], 0.0f); -// return ark::unittest::SUCCESS; -// }); -// } -// ark::unittest::wait_all_processes(); -// } - -// void test_local_all_reduce_packet_8gpus_internel(size_t nelem, int iter) { -// constexpr int num_gpus = 8; -// for (int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { -// ark::unittest::spawn_process([gpu_id, nelem, num_gpus, iter]() { -// // Each GPU's data is equal to its GPU ID + 1. -// ark::Model m{gpu_id}; -// ark::Tensor *data = m.tensor(ark::Dims(nelem), ark::FP16); -// std::vector data_buf(nelem); -// for (size_t i = 0; i < nelem; ++i) { -// data_buf[i] = ark::half_t(gpu_id + 1); -// } -// ark::Tensor *output = -// m.local_all_reduce_packet(data, gpu_id, num_gpus); -// auto result = -// ark::op_test("all_reduce_packet", m, {data}, {output}, -// baseline_all_reduce, -// {data_buf.data()}, false, gpu_id, num_gpus, 16); -// UNITTEST_LOG(result); -// UNITTEST_EQ(result.max_diff[0], 0.0f); -// return ark::unittest::SUCCESS; -// }); -// } -// ark::unittest::wait_all_processes(); -// } +ark::Tensor all_reduce_packet(ark::Model &m, ark::Tensor input, int rank, + int rank_num, int flag, ark::Tensor output) { + int tag_send_reduce = m.unique_tag(); + int tag_output = m.unique_tag(); + if (output.is_null()) { + output = m.tensor(input.shape(), input.data_type(), input.strides(), + input.offsets(), input.padded_shape()); + } + std::vector remote_ranks; + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + remote_ranks.push_back(i); + } + } + // need to make sure input is contiguous, and we flatten the input tensor + ark::Tensor reshaped_input = m.reshape(input, {input.shape().nelems()}); + ark::Tensor reshaped_output = m.reshape(output, {output.shape().nelems()}); + int nelems_per_rank = reshaped_input.shape().nelems() / rank_num; + uint32_t nbytes_per_rank = + nelems_per_rank * reshaped_input.data_type().bytes(); + std::vector sharded_inputs = + m.sharding(reshaped_input, 0, nelems_per_rank); + std::vector sharded_outputs = + m.sharding(reshaped_output, 0, nelems_per_rank); + int npeer = rank_num - 1; + size_t scratch_off = flag % 2 == 0 ? 0 : nbytes_per_rank * npeer * 2; + ark::Dims scratch_strides = {nbytes_per_rank * 2 * npeer * 2}; + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + int off_index = i < rank ? rank - 1 : rank; + ark::Tensor scratch_tensor = m.tensor( + nbytes_per_rank * 2, ark::UINT8, scratch_strides, + ark::Dims(scratch_off + nbytes_per_rank * off_index * 2), + ark::Dims(nbytes_per_rank * 2), i); + m.send_packet(sharded_inputs[i], i, tag_send_reduce, flag, + scratch_tensor); + } + } + std::vector deps; + ark::Tensor scratch = + m.tensor(nbytes_per_rank * 2 * npeer, ark::UINT8, scratch_strides, + scratch_off, nbytes_per_rank * 2 * npeer); + std::vector outputs; + size_t out_off = flag % 2 == 0 ? 0 : nbytes_per_rank * 2; + ark::Dims out_shape = {nbytes_per_rank * 2}; + ark::Dims out_strides = {nbytes_per_rank * 2 * 2}; // packet + double buffer + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + outputs.push_back(m.tensor(out_shape, ark::UINT8, out_strides, + out_off, out_shape, i)); + } + } + deps.push_back(m.recv_reduce_send_packet( + sharded_inputs[rank], remote_ranks, tag_send_reduce, tag_output, flag, + sharded_outputs[rank], outputs, scratch)); + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + ark::Tensor scratch_tensor = + m.tensor(out_shape, ark::UINT8, out_strides, ark::Dims(out_off), + out_shape); + deps.push_back(m.recv_packet(sharded_outputs[i], i, tag_output, + flag, scratch_tensor)); + } + } + return m.identity(output, deps); +} + +template +void test_all_reduce_packet_internal(ark::DimType nelem) { + for (int gpu_id = 0; gpu_id < NumGpus; ++gpu_id) { + ark::unittest::spawn_process([gpu_id, nelem]() { + // Each GPU's data is equal to its GPU ID + 1. + ark::Model m(gpu_id, NumGpus); + ark::Tensor ones = m.tensor({nelem}, ark::FP16); + ark::Tensor data = m.mul(ones, float(gpu_id + 1)); + ark::Tensor output = all_reduce_packet(m, data, gpu_id, NumGpus, 1, data); + + std::vector ones_vec(ones.shape().nelems(), + ark::half_t(1.0f)); + auto result = + ark::op_test("all_reduce_packet", m, {ones}, {output}, + baseline_all_reduce, + {ones_vec.data()}, false, gpu_id, NumGpus); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); + return ark::unittest::SUCCESS; + }); + } + ark::unittest::wait_all_processes(); +} + +ark::Tensor all_reduce_sm(ark::Model &m, ark::Tensor input, int rank, + int rank_num, ark::Tensor output) { + int send_tag = m.unique_tag(); + int recv_tag = m.unique_tag(); + if (output.is_null()) { + output = m.tensor(input.shape(), input.data_type(), input.strides(), + input.offsets(), input.padded_shape()); + } + std::vector remote_ranks; + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + remote_ranks.push_back(i); + } + } + ark::Tensor reshaped_input = m.reshape(input, {input.shape().nelems()}); + ark::Tensor reshaped_output = m.reshape(output, {output.shape().nelems()}); + int nelems_per_rank = reshaped_input.shape().nelems() / rank_num; + int npeer = rank_num - 1; + ark::Tensor scratch_tensor = + m.tensor(nelems_per_rank * npeer, reshaped_input.data_type()); + std::vector sharded_inputs = + m.sharding(reshaped_input, 0, nelems_per_rank); + std::vector sharded_scratch = + m.sharding(scratch_tensor, 0, nelems_per_rank); + std::vector shared_outputs = + m.sharding(reshaped_output, 0, nelems_per_rank); + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + int remote_off = i < rank ? rank - 1 : rank; + ark::Tensor scratch = + m.tensor(nelems_per_rank, reshaped_input.data_type(), + {nelems_per_rank * npeer}, + ark::Dims(nelems_per_rank * remote_off), + ark::Dims(nelems_per_rank), i); + m.send(sharded_inputs[i], i, send_tag, scratch); + } + } + m.device_sync(reshaped_input, rank, rank_num); + m.recv_reduce_send(sharded_inputs[rank], remote_ranks, send_tag, recv_tag, + sharded_inputs[rank]); + for (int i = 0; i < rank_num; i++) { + if (i != rank) { + int peer_id = i < rank ? i : i - 1; + m.recv(sharded_inputs[peer_id], i, recv_tag); + } + } + ark::Tensor res = m.device_sync(input, rank, rank_num); + return res; +} + + +template +void test_all_reduce_sm_internal(ark::DimType nelem) { + auto config_rule = [nelem](const std::string op_str, const std::string) { + const int tile_y = 64 /*nthreads per wrap*/ * 8 /*nelems per thread*/ * + 8 /*num wraps*/; + const int num_tasks = nelem / tile_y / NumGpus; + auto op = nlohmann::json::parse(op_str); + nlohmann::json config; + if (op.at("Type") == "Send") { + config["ChannelType"] = "Sm"; + config["Signal"] = false; + config["Tile"] = {1, tile_y}; + config["NumTasks"] = num_tasks; + config["NumWarps"] = 8; + config["SramBytes"] = 0; + } else if (op.at("Type") == "DeviceSync") { + config["ChannelType"] = "Sm"; + config["NumTasks"] = 1; + config["NumWarps"] = 1; + config["SramBytes"] = 0; + } else if (op.at("Type") == "Recv") { + config["ChannelType"] = "Sm"; + config["NumTasks"] = 1; + config["NumWarps"] = 1; + config["SramBytes"] = 0; + config["Wait"] = false; + } else if (op.at("Type") == "RecvReduceSend") { + config["NumTasks"] = num_tasks; + config["NumWarps"] = 8; + config["SramBytes"] = 0; + config["Tile"] = {1, tile_y}; + } + return config.dump(); + }; + for (int gpu_id = 0; gpu_id < NumGpus; ++gpu_id) { + ark::unittest::spawn_process([gpu_id, nelem, config_rule]() { + // Each GPU's data is equal to its GPU ID + 1. + ark::Model m(gpu_id, NumGpus); + ark::Tensor ones = m.tensor({nelem}, ark::FP16); + ark::Tensor data = m.mul(ones, float(gpu_id + 1)); + ark::Tensor output = all_reduce_sm(m, data, gpu_id, NumGpus, data); + + std::vector ones_vec(ones.shape().nelems(), + ark::half_t(1.0f)); + auto result = ark::op_test( + "all_reduce_sm", m, {ones}, {output}, + baseline_all_reduce, {ones_vec.data()}, + false, gpu_id, NumGpus, config_rule); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); + return ark::unittest::SUCCESS; + }); + } + ark::unittest::wait_all_processes(); +} ark::unittest::State test_all_reduce_4gpus() { test_all_reduce_internal<4>(64); @@ -107,8 +257,36 @@ ark::unittest::State test_all_reduce_8gpus() { return ark::unittest::SUCCESS; } +ark::unittest::State test_all_reduce_packet_4gpus() { + test_all_reduce_packet_internal<4>(2048); + test_all_reduce_packet_internal<4>(8192); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_all_reduce_packet_8gpus() { + test_all_reduce_packet_internal<8>(2048); + test_all_reduce_packet_internal<8>(8192); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_all_reduce_sm_4gpus() { + test_all_reduce_sm_internal<4>(2048 * 1024); + test_all_reduce_sm_internal<4>(8192 * 1024); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_all_reduce_sm_8gpus() { + test_all_reduce_sm_internal<8>(2048 * 1024); + test_all_reduce_sm_internal<8>(8192 * 1024); + return ark::unittest::SUCCESS; +} + int main() { UNITTEST(test_all_reduce_4gpus); UNITTEST(test_all_reduce_8gpus); + UNITTEST(test_all_reduce_packet_4gpus); + UNITTEST(test_all_reduce_packet_8gpus); + UNITTEST(test_all_reduce_sm_4gpus); + UNITTEST(test_all_reduce_sm_8gpus); return 0; } diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index e335f869e..baf7aafa2 100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -5,6 +5,14 @@ #include "ops_common.hpp" +namespace { +static const std::map packet_payload_size_map = { + {"mscclpp::LL8Packet", 4}, + {"mscclpp::LL16Packet", 8}, +}; +static const int MAX_NUM_PEERS = 7; +} // namespace + namespace ark { ModelOpSend::ModelOpSend(ModelTensorRef input, int remote_rank, int tag, @@ -12,7 +20,6 @@ ModelOpSend::ModelOpSend(ModelTensorRef input, int remote_rank, int tag, : ModelOp("Send") { check_null(input); if (output) { - // TODO: verify output shape and strides if (output->buffer()->rank() != remote_rank) { ERR(ModelError, "invalid buffer rank: ", output->buffer()->rank(), ", expected: ", remote_rank); @@ -39,6 +46,8 @@ std::string ModelOpSend::impl_name(const Json &config) const { auto &input = read_tensors_[0]; auto &output = write_tensors_[0]; int remote_rank = output->buffer()->rank(); + bool signal = config["Signal"]; + int num_warps = config["NumWarps"]; std::string channel_type = config["ChannelType"]; if (channel_type != "Proxy" && channel_type != "SecondaryProxy" && channel_type != "Sm") { @@ -53,12 +62,12 @@ std::string ModelOpSend::impl_name(const Json &config) const { } return function_name_string( "put", - {"comm::ChannelType::" + channel_type, std::to_string(true), + {"comm::ChannelType::" + channel_type, std::to_string(signal), std::to_string(remote_rank), vec_string(input->strides().dims4()), vec_string(input->shape().dims4()), vec_string(output->strides().dims4()), vec_string(output->shape().dims4()), vec_string(unit_out_dims), - std::to_string(1), std::to_string(0), + std::to_string(num_warps), std::to_string(0), output->data_type()->type_str()}); } @@ -130,8 +139,9 @@ ModelOpRecv::ModelOpRecv(ModelTensorRef output, int remote_rank, int tag) std::string ModelOpRecv::impl_name(const Json &config) const { check_fields_config(config, - {"ChannelType", "NumTasks", "NumWarps", "SramBytes"}); + {"ChannelType", "NumTasks", "NumWarps", "SramBytes", "Wait"}); std::string channel_type = config["ChannelType"]; + bool wait = config["Wait"]; if (channel_type != "Proxy" && channel_type != "SecondaryProxy" && channel_type != "Sm") { ERR(ModelError, "invalid channel type: ", channel_type); @@ -140,8 +150,9 @@ std::string ModelOpRecv::impl_name(const Json &config) const { int remote_rank = input->buffer()->rank(); int max_spin_cnt = -1; return function_name_string( - "wait", {"comm::ChannelType::" + channel_type, - std::to_string(remote_rank), std::to_string(max_spin_cnt)}); + "wait", + {"comm::ChannelType::" + channel_type, std::to_string(remote_rank), + std::to_string(max_spin_cnt), std::to_string(wait)}); } std::vector ModelOpRecv::impl_args([ @@ -150,6 +161,427 @@ std::vector ModelOpRecv::impl_args([ } Json ModelOpRecv::default_config([[maybe_unused]] const ArchRef arch) const { + return {{"ChannelType", "Proxy"}, + {"NumTasks", 1}, + {"NumWarps", 1}, + {"SramBytes", 0}, + {"Wait", true}}; +} + +ModelOpSendPacket::ModelOpSendPacket(ModelTensorRef input, int remote_rank, + int tag, uint32_t flag, + ModelTensorRef output) + : ModelOp("SendPacket") { + check_null(input); + if (output) { + // TODO: verify output shape and strides + if (output->buffer()->rank() != remote_rank) { + ERR(ModelError, "invalid buffer rank: ", output->buffer()->rank(), + ", expected: ", remote_rank); + } + } else { + // For packet output, expand the last dimension to 2x + Dims output_shape(input->shape_bytes() * 2); + output = std::make_shared( + UINT8.ref(), std::make_shared(remote_rank), + output_shape); + } + output->buffer()->tag_recv(-1, tag); + ModelTensorRef result = std::make_shared(*output); + + read_tensors_ = {input}; + write_tensors_ = {output}; + result_tensors_ = {result}; + args_ = {{"Flag", ModelOpArg(flag)}}; + verify(); +} + +std::string ModelOpSendPacket::impl_name(const Json &config) const { + check_fields_config( + config, {"NumTasks", "NumWarps", "Tile", "SramBytes", "PacketType"}); + auto &input = read_tensors_[0]; + auto &output = write_tensors_[0]; + uint32_t flag = args_.at("Flag").value(); + int remote_rank = output->buffer()->rank(); + int num_warps = config.at("NumWarps"); + auto &tile_shape = config.at("Tile"); + std::string packet_type = config.at("PacketType"); + Dims unit_out_dims = {1, 1, tile_shape[0], tile_shape[1]}; + const size_t packet_payload_size = packet_payload_size_map.at(packet_type); + const size_t scale_factor = + packet_payload_size / input->data_type()->bytes(); + if (scale_factor == 0) { + ERR(ModelError, + "unsupported data type: ", input->data_type()->type_str()); + } + Dims in_dims[] = {input->strides().dims4(), input->shape().dims4()}; + for (auto &dim : in_dims) { + dim[3] /= scale_factor; + } + Dims out_dims[] = {output->strides().dims4(), output->shape().dims4(), + unit_out_dims}; + for (auto &dim : out_dims) { + dim[3] = dim[3] / packet_payload_size / 2; + } + return function_name_string( + "write_packet", {std::to_string(remote_rank), vec_string(in_dims[0]), + vec_string(in_dims[1]), vec_string(out_dims[0]), + vec_string(out_dims[1]), vec_string(out_dims[2]), + std::to_string(num_warps), std::to_string(0), + packet_type, std::to_string(flag)}); +} + +std::vector ModelOpSendPacket::impl_args([ + [maybe_unused]] const Json &config) const { + return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[0])}; +} + +Json ModelOpSendPacket::default_config([ + [maybe_unused]] const ArchRef arch) const { + Json config; + if (arch->belongs_to(ARCH_ROCM)) { + config["PacketType"] = "mscclpp::LL8Packet"; + } else { + config["PacketType"] = "mscclpp::LL16Packet"; + } + config["NumWarps"] = 1; + config["SramBytes"] = 0; + const auto &shape = result_tensors_[0]->shape().dims4(); + size_t tile_x = 1; + size_t tile_y = 512; + config["Tile"] = {tile_x, tile_y}; + size_t num_tasks = shape[0] * shape[1]; + num_tasks *= (shape[2] + tile_x - 1) / tile_x; + num_tasks *= (shape[3] + tile_y - 1) / tile_y; + config["NumTasks"] = num_tasks; + return config; +} + +ModelOpRecvPacket::ModelOpRecvPacket(ModelTensorRef output, int remote_rank, + int tag, uint32_t flag, + ModelTensorRef scratch) + : ModelOp("RecvPacket") { + check_null(output); + int local_rank = output->buffer()->rank(); + ModelTensorRef result = std::make_shared(*output); + if (scratch) { + if (scratch->buffer()->rank() != local_rank) { + ERR(ModelError, "invalid buffer rank: ", scratch->buffer()->rank(), + ", expected: ", local_rank); + } + } else { + // For packet output, expand the last dimension to 2x + Dims scratch_shape(output->shape_bytes() * 2); + scratch = std::make_shared( + UINT8.ref(), std::make_shared(local_rank), + scratch_shape); + } + ModelTensorRef input = std::make_shared( + output->data_type(), std::make_shared(remote_rank), + output->shape()); + scratch->buffer()->tag_recv(remote_rank, tag); + + read_tensors_ = {input, scratch}; + write_tensors_ = {output}; + result_tensors_ = {result}; + args_ = {{"Flag", ModelOpArg(flag)}}; + verify(); +} + +std::string ModelOpRecvPacket::impl_name(const Json &config) const { + check_fields_config( + config, {"NumTasks", "NumWarps", "Tile", "SramBytes", "PacketType"}); + auto &input = read_tensors_[1]; + auto &peer_tensor = read_tensors_[0]; + auto &output = write_tensors_[0]; + uint32_t flag = args_.at("Flag").value(); + int num_warps = config.at("NumWarps"); + auto &tile_shape = config.at("Tile"); + std::string packet_type = config.at("PacketType"); + int remote_rank = peer_tensor->buffer()->rank(); + Dims unit_out_dims = {1, 1, tile_shape[0], tile_shape[1]}; + const size_t packet_payload_size = packet_payload_size_map.at(packet_type); + const size_t scale_factor = + packet_payload_size / output->data_type()->bytes(); + if (scale_factor == 0) { + ERR(ModelError, + "unsupported data type: ", input->data_type()->type_str()); + } + Dims in_dims[] = {input->strides().dims4(), input->shape().dims4()}; + for (auto &dim : in_dims) { + dim[3] = dim[3] / packet_payload_size / 2; + } + Dims out_dims[] = {output->strides().dims4(), output->shape().dims4(), + unit_out_dims}; + for (auto &dim : out_dims) { + dim[3] = dim[3] / scale_factor; + } + return function_name_string( + "read_packet", {std::to_string(remote_rank), vec_string(in_dims[0]), + vec_string(in_dims[1]), vec_string(out_dims[0]), + vec_string(out_dims[1]), vec_string(out_dims[2]), + std::to_string(num_warps), std::to_string(0), + packet_type, std::to_string(flag)}); +} + +std::vector ModelOpRecvPacket::impl_args([ + [maybe_unused]] const Json &config) const { + return {ModelOffset(write_tensors_[0]), ModelOffset(read_tensors_[1])}; +} + +Json ModelOpRecvPacket::default_config([ + [maybe_unused]] const ArchRef arch) const { + Json config; + if (arch->belongs_to(ARCH_ROCM)) { + config["PacketType"] = "mscclpp::LL8Packet"; + } else { + config["PacketType"] = "mscclpp::LL16Packet"; + } + config["NumWarps"] = 1; + config["SramBytes"] = 0; + const auto &shape = result_tensors_[0]->shape().dims4(); + size_t tile_x = 1; + size_t tile_y = 128; + config["Tile"] = {tile_x, tile_y}; + size_t num_tasks = shape[0] * shape[1]; + num_tasks *= (shape[2] + tile_x - 1) / tile_x; + num_tasks *= (shape[3] + tile_y - 1) / tile_y; + config["NumTasks"] = num_tasks; + return config; +} + +ModelOpRecvReduceSendPacket::ModelOpRecvReduceSendPacket( + ModelTensorRef input, ModelTensorRef output, int rank, + const std::vector &remote_ranks, int recv_tag, int output_tag, + uint32_t flag, std::vector &peer_output_refs, + ModelTensorRef scratch) + : ModelOp("RecvReduceSendPacket") { + check_null(input); + uint32_t n_remote_ranks = remote_ranks.size(); + // Need to check the scratch buffers are contiguous + if (scratch) { + if (scratch->buffer()->rank() != rank && + scratch->buffer()->rank() != -1) { + ERR(ModelError, "invalid buffer rank: ", scratch->buffer()->rank(), + ", expected: ", rank); + } + } else { + Dims scratch_shape(input->shape_bytes() * 2 * n_remote_ranks); + scratch = std::make_shared( + UINT8.ref(), std::make_shared(rank), scratch_shape); + } + if (!output) { + output = std::make_shared( + input->data_type(), std::make_shared(rank), + input->shape(), input->strides(), input->offsets(), + input->padded_shape()); + } + for (uint32_t i = 0; i < n_remote_ranks; ++i) { + scratch->buffer()->tag_recv(remote_ranks[i], recv_tag); + peer_output_refs[i]->buffer()->tag_recv(-1, output_tag); + } + ModelTensorRef result = std::make_shared(*output); + read_tensors_ = {input, scratch}; + write_tensors_ = {output}; + write_tensors_.insert(write_tensors_.end(), peer_output_refs.begin(), + peer_output_refs.end()); + result_tensors_ = {result}; + args_ = { + {"Flag", ModelOpArg(flag)}, + {"NPeers", ModelOpArg(n_remote_ranks)}, + {"Rank", ModelOpArg(rank)}, + }; + verify(); +} + +std::string ModelOpRecvReduceSendPacket::impl_name(const Json &config) const { + check_fields_config( + config, {"NumTasks", "NumWarps", "Tile", "SramBytes", "PacketType"}); + auto &input = read_tensors_[0]; + auto &output = write_tensors_[0]; + uint32_t flag = args_.at("Flag").value(); + uint32_t n_peers = args_.at("NPeers").value(); + int rank = args_.at("Rank").value(); + int num_warps = config.at("NumWarps"); + auto &tile_shape = config.at("Tile"); + std::string packet_type = config.at("PacketType"); + Dims unit_out_dims = {1, 1, tile_shape[0], tile_shape[1]}; + Dims in_dims[] = {input->strides().dims4(), input->shape().dims4()}; + Dims out_dims[] = {output->strides().dims4(), output->shape().dims4(), + unit_out_dims}; + return function_name_string( + "read_reduce_and_write", + {vec_string(in_dims[0]), vec_string(in_dims[1]), + vec_string(out_dims[0]), vec_string(out_dims[1]), + vec_string(out_dims[2]), std::to_string(num_warps), std::to_string(0), + std::to_string(n_peers), std::to_string(rank), packet_type, + input->data_type()->type_str(), std::to_string(flag)}); +} + +std::vector ModelOpRecvReduceSendPacket::impl_args([ + [maybe_unused]] const Json &config) const { + std::vector args = {write_tensors_[0], read_tensors_[0], + read_tensors_[1]}; + for (size_t i = 1; i < write_tensors_.size(); ++i) { + args.push_back(ModelOffset(write_tensors_[i])); + } + for (int i = write_tensors_.size() - 1; i < MAX_NUM_PEERS; ++i) { + args.push_back(0L); + } + return args; +} + +Json ModelOpRecvReduceSendPacket::default_config([ + [maybe_unused]] const ArchRef arch) const { + Json config; + if (arch->belongs_to(ARCH_ROCM)) { + config["PacketType"] = "mscclpp::LL8Packet"; + } else { + config["PacketType"] = "mscclpp::LL16Packet"; + } + config["NumWarps"] = 1; + config["SramBytes"] = 0; + const auto &shape = result_tensors_[0]->shape().dims4(); + size_t tile_x = 1; + size_t tile_y = 128; + config["Tile"] = {tile_x, tile_y}; + size_t num_tasks = shape[0] * shape[1]; + num_tasks *= (shape[2] + tile_x - 1) / tile_x; + num_tasks *= (shape[3] + tile_y - 1) / tile_y; + config["NumTasks"] = num_tasks; + return config; +} + +ModelOpRecvReduceSend::ModelOpRecvReduceSend(ModelTensorRef input, + ModelTensorRef output, int rank, + const std::vector &remote_ranks, + int recv_tag, int output_tag, + std::vector &peer_output_refs, + ModelTensorRef scratch) + : ModelOp("RecvReduceSend") { + check_null(input); + uint32_t n_remote_ranks = remote_ranks.size(); + // Need to check the scratch buffers are contiguous + if (scratch) { + if (scratch->buffer()->rank() != rank && + scratch->buffer()->rank() != -1) { + ERR(ModelError, "invalid buffer rank: ", scratch->buffer()->rank(), + ", expected: ", rank); + } + } else { + Dims scratch_shape(input->shape_bytes() * n_remote_ranks / + input->data_type()->bytes()); + scratch = std::make_shared( + input->data_type(), std::make_shared(rank), + scratch_shape); + } + if (!output) { + output = std::make_shared( + input->data_type(), std::make_shared(rank), + input->shape(), input->strides(), input->offsets(), + input->padded_shape()); + } + for (uint32_t i = 0; i < n_remote_ranks; ++i) { + scratch->buffer()->tag_recv(remote_ranks[i], recv_tag); + peer_output_refs[i]->buffer()->tag_recv(-1, output_tag); + } + ModelTensorRef result = std::make_shared(*output); + read_tensors_ = {input, scratch}; + write_tensors_ = {output}; + write_tensors_.insert(write_tensors_.end(), peer_output_refs.begin(), + peer_output_refs.end()); + result_tensors_ = {result}; + args_ = { + {"NPeers", ModelOpArg(n_remote_ranks)}, + {"Rank", ModelOpArg(rank)}, + }; + verify(); +} + +std::string ModelOpRecvReduceSend::impl_name(const Json &config) const { + check_fields_config(config, {"NumTasks", "NumWarps", "Tile", "SramBytes"}); + auto &input = read_tensors_[0]; + auto &output = write_tensors_[0]; + uint32_t n_peers = args_.at("NPeers").value(); + int rank = args_.at("Rank").value(); + int num_warps = config.at("NumWarps"); + auto &tile_shape = config.at("Tile"); + Dims unit_out_dims = {1, 1, tile_shape[0], tile_shape[1]}; + Dims in_dims[] = {input->strides().dims4(), input->shape().dims4()}; + Dims out_dims[] = {output->strides().dims4(), output->shape().dims4(), + unit_out_dims}; + return function_name_string( + "read_reduce_and_write", + {vec_string(in_dims[0]), vec_string(in_dims[1]), + vec_string(out_dims[0]), vec_string(out_dims[1]), + vec_string(out_dims[2]), std::to_string(num_warps), std::to_string(0), + std::to_string(n_peers), std::to_string(rank), + input->data_type()->type_str(), input->data_type()->type_str()}); +} + +std::vector ModelOpRecvReduceSend::impl_args([ + [maybe_unused]] const Json &config) const { + std::vector args = {write_tensors_[0], read_tensors_[0], + read_tensors_[1]}; + for (size_t i = 1; i < write_tensors_.size(); ++i) { + args.push_back(ModelOffset(write_tensors_[i])); + } + for (int i = write_tensors_.size() - 1; i < MAX_NUM_PEERS; ++i) { + args.push_back(0L); + } + return args; +} + +Json ModelOpRecvReduceSend::default_config([ + [maybe_unused]] const ArchRef arch) const { + Json config; + config["NumWarps"] = 1; + config["SramBytes"] = 0; + const auto &shape = result_tensors_[0]->shape().dims4(); + size_t tile_x = 1; + size_t tile_y = 128; + config["Tile"] = {tile_x, tile_y}; + size_t num_tasks = shape[0] * shape[1]; + num_tasks *= (shape[2] + tile_x - 1) / tile_x; + num_tasks *= (shape[3] + tile_y - 1) / tile_y; + config["NumTasks"] = num_tasks; + return config; +} + +ModelOpDeviceSync::ModelOpDeviceSync(ModelTensorRef input, int rank, + int rank_num, ModelTensorRef output) + : ModelOp("DeviceSync") { + check_null(input); + check_null(output); + ModelTensorRef result = std::make_shared(*output); + read_tensors_ = {input}; + write_tensors_ = {output}; + result_tensors_ = {result}; + args_ = {{"Rank", rank}, {"PeerNum", rank_num - 1}}; + verify(); +} + +std::string ModelOpDeviceSync::impl_name(const Json &config) const { + check_fields_config(config, + {"ChannelType", "NumTasks", "NumWarps", "SramBytes"}); + std::string channel_type = config["ChannelType"]; + if (channel_type != "Proxy" && channel_type != "SecondaryProxy" && + channel_type != "Sm") { + ERR(ModelError, "invalid channel type: ", channel_type); + } + int rank = args_.at("Rank").value(); + int peer_num = args_.at("PeerNum").value(); + return function_name_string( + "device_sync", {"comm::ChannelType::" + channel_type, + std::to_string(peer_num), std::to_string(rank)}); +} + +std::vector ModelOpDeviceSync::impl_args([ + [maybe_unused]] const Json &config) const { + return {}; +} + +Json ModelOpDeviceSync::default_config([[maybe_unused]] const ArchRef arch) const { return {{"ChannelType", "Proxy"}, {"NumTasks", 1}, {"NumWarps", 1}, @@ -177,4 +609,95 @@ Tensor Model::recv(Tensor output, int remote_rank, int tag, ->result_tensors()[0]; } +Tensor Model::send_packet(Tensor input, int remote_rank, int tag, int flag, + Tensor output, const std::string &name) { + tags_.insert(tag); + return impl_ + ->create_op(name, input.ref(), remote_rank, tag, + flag, output.ref()) + ->result_tensors()[0]; +} + +Tensor Model::recv_packet(Tensor output, int remote_rank, int tag, int flag, + Tensor scratch, const std::string &name) { + tags_.insert(tag); + return impl_ + ->create_op(name, output.ref(), remote_rank, tag, + flag, scratch.ref()) + ->result_tensors()[0]; +} + +Tensor Model::recv_reduce_send_packet(Tensor input, + const std::vector &remote_ranks, + int recv_tag, int output_tag, + unsigned int flag, Tensor output, + std::vector peer_outputs, + Tensor scratch, const std::string &name) { + tags_.insert(recv_tag); + tags_.insert(output_tag); + std::vector result_tensors; + std::vector scratch_refs; + std::vector outputs_refs; + int local_rank = this->rank(); + if (peer_outputs.empty()) { + size_t shape_bytes = input.ref()->shape_bytes(); + Dims output_shape(shape_bytes * 2); // For packet + std::transform(remote_ranks.begin(), remote_ranks.end(), + std::back_inserter(peer_outputs), [&](int remote_rank) { + return std::make_shared( + UINT8.ref(), + std::make_shared(remote_rank), + output_shape); + }); + } + std::transform(peer_outputs.begin(), peer_outputs.end(), + std::back_inserter(outputs_refs), + [](const Tensor &t) { return t.ref(); }); + return impl_ + ->create_op( + name, input.ref(), output.ref(), local_rank, remote_ranks, recv_tag, + output_tag, flag, outputs_refs, scratch.ref()) + ->result_tensors()[0]; +} + +Tensor Model::recv_reduce_send(Tensor input, + const std::vector &remote_ranks, + int recv_tag, int output_tag, Tensor output, + std::vector peer_outputs, Tensor scratch, + const std::string &name) { + tags_.insert(recv_tag); + tags_.insert(output_tag); + std::vector result_tensors; + std::vector scratch_refs; + std::vector outputs_refs; + int local_rank = this->rank(); + if (peer_outputs.empty()) { + std::transform(remote_ranks.begin(), remote_ranks.end(), + std::back_inserter(peer_outputs), [&](int remote_rank) { + return std::make_shared( + input.ref()->data_type(), + std::make_shared(remote_rank), + input.shape(), input.strides(), input.offsets(), + input.padded_shape()); + }); + } + std::transform(peer_outputs.begin(), peer_outputs.end(), + std::back_inserter(outputs_refs), + [](const Tensor &t) { return t.ref(); }); + return impl_ + ->create_op( + name, input.ref(), output.ref(), local_rank, remote_ranks, recv_tag, + output_tag, outputs_refs, scratch.ref()) + ->result_tensors()[0]; +} + +Tensor Model::device_sync(Tensor input, int rank, int rank_num, + const std::string &name) { + Tensor output = this->identity(input); + return impl_ + ->create_op(name, input.ref(), rank, rank_num, + output.ref()) + ->result_tensors()[0]; +} + } // namespace ark diff --git a/ark/ops/ops_communication.hpp b/ark/ops/ops_communication.hpp index 83109f90f..23f3b84af 100644 --- a/ark/ops/ops_communication.hpp +++ b/ark/ops/ops_communication.hpp @@ -45,6 +45,77 @@ class ModelOpRecv : public ModelOp { Json default_config(const ArchRef arch = ARCH_ANY) const override; }; +class ModelOpSendPacket : public ModelOp { + public: + ModelOpSendPacket() = default; + ModelOpSendPacket(ModelTensorRef input, int remote_rank, int tag, + uint32_t flag, ModelTensorRef output); + + std::string impl_name(const Json &config) const override; + + std::vector impl_args(const Json &config) const override; + + Json default_config(const ArchRef arch = ARCH_ANY) const override; +}; + +class ModelOpRecvPacket : public ModelOp { + public: + ModelOpRecvPacket() = default; + ModelOpRecvPacket(ModelTensorRef output, int remote_rank, int tag, + uint32_t flag, ModelTensorRef scratch); + + std::string impl_name(const Json &config) const override; + + std::vector impl_args(const Json &config) const override; + + Json default_config(const ArchRef arch = ARCH_ANY) const override; +}; + +class ModelOpRecvReduceSendPacket : public ModelOp { + public: + ModelOpRecvReduceSendPacket() = default; + ModelOpRecvReduceSendPacket(ModelTensorRef input, ModelTensorRef output, + int rank, const std::vector &remote_rank, + int recv_tag, int output_tag, uint32_t flag, + std::vector &peer_output_refs, + ModelTensorRef scratch); + + std::string impl_name(const Json &config) const override; + + std::vector impl_args(const Json &config) const override; + + Json default_config(const ArchRef arch = ARCH_ANY) const override; +}; + +class ModelOpRecvReduceSend : public ModelOp { + public: + ModelOpRecvReduceSend() = default; + ModelOpRecvReduceSend(ModelTensorRef input, ModelTensorRef output, int rank, + const std::vector &remote_rank, int recv_tag, + int output_tag, + std::vector &peer_output_refs, + ModelTensorRef scratch); + + std::string impl_name(const Json &config) const override; + + std::vector impl_args(const Json &config) const override; + + Json default_config(const ArchRef arch = ARCH_ANY) const override; +}; + + +class ModelOpDeviceSync : public ModelOp { + public: + ModelOpDeviceSync() = default; + ModelOpDeviceSync(ModelTensorRef input, int rank, int rank_num, + ModelTensorRef output); + + std::string impl_name(const Json &config) const override; + + std::vector impl_args(const Json &config) const override; + + Json default_config(const ArchRef arch = ARCH_ANY) const override; +}; } // namespace ark #endif // ARK_OPS_COMMUNICATION_HPP_ diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index f2a66f088..dec310331 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -7,6 +7,7 @@ #include "ark/executor.hpp" #include "ark/planner.hpp" #include "half.h" +#include "model/model_buffer.hpp" #include "ops_test_common.hpp" ark::unittest::State test_communication_send_recv_unidir() { @@ -16,8 +17,8 @@ ark::unittest::State test_communication_send_recv_unidir() { ark::Model model(gpu_id, 2); ark::Tensor tns = model.tensor({1024}, ark::FP16); if (gpu_id == 0) { - tns = model.send(tns, 1, 0); - model.send_done(tns); + ark::Tensor out_tns = model.send(tns, 1, 0); + model.send_done(out_tns); } if (gpu_id == 1) { tns = model.recv(tns, 0, 0); @@ -210,6 +211,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { config["NumTasks"] = 1; config["NumWarps"] = 1; config["SramBytes"] = 0; + config["Wait"] = true; } return config.dump(); }; @@ -307,10 +309,168 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { return ark::unittest::SUCCESS; } +ark::unittest::State test_communication_send_packet() { + // send from gpu 0 to gpu 1 + for (int gpu_id = 0; gpu_id < 2; ++gpu_id) { + ark::unittest::spawn_process([gpu_id]() { + ark::Model model(gpu_id, 2); + ark::Tensor tns_data = model.tensor({1024}, ark::FP16); + if (gpu_id == 0) { + model.send_packet(tns_data, 1, 0, 1); + } + if (gpu_id == 1) { + tns_data = model.recv_packet(tns_data, 0, 0, 1); + } + + ark::DefaultExecutor exe(model, gpu_id); + exe.compile(); + + if (gpu_id == 0) { + std::vector data(1024); + std::iota(data.begin(), data.end(), 1.0f); + exe.tensor_write(tns_data, data); + } + + exe.barrier(); + exe.launch(); + exe.run(1); + exe.stop(); + exe.barrier(); + + if (gpu_id == 1) { + std::vector data(1024); + exe.tensor_read(tns_data, data); + for (int i = 0; i < 1024; ++i) { + UNITTEST_EQ(data[i], ark::half_t(i + 1)); + } + } + return ark::unittest::SUCCESS; + }); + } + + ark::unittest::wait_all_processes(); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_communication_send_recv_reduce_packet() { + for (int gpu_id = 0; gpu_id < 2; ++gpu_id) { + ark::unittest::spawn_process([gpu_id]() { + ark::Model model(gpu_id, 2); + ark::Tensor tns_data = model.tensor({1024}, ark::FP16); + std::vector shard_tensors = model.sharding(tns_data, 0, 512); + + int peer_gpu_id = (gpu_id + 1) % 2; + model.send_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 0, 1); + model.recv_reduce_send_packet(shard_tensors[gpu_id], {peer_gpu_id}, + 0, 1, 1, shard_tensors[gpu_id]); + model.recv_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 1, 1); + + ark::DefaultExecutor exe(model, gpu_id); + exe.compile(); + + std::vector data(1024); + std::iota(data.begin(), data.end(), 1.0f); + exe.tensor_write(tns_data, data); + + exe.barrier(); + exe.launch(); + exe.run(1); + exe.stop(); + exe.barrier(); + + exe.tensor_read(tns_data, data); + for (int i = 0; i < 1024; ++i) { + UNITTEST_EQ(data[i], ark::half_t((i + 1) * 2)); + } + return ark::unittest::SUCCESS; + }); + } + + ark::unittest::wait_all_processes(); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_communication_send_recv_reduce() { + auto config_rule = [](const std::string op_str, const std::string) { + auto op = nlohmann::json::parse(op_str); + nlohmann::json config; + if (op.at("Type") == "Send") { + config["ChannelType"] = "Sm"; + config["Signal"] = false; + config["Tile"] = {1, 256}; + config["NumTasks"] = 4; + config["NumWarps"] = 4; + config["SramBytes"] = 0; + } + else if (op.at("Type") == "DeviceSync") { + config["ChannelType"] = "Sm"; + config["NumTasks"] = 1; + config["NumWarps"] = 1; + config["SramBytes"] = 0; + } else if (op.at("Type") == "Recv") { + config["ChannelType"] = "Sm"; + config["NumTasks"] = 1; + config["NumWarps"] = 1; + config["SramBytes"] = 0; + config["Wait"] = false; + } + return config.dump(); + }; + for (int gpu_id = 0; gpu_id < 2; ++gpu_id) { + ark::unittest::spawn_process([gpu_id, config_rule]() { + ark::Model model(gpu_id, 2); + ark::Tensor tns_data = model.tensor({1024}, ark::FP16); + std::vector shard_tensors = + model.sharding(tns_data, 0, 512); + + int peer_gpu_id = (gpu_id + 1) % 2; + ark::Tensor remote_scratch = + model.tensor({512}, ark::FP16, {}, {}, {}, peer_gpu_id); + ark::Tensor out = model.send(shard_tensors[peer_gpu_id], + peer_gpu_id, 0, remote_scratch); + out = model.device_sync(out, gpu_id, 2); + ark::Tensor reduced = model.identity(shard_tensors[gpu_id], {out}); + reduced = + model.recv_reduce_send(reduced, {peer_gpu_id}, 0, 1, reduced); + model.recv(shard_tensors[peer_gpu_id], peer_gpu_id, 1); + model.device_sync(reduced, gpu_id, 2); + + ark::Planner planner(model, gpu_id); + planner.install_config_rule(config_rule); + ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan()); + exe.compile(); + + std::vector data(1024); + std::iota(data.begin(), data.end(), 1.0f); + exe.tensor_write(tns_data, data); + + exe.barrier(); + exe.launch(); + exe.run(1); + exe.stop(); + exe.barrier(); + + exe.tensor_read(tns_data, data); + if (gpu_id == 1) { + for (int i = 0; i < 1024; ++i) { + UNITTEST_EQ(data[i], ark::half_t((i + 1) * 2)); + } + } + return ark::unittest::SUCCESS; + }); + } + + ark::unittest::wait_all_processes(); + return ark::unittest::SUCCESS; +} + int main() { ark::init(); UNITTEST(test_communication_send_recv_unidir); UNITTEST(test_communication_send_recv_bidir); UNITTEST(test_communication_send_recv_bidir_sm); + UNITTEST(test_communication_send_packet); + UNITTEST(test_communication_send_recv_reduce_packet); + UNITTEST(test_communication_send_recv_reduce); return ark::unittest::SUCCESS; } diff --git a/ark/ops/ops_tensor.cpp b/ark/ops/ops_tensor.cpp index 0279ab311..8831e0fcf 100644 --- a/ark/ops/ops_tensor.cpp +++ b/ark/ops/ops_tensor.cpp @@ -3,6 +3,7 @@ #include "ops_tensor.hpp" +#include "logging.hpp" #include "ops_common.hpp" namespace ark { @@ -25,10 +26,19 @@ ModelOpTensor::ModelOpTensor(ModelBufferRef buffer, const Dims &shape, Tensor Model::tensor(const Dims &shape, const DataType &data_type, const Dims &strides, const Dims &offsets, - const Dims &padded_shape, const std::string &name) { + const Dims &padded_shape, int rank, + const std::string &name) { + if (rank != -1) { + if (rank == this->rank()) { + rank = -1; + } else if (rank < 0 || rank >= this->world_size()) { + ERR(ModelError, "Invalid rank %d", rank); + } + } return impl_ - ->create_op(name, nullptr, shape, data_type.ref(), - strides, offsets, padded_shape) + ->create_op(name, std::make_shared(rank), + shape, data_type.ref(), strides, offsets, + padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 3f22076e0..0e8f215ae 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -36,8 +36,11 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data, - bool print_on_error, int rank, int world_size) { - DefaultExecutor exe(model); + bool print_on_error, int rank, int world_size, + Planner::ConfigRule config_rule) { + Planner planner(model, rank); + planner.install_config_rule(config_rule); + Executor exe(rank, world_size, rank, "Executor", planner.plan()); exe.compile(); std::vector>> inputs_data_storages; diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index 01e97dbb1..e5b6c4f8e 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -10,6 +10,7 @@ #include "ark/model.hpp" #include "ark/model_ref.hpp" +#include "ark/planner.hpp" #include "ark/random.hpp" #include "bfloat16.h" #include "half.h" @@ -171,7 +172,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model, OpsTestBaseline baseline, const std::vector &inputs_data = {}, bool print_on_error = false, int rank = 0, - int world_size = 1); + int world_size = 1, + Planner::ConfigRule config_rule = nullptr); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); diff --git a/ark/ops_old/ops_all_reduce.cc b/ark/ops_old/ops_all_reduce.cc deleted file mode 100644 index 64db18c75..000000000 --- a/ark/ops_old/ops_all_reduce.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include -#include - -#include "logging.hpp" -#include "math_utils.h" -#include "model.h" -#include "ops_common.h" -constexpr int MSCCLPP_PACKET_SIZE = sizeof(mscclpp::LLPacket); - -namespace ark { - -Tensor *Model::all_reduce(Tensor *input, int gpu_id, int gpu_num, - Tensor *output, const std::string &) { - assert(input != nullptr); - if (output != nullptr) { - ERR(ModelError, "all_reduce output is not supported"); - } - if (!input->is_sequential()) { - LOG(WARN, - "all_reduce may not work correctly if the input tensor is " - "not contiguous"); - } - int base = this->impl->next_eid; - Tensor *prev_recv = nullptr; - Tensor *cumulate = input; - for (int i = 1; i < gpu_num; i++) { - int gpu_dst = (gpu_id + i) % gpu_num; - int gpu_src = (gpu_id + gpu_num - i) % gpu_num; - Tensor *send_data; - if (prev_recv != nullptr) { - send_data = this->identity(input, {prev_recv}); - } else { - send_data = input; - } - send_data = this->send(send_data, base + gpu_id, gpu_dst); - Tensor *send_done_tensor = - this->send_done(send_data, base + gpu_id, gpu_dst); - Tensor *recv_buf = this->tensor(input->shape, input->type); - recv_buf = this->identity(recv_buf, {send_done_tensor}); - Tensor *recv = this->recv(base + gpu_src, gpu_src, 0, recv_buf); - prev_recv = recv; - cumulate = this->add(cumulate, recv); - } - this->impl->next_eid += gpu_num; - return cumulate; -} - -Tensor *Model::local_all_reduce(Tensor *input, int gpu_id, int gpu_num, - const std::string &) { - assert(input != nullptr); - if (!input->is_sequential()) { - LOG(WARN, - "all_reduce may not work correctly if the input tensor is " - "not contiguous"); - } - ark::Dims ori_shape = input->shape; - Tensor *input_reshaped = this->reshape(input, {input->shape.size()}); - Tensor *out = this->local_reduce_scatter(input_reshaped, gpu_id, gpu_num); - Tensor *res = this->local_all_gather(out, gpu_id, gpu_num); - return this->reshape(res, ori_shape); -} - -Tensor *Model::local_all_reduce_packet(Tensor *input, int gpu_id, int gpu_num, - const std::string &) { - assert(input != nullptr); - // We only support out-of-place all_reduce - if (input->ndims() > 1) { - ERR(ModelError, "supports only 1D input"); - } - if (!input->is_sequential()) { - LOG(WARN, - "all_reduce may not work correctly if the input tensor is " - "not contiguous"); - } - Tensor *out = this->tensor(input->shape, input->type); - // only half of the packets are used to store data - const int num_packets = input->shape_bytes() / (MSCCLPP_PACKET_SIZE / 2); - const int scratch_nelems = num_packets * - 2 /*oringinal data & reduced result*/ * - 2 /*double buffer*/; - Dims scratch_shape = { - static_cast(scratch_nelems * MSCCLPP_PACKET_SIZE)}; - Tensor *scratch = this->tensor(scratch_shape, UINT8); - int npeer = gpu_num - 1; - std::vector outputs; - std::vector remote_scratches; - size_t nelems_per_rank = - input->shape_bytes() / input->type_bytes() / gpu_num; - size_t npackets_per_rank = num_packets / gpu_num; - int flag = this->impl->reduce_packet_flag; - size_t scratch_base_offset = - (flag & 1) ? 0 : num_packets * MSCCLPP_PACKET_SIZE; - size_t scratch_result_offset = (flag & 1) - ? 2 * num_packets * MSCCLPP_PACKET_SIZE - : 3 * num_packets * MSCCLPP_PACKET_SIZE; - int id = this->impl->next_eid; - std::vector sharded_inputs = - this->sharding(input, 0, nelems_per_rank); - std::vector sharded_outputs = - this->sharding(out, 0, nelems_per_rank); - for (int i = 0; i < npeer; ++i) { - int remote_rank = i < gpu_id ? i : i + 1; - Tensor *remote_scratch = this->tensor(scratch_shape, UINT8); - remote_scratches.push_back(remote_scratch); - Tensor *out = - this->put_packet(sharded_inputs[remote_rank], scratch, - remote_scratch, id, gpu_id, remote_rank, - scratch_base_offset + npackets_per_rank * gpu_id * - MSCCLPP_PACKET_SIZE, - flag); - outputs.push_back(out); - } - Tensor *input_sharded = this->identity(sharded_inputs[gpu_id], outputs); - // This op should reduce from the scratch buffer and write to the remote. - Tensor *out_stage2 = this->reduce_and_write_packet( - input_sharded, scratch, sharded_outputs[gpu_id], remote_scratches, id, - gpu_id, npeer, nelems_per_rank, scratch_base_offset, - scratch_result_offset, flag); - // Get the result from the scratch buffer. - Tensor *scratch_stage3 = this->identity(scratch, {out_stage2}); - outputs.clear(); - for (int i = 0; i < npeer; ++i) { - int remote_rank = i < gpu_id ? i : i + 1; - size_t dst_offset = nelems_per_rank * remote_rank * input->type_bytes(); - size_t src_offset = scratch_result_offset + npackets_per_rank * - remote_rank * - MSCCLPP_PACKET_SIZE; - Tensor *res = this->get_packet(scratch_stage3, out, src_offset, - dst_offset, npackets_per_rank, flag); - outputs.push_back(res); - } - this->impl->next_eid += 1; - this->impl->reduce_packet_flag += 1; - return this->identity(out, outputs); -} - -} // namespace ark diff --git a/examples/tutorial/allreduce-packet/plan_gpu0.json b/examples/tutorial/allreduce-packet/plan_gpu0.json new file mode 100644 index 000000000..91f072bf3 --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu0.json @@ -0,0 +1,569 @@ +{ + "Rank": 0, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":0} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu1.json b/examples/tutorial/allreduce-packet/plan_gpu1.json new file mode 100644 index 000000000..6b5e8eac1 --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu1.json @@ -0,0 +1,569 @@ +{ + "Rank": 1, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[28672],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu2.json b/examples/tutorial/allreduce-packet/plan_gpu2.json new file mode 100644 index 000000000..ad6d99d22 --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu2.json @@ -0,0 +1,569 @@ +{ + "Rank": 2, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[32768],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":2} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu3.json b/examples/tutorial/allreduce-packet/plan_gpu3.json new file mode 100644 index 000000000..4aeb40cbd --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu3.json @@ -0,0 +1,569 @@ +{ + "Rank": 3, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[36864],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":3} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu4.json b/examples/tutorial/allreduce-packet/plan_gpu4.json new file mode 100644 index 000000000..cde728c7b --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu4.json @@ -0,0 +1,569 @@ +{ + "Rank": 4, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[40960],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":4} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu5.json b/examples/tutorial/allreduce-packet/plan_gpu5.json new file mode 100644 index 000000000..1bd3ba0c3 --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu5.json @@ -0,0 +1,569 @@ +{ + "Rank": 5, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[45056],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":5} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu6.json b/examples/tutorial/allreduce-packet/plan_gpu6.json new file mode 100644 index 000000000..5f608d9b9 --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu6.json @@ -0,0 +1,569 @@ +{ + "Rank": 6, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[49152],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":6} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":7,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-packet/plan_gpu7.json b/examples/tutorial/allreduce-packet/plan_gpu7.json new file mode 100644 index 000000000..40919ee24 --- /dev/null +++ b/examples/tutorial/allreduce-packet/plan_gpu7.json @@ -0,0 +1,569 @@ +{ + "Rank": 7, + "WorldSize": 8, + "NumProcessors": 8, + "NumWarpsPerProcessor": 16, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":3,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":1,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "SendPacket", + "Name": "send_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"UINT8","Shape":[4096],"Strides":[57344],"Offsets":[53248],"PaddedShape":[4096],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSendPacket", + "Name": "recv_reduce_send_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":33,"DataType":"UINT8","Shape":[28672],"Strides":[57344],"Offsets":[28672],"PaddedShape":[28672],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":35,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":36,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":37,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":38,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":13,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":39,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":14,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":40,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":15,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[7168],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1}, + "NPeers": {"UINT32":7}, + "Rank": {"INT":7} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":17,"Rank":0,"SendTags":[],"RecvTags":[]}}, + {"Id":42,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[[0,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":43,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":19,"Rank":1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[[1,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[1024],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":21,"Rank":2,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[[2,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[2048],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":23,"Rank":3,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[[3,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[3072],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":25,"Rank":4,"SendTags":[],"RecvTags":[]}}, + {"Id":54,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[[4,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[4096],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":27,"Rank":5,"SendTags":[],"RecvTags":[]}}, + {"Id":57,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[[5,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[5120],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 16, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvPacket", + "Name": "recv_packet_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1024],"Strides":[1024],"Offsets":[0],"PaddedShape":[1024],"Buffer":{"Id":29,"Rank":6,"SendTags":[],"RecvTags":[]}}, + {"Id":60,"DataType":"UINT8","Shape":[4096],"Strides":[8192],"Offsets":[4096],"PaddedShape":[4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1024],"Strides":[8192],"Offsets":[6144],"PaddedShape":[1024],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Flag": {"UINT32":1} + }, + "Config": { + "PacketType": "mscclpp::LL8Packet", + "NumWarps": 16, + "SramBytes": 0, + "Tile": [1,1024], + "NumTasks": 1 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,8], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":8,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [1,2], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [2,3], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":10,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [3,4], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":11,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [4,5], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":12,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [5,6], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":13,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [6,7], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,1],"Granularity":1}, + {"TaskId":14,"TaskRange":[0,1],"Granularity":1} + ] + }, + { + "ProcessorRange": [7,8], + "WarpRange": [0,16], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu0.json b/examples/tutorial/allreduce-sm/plan_gpu0.json new file mode 100644 index 000000000..7cbda9488 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu0.json @@ -0,0 +1,625 @@ +{ + "Rank": 0, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":0} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":0,"SendTags":[],"RecvTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":0} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":0} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu1.json b/examples/tutorial/allreduce-sm/plan_gpu1.json new file mode 100644 index 000000000..52f827212 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu1.json @@ -0,0 +1,625 @@ +{ + "Rank": 1, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":1} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":1,"SendTags":[],"RecvTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":1} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":1} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu2.json b/examples/tutorial/allreduce-sm/plan_gpu2.json new file mode 100644 index 000000000..747e88719 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu2.json @@ -0,0 +1,625 @@ +{ + "Rank": 2, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":2} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":2,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":2} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":2} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu3.json b/examples/tutorial/allreduce-sm/plan_gpu3.json new file mode 100644 index 000000000..dd319c362 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu3.json @@ -0,0 +1,625 @@ +{ + "Rank": 3, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":3} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":3,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":3} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":3} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu4.json b/examples/tutorial/allreduce-sm/plan_gpu4.json new file mode 100644 index 000000000..7c0044303 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu4.json @@ -0,0 +1,625 @@ +{ + "Rank": 4, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":4} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":4,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":4} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[5,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[5,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":4} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu5.json b/examples/tutorial/allreduce-sm/plan_gpu5.json new file mode 100644 index 000000000..52dd93103 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu5.json @@ -0,0 +1,625 @@ +{ + "Rank": 5, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":5} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":5,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":5} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[6,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[6,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":5} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu6.json b/examples/tutorial/allreduce-sm/plan_gpu6.json new file mode 100644 index 000000000..6481ffa56 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu6.json @@ -0,0 +1,625 @@ +{ + "Rank": 6, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":6} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":6,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":7,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":6} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":7,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[7,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[7,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":6} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/allreduce-sm/plan_gpu7.json b/examples/tutorial/allreduce-sm/plan_gpu7.json new file mode 100644 index 000000000..03251e825 --- /dev/null +++ b/examples/tutorial/allreduce-sm/plan_gpu7.json @@ -0,0 +1,625 @@ +{ + "Rank": 7, + "WorldSize": 8, + "NumProcessors": 56, + "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":2,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 1, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":3,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 2, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":4,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 3, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":5,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 4, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":6,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 5, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":7,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 6, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "Send", + "Name": "send_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1048576],"Strides":[7340032],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":8,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483648]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 256, + "NumWarps": 8, + "Signal": false, + "SramBytes": 0, + "Tile": [1,4096] + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync", + "IsVirtual": false, + "ReadTensors": [ + {"Id":1,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":7} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 8, + "SramBytes": 0, + "Ops": [ + { + "Type": "RecvReduceSend", + "Name": "recv_reduce_send", + "IsVirtual": false, + "ReadTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}}, + {"Id":50,"DataType":"FP16","Shape":[7340032],"Strides":[7340032],"Offsets":[0],"PaddedShape":[7340032],"Buffer":{"Id":16,"Rank":7,"SendTags":[],"RecvTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]]}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}}, + {"Id":43,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":9,"Rank":0,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":44,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":10,"Rank":1,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":45,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":11,"Rank":2,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":46,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":12,"Rank":3,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":47,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":13,"Rank":4,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":48,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":14,"Rank":5,"SendTags":[],"RecvTags":[[-1,-2147483647]]}}, + {"Id":49,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":15,"Rank":6,"SendTags":[],"RecvTags":[[-1,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[7340032],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": { + "NPeers": {"UINT32":7}, + "Rank": {"INT":7} + }, + "Config": { + "NumWarps": 8, + "SramBytes": 0, + "Tile": [1,4096], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv", + "IsVirtual": false, + "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":17,"Rank":0,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[0],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":18,"Rank":1,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":5,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[1048576],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":19,"Rank":2,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[2097152],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":20,"Rank":3,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[3145728],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":21,"Rank":4,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[4194304],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":22,"Rank":5,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[5242880],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 15, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Recv", + "Name": "recv_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":23,"Rank":6,"SendTags":[[-1,-2147483647]],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP16","Shape":[1048576],"Strides":[8388608],"Offsets":[6291456],"PaddedShape":[1048576],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": {}, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0, + "Wait": false + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "DeviceSync", + "Name": "device_sync_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":0,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "WriteTensors": [ + {"Id":66,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "ResultTensors": [ + {"Id":67,"DataType":"FP16","Shape":[8388608],"Strides":[8388608],"Offsets":[0],"PaddedShape":[8388608],"Buffer":{"Id":0,"Rank":-1,"SendTags":[[0,-2147483648],[1,-2147483648],[2,-2147483648],[3,-2147483648],[4,-2147483648],[5,-2147483648],[6,-2147483648]],"RecvTags":[[0,-2147483647],[1,-2147483647],[2,-2147483647],[3,-2147483647],[4,-2147483647],[5,-2147483647],[6,-2147483647]]}} + ], + "Args": { + "PeerNum": {"INT":7}, + "Rank": {"INT":7} + }, + "Config": { + "ChannelType": "Sm", + "NumTasks": 1, + "NumWarps": 1, + "SramBytes": 0 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,8], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [8,16], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [16,24], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [24,32], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [32,40], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [40,48], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} + ] + }, + { + "ProcessorRange": [48,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,56], + "WarpRange": [0,8], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,56], + "ResourceGroups": [ + { + "ProcessorRange": [0,1], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,1],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/examples/tutorial/multi_gpu_plan.py b/examples/tutorial/multi_gpu_plan.py new file mode 100644 index 000000000..7fff6e26b --- /dev/null +++ b/examples/tutorial/multi_gpu_plan.py @@ -0,0 +1,86 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import argparse +import ark +import numpy as np +import multiprocessing +import os +from pathlib import Path +import time + +world_size = 8 + +tensor_len = 8192 * 1024 +tensor_size = tensor_len * ark.fp16.element_size() + + +def allreduce_test_function(rank, np_inputs, plan_path, ground_truth): + print("rank:", rank) + ark.set_rank(rank) + ark.set_world_size(world_size) + + # Run `plan_path` file if exists + if not Path(plan_path).is_file(): + print(f"File {plan_path} does not exist. Exiting...") + return + + input = ark.tensor([tensor_len], ark.fp16) + output = ark.all_reduce(input, rank, world_size, input) + with ark.Runtime.get_runtime() as rt: + plan = ark.Plan.from_file(plan_path) + rt.launch(plan=plan, device_id=rank) + input.from_numpy(np_inputs) + rt.run() + # Copy data back to host and calculate errors + host_output = output.to_numpy() + np.testing.assert_allclose( + host_output, ground_truth, rtol=1e-2, atol=1e-2 + ) + + rt.barrier() + # Measure throughput + iter = 10000 + ts = time.time() + rt.run(iter) + elapsed_ms = (time.time() - ts) * 1e3 + print( + f"Current plan elapsed time: total {elapsed_ms:.6f} ms, {elapsed_ms/iter:.6f} ms/iter" + ) + + +def allreduce_test(plan_path: str, plan_prefix: str): + num_processes = world_size # number of processes + processes = [] + np_inputs = [] + for i in range(world_size): + np_inputs.append(np.random.uniform(0, 1, tensor_len).astype(np.float16)) + ground_truth = np.sum(np_inputs, axis=0) + + # Create a process for each GPU + for i in range(num_processes): + process = multiprocessing.Process( + target=allreduce_test_function, + args=( + i, + np_inputs[i], + os.path.join(plan_path, plan_prefix + str(i) + ".json"), + ground_truth, + ), + ) + process.start() + processes.append(process) + + # Join the processes after completion + for process in processes: + process.join() + + +if __name__ == "__main__": + ark.init() + parser = argparse.ArgumentParser() + parser.add_argument("--plan_dir", type=str, default="examples/tutorial") + parser.add_argument("--plan_prefix", type=str, default="plan_gpu") + + args = parser.parse_args() + allreduce_test(args.plan_dir, args.plan_prefix) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index e8dc7e6c4..b1d0f7873 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -51,45 +51,6 @@ def set_world_size(world_size): uint8, byte, ) -from .ops import ( - tensor, - parameter, - reshape, - identity, - sharding, - reduce_sum, - reduce_mean, - reduce_max, - layernorm, - softmax, - transpose, - matmul, - # im2col, - exp, - sqrt, - rsqrt, - rope, - relu, - gelu, - sigmoid, - add, - sub, - mul, - div, - # send, - # send_done, - # recv, - # all_gather, - # local_all_gather, - # local_reduce_scatter, - # all_reduce, - # local_all_reduce, - # local_all_reduce_packet, - embedding, - cast, - constant, - ones, - zeros, -) +from .ops import * from .planner import * from .error import * diff --git a/python/ark/ops.py b/python/ark/ops.py index bc1c3ed13..484e248ca 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -18,6 +18,7 @@ def _tensor( strides: Iterable[int] = [], offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], + rank: int = -1, name: str = "", ) -> Tensor: if not _is_list_or_tuple(shape): @@ -42,6 +43,7 @@ def _tensor( Dims(strides), Dims(offsets), Dims(padded_shape), + rank, name, ) @@ -462,6 +464,7 @@ def tensor( strides: Iterable[int] = [], offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], + rank: int = -1, name: str = "", ) -> Tensor: """ @@ -470,7 +473,9 @@ def tensor( tensor = ark.tensor([1, 2, 3, 4], dtype=ark.fp32) tensor = ark.tensor([1, 2], dtype=ark.fp16) """ - return Tensor(_tensor(shape, dtype, strides, offsets, padded_shape, name)) + return Tensor( + _tensor(shape, dtype, strides, offsets, padded_shape, rank, name) + ) def transpose( @@ -577,6 +582,69 @@ def zeros( ) +def all_reduce( + input: Tensor, + rank: int, + world_size: int, + output: Tensor = NullTensor, + name: str = "all_reduce", +) -> Tensor: + """ + Perform an all-reduce operation on the input tensor. + + Args: + input (Tensor): The input tensor to be reduced. + rank (int): The rank of the current process. + world_size (int): The total number of processes. + output (Tensor, optional): The output tensor. If provided, the result + will be stored in this tensor. Defaults to NullTensor. + name (str, optional): The name of the operation. Defaults to + "all_reduce". + + Returns: + Tensor: The reduced tensor. + """ + if output is not NullTensor: + output = output._tensor + _tensor = Model.get_model().all_reduce( + input._tensor, rank, world_size, output, name + ) + return Tensor(_tensor) + + +__all__ = [ + "tensor", + "parameter", + "reshape", + "identity", + "sharding", + "reduce_sum", + "reduce_mean", + "reduce_max", + "layernorm", + "softmax", + "transpose", + "matmul", + "exp", + "sqrt", + "rsqrt", + "rope", + "relu", + "gelu", + "sigmoid", + "add", + "sub", + "mul", + "div", + "all_reduce", + "embedding", + "cast", + "constant", + "ones", + "zeros", +] + + # def im2col( # input: Tensor, # kernel_height: int, @@ -785,30 +853,6 @@ def zeros( # return Tensor(_tensor) -# def all_reduce( -# input: Tensor, -# rank: int, -# world_size: int, -# output: Tensor = NullTensor, -# name: str = "all_reduce", -# ) -> Tensor: -# """ -# Performs an all-reduce operator across all GPUs, aggregating the -# input tensors. Takes the `input` tensor, the current GPU's -# `rank`, and the total number of GPUs `world_size`. -# Usage: -# ark.init(rank, world_size) -# input_tensor = ark.tensor([tensor_len], ark.fp16) -# allreduce_result = ark.all_reduce(input_tensor, rank, world_size) -# """ -# if output is not NullTensor: -# output = output._tensor -# _tensor = Model.get_model().all_reduce( -# input._tensor, rank, world_size, output, name -# ) -# return Tensor(_tensor) - - # def local_all_reduce( # input: Tensor, # rank: int, diff --git a/python/ark/runtime.py b/python/ark/runtime.py index d29b036ca..ab844708e 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -119,6 +119,15 @@ def run(self, iter=1, non_blocking=False): if not non_blocking: self.wait() + def barrier(self): + """ + Barrier for all ranks. + """ + if self.state != Runtime.State.LaunchedNotRunning: + logging.error("ARK runtime is not launched") + raise RuntimeError("ARK runtime is not launched") + self.executor.barrier() + def wait(self): """ Wait for the kernel to finish. diff --git a/python/model_py.cpp b/python/model_py.cpp index 46c70a7d3..c224a3d5b 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -104,9 +104,17 @@ void register_model(py::module &m) { const std::string &>(&ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), py::arg("name")) - .def("tensor", &ark::Model::tensor, py::arg("shape"), - py::arg("data_type"), py::arg("strides"), py::arg("offsets"), - py::arg("padded_shape"), py::arg("name")) + .def("tensor", + py::overload_cast( + &ark::Model::tensor), + py::arg("shape"), py::arg("data_type"), py::arg("strides"), + py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"), + py::arg("name")) .def("transpose", &ark::Model::transpose, py::arg("input"), - py::arg("permutation"), py::arg("output"), py::arg("name")); + py::arg("permutation"), py::arg("output"), py::arg("name")) + .def("all_reduce", &ark::Model::all_reduce, py::arg("input"), + py::arg("rank"), py::arg("world_size"), py::arg("output"), + py::arg("name")); } diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 75916d962..12ae74298 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -11,7 +11,7 @@ include(FetchContent) FetchContent_Declare( mscclpp GIT_REPOSITORY https://github.com/microsoft/mscclpp - GIT_TAG v0.5.1 + GIT_TAG v0.5.2 SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp ) set(BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE)