From f7fb83c79b9e24e2ca0836030e148f88c8a44829 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Mon, 25 Mar 2024 20:33:13 -0700
Subject: [PATCH 01/52] add cudaDeviceSynchronize for NCCL

---
 allgather.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/allgather.cu b/allgather.cu
index cf1a882..5953041 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -158,6 +158,7 @@ int main(int argc, char *argv[]) {
             MPI_CHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
             NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
+	    cudaDeviceSynchronize();
             #elif defined(USE_RCCL)
 	    // TODO: fix later
             rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
@@ -177,6 +178,7 @@ int main(int argc, char *argv[]) {
             MPI_CHECK(MPI_Wait(&request, &status));
             #elif defined(USE_NCCL)
             NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
+	    cudaDeviceSynchronize();
             #elif defined(USE_RCCL)
             // TODO: fix later
             rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);

From 453a397d2c609f41eac92012e9564ef6e1fa8ed6 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 30 Mar 2024 14:15:33 -0700
Subject: [PATCH 02/52] add allreduce code

---
 Makefile     |   9 ++-
 allreduce.cu | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+), 3 deletions(-)
 create mode 100644 allreduce.cu

diff --git a/Makefile b/Makefile
index df453b4..231e499 100644
--- a/Makefile
+++ b/Makefile
@@ -5,14 +5,17 @@
 
 CC	= cc
 INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
 LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 
-all: allgather.x
+all: allgather.x allreduce.x
 
 allgather.x: allgather.cu 
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
+allreduce.x: allreduce.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu
+
 clean: 
-	rm -f allgather.x 
+	rm -f allgather.x allreduce.x
diff --git a/allreduce.cu b/allreduce.cu
new file mode 100644
index 0000000..062b120
--- /dev/null
+++ b/allreduce.cu
@@ -0,0 +1,219 @@
+/* \file allreduce.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef USE_CUDA
+  #include <cuda_runtime.h>
+  #include <cuda_bf16.h>
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif defined(USE_RCCL)
+  #include "rccl.h"
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do {                         \
+  int e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%d'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define NCCL_CHECK(cmd) do {                        \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(nv_bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+        data[i] = __float2bfloat16((float)i);
+    }
+}
+
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+	nv_bfloat16* in = (nv_bfloat16*) invec;
+	nv_bfloat16* inout = (nv_bfloat16*) inoutvec;
+	for (int i = 0; i < *len; i++)
+	    inout[i] = __hadd(in[i], inout[i]);
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int min_msg_size = atoi(argv[2]);
+    int max_msg_size = atoi(argv[3]);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Initialize GPU context
+    cudaGetDeviceCount(&num_gpus_per_node);
+    cudaSetDevice((my_rank % num_gpus_per_node));
+
+    int local_data_size = max_msg_size; // Size of local data
+    int global_data_size = local_data_size; // Size of global data
+
+    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
+    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    // Allocate memory on GPU
+    nv_bfloat16 *d_local_data, *d_global_data;
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    // define custom reduce operation for nv_bfloat16 types
+    MPI_Op CUSTOM_SUM;
+    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
+
+    #elif USE_NCCL
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        ncclGetUniqueId(&nccl_comm_id);
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+
+    #elif defined(USE_RCCL)
+    // TODO: fix later
+    rcclComm_t rccl_comm;
+    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
+    #endif
+
+    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark results
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(nv_bfloat16);
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+	    cudaDeviceSynchronize();
+            #elif defined(USE_RCCL)
+	    // TODO: fix later
+            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        }
+
+	if(msg_size >= 8388608)
+	    iterations = 20;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+	    cudaDeviceSynchronize();
+            #elif defined(USE_RCCL)
+            // TODO: fix later
+            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+	if (my_rank == 0)
+	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+
+    #ifdef USE_NCCL
+    ncclCommDestroy(nccl_comm);
+    #elif defined(USE_RCCL)
+    rcclCommDestroy(rccl_comm);
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
+

From f516fa01dc6a1fe40f255216da98f0b235a1ede4 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 30 Mar 2024 16:40:26 -0700
Subject: [PATCH 03/52] add reduce scatter code

---
 Makefile          |   7 +-
 reduce_scatter.cu | 226 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 231 insertions(+), 2 deletions(-)
 create mode 100644 reduce_scatter.cu

diff --git a/Makefile b/Makefile
index 231e499..973364d 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA
 LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 
-all: allgather.x allreduce.x
+all: allgather.x allreduce.x reduce_scatter.x
 
 allgather.x: allgather.cu 
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
@@ -17,5 +17,8 @@ allgather.x: allgather.cu
 allreduce.x: allreduce.cu
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu
 
+reduce_scatter.x: reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu
+
 clean: 
-	rm -f allgather.x allreduce.x
+	rm -f allgather.x allreduce.x reduce_scatter.x
diff --git a/reduce_scatter.cu b/reduce_scatter.cu
new file mode 100644
index 0000000..9ed9e53
--- /dev/null
+++ b/reduce_scatter.cu
@@ -0,0 +1,226 @@
+/* \file reduce_scatter.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+
+#ifdef USE_CUDA
+  #include <cuda_runtime.h>
+  #include <cuda_bf16.h>
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif defined(USE_RCCL)
+  #include "rccl.h"
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do {                         \
+  int e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%d'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define NCCL_CHECK(cmd) do {                        \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(nv_bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+        data[i] = __float2bfloat16((float)i);
+    }
+}
+
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+	nv_bfloat16* in = (nv_bfloat16*) invec;
+	nv_bfloat16* inout = (nv_bfloat16*) inoutvec;
+	for (int i = 0; i < *len; i++)
+	    inout[i] = __hadd(in[i], inout[i]);
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int min_msg_size = atoi(argv[2]);
+    int max_msg_size = atoi(argv[3]);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Initialize GPU context
+    cudaGetDeviceCount(&num_gpus_per_node);
+    cudaSetDevice((my_rank % num_gpus_per_node));
+
+    int local_data_size = max_msg_size; // Size of local data
+    int global_data_size = local_data_size; // Size of global data
+
+    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
+    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    // Allocate memory on GPU
+    nv_bfloat16 *d_local_data, *d_global_data;
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    // define custom reduce operation for nv_bfloat16 types
+    MPI_Op CUSTOM_SUM;
+    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
+
+    #elif USE_NCCL
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        ncclGetUniqueId(&nccl_comm_id);
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+
+    #elif defined(USE_RCCL)
+    // TODO: fix later
+    rcclComm_t rccl_comm;
+    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
+    #endif
+
+    // init recvcounts to send an equal portion of data from the reduce operation
+    int num_elements = local_data_size / sizeof(nv_bfloat16);
+    int portion = num_elements / num_pes;
+    int *recvcounts = (int*) malloc(sizeof(int) * num_pes);
+    for (int i = 0; i < num_pes; i++) 
+        recvcounts[i] = portion;
+
+    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark results
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(nv_bfloat16);
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL)
+            NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
+	    cudaDeviceSynchronize();
+            #elif defined(USE_RCCL)
+	    // TODO: fix later
+            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        }
+
+	if(msg_size >= 8388608)
+	    iterations = 20;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL)
+            NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
+	    cudaDeviceSynchronize();
+            #elif defined(USE_RCCL)
+            // TODO: fix later
+            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+	if (my_rank == 0)
+	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+
+    #ifdef USE_NCCL
+    ncclCommDestroy(nccl_comm);
+    #elif defined(USE_RCCL)
+    rcclCommDestroy(rccl_comm);
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
+

From 23e9f5cea25b1c430ea0cfb3a0b5977a4ed27ff0 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 30 Mar 2024 16:43:04 -0700
Subject: [PATCH 04/52] remove duplicate commit

---
 allreduce.cu | 219 ---------------------------------------------------
 1 file changed, 219 deletions(-)
 delete mode 100644 allreduce.cu

diff --git a/allreduce.cu b/allreduce.cu
deleted file mode 100644
index 062b120..0000000
--- a/allreduce.cu
+++ /dev/null
@@ -1,219 +0,0 @@
-/* \file allreduce.cu
- * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
- * See the top-level LICENSE file for details.
- * 
- * SPDX-License-Identifier: MIT
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-
-#ifdef USE_CUDA
-  #include <cuda_runtime.h>
-  #include <cuda_bf16.h>
-#endif
-
-#ifdef USE_NCCL
-  #include "nccl.h"
-#elif defined(USE_RCCL)
-  #include "rccl.h"
-#endif
-
-#define NUM_WARMUP_ITERATIONS		5
-
-#define MPI_CHECK(cmd) do {                         \
-  int e = cmd;                                      \
-  if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%d'\n",        \
-        __FILE__,__LINE__, e);                      \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define CUDA_CHECK(cmd) do {                        \
-  cudaError_t e = cmd;                              \
-  if(e != cudaSuccess) {                            \
-    printf("CUDA error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, cudaGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define NCCL_CHECK(cmd) do {                        \
-  ncclResult_t e = cmd;                             \
-  if (e != ncclSuccess) {                           \
-    printf("NCCL error %s:%d %s\n",                 \
-        __FILE__, __LINE__, ncclGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-void initializeData(nv_bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
-        data[i] = __float2bfloat16((float)i);
-    }
-}
-
-void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-	nv_bfloat16* in = (nv_bfloat16*) invec;
-	nv_bfloat16* inout = (nv_bfloat16*) inoutvec;
-	for (int i = 0; i < *len; i++)
-	    inout[i] = __hadd(in[i], inout[i]);
-}
-
-int main(int argc, char *argv[]) {
-    if (argc != 5) {
-        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
-        return EXIT_FAILURE;
-    }
-
-    int num_gpus = atoi(argv[1]);
-    int min_msg_size = atoi(argv[2]);
-    int max_msg_size = atoi(argv[3]);
-    int iterations = atoi(argv[4]);
-
-    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
-        fprintf(stderr, "Invalid input parameters.\n");
-        return EXIT_FAILURE;
-    }
-
-    int my_rank, num_pes;
-    int num_gpus_per_node;
-    int msg_count;
-
-    MPI_Init(&argc, &argv);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-
-    if (num_pes != num_gpus) {
-        fprintf(stderr, "Number of processes must match number of GPUs.\n");
-        MPI_Finalize();
-        return EXIT_FAILURE;
-    }
-
-    // Initialize GPU context
-    cudaGetDeviceCount(&num_gpus_per_node);
-    cudaSetDevice((my_rank % num_gpus_per_node));
-
-    int local_data_size = max_msg_size; // Size of local data
-    int global_data_size = local_data_size; // Size of global data
-
-    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
-    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
-
-    // Initialize local data
-    initializeData(local_data, local_data_size);
-
-    // Allocate memory on GPU
-    nv_bfloat16 *d_local_data, *d_global_data;
-    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
-    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-
-    // Copy local data to GPU
-    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
-
-    #ifdef USE_MPI
-    // create 2-byte datatype (send raw, un-interpreted bytes)
-    MPI_Datatype mpi_type_bfloat16;
-    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
-    MPI_Type_commit(&mpi_type_bfloat16);
-
-    // define custom reduce operation for nv_bfloat16 types
-    MPI_Op CUSTOM_SUM;
-    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
-
-    #elif USE_NCCL
-    ncclUniqueId nccl_comm_id;
-    ncclComm_t nccl_comm;
-
-    if (my_rank == 0) {
-        /* Generates an Id to be used in ncclCommInitRank. */
-        ncclGetUniqueId(&nccl_comm_id);
-    }
-
-    /* distribute nccl_comm_id to all ranks */
-    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
-                        0, MPI_COMM_WORLD));
-
-    /* Create a new NCCL communicator */
-    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-
-    #elif defined(USE_RCCL)
-    // TODO: fix later
-    rcclComm_t rccl_comm;
-    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
-    #endif
-
-    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
-    double total_time, start_time;
-    MPI_Request request;
-    MPI_Status status;
-
-    // Print benchmark results
-    if (my_rank == 0) {
-        printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
-        printf("Number of iterations: %d\n", iterations);
-    }
-    fflush(NULL);
-
-    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(nv_bfloat16);
-	// warmup iterations
-	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
-            #ifdef USE_MPI
-            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
-                CUSTOM_SUM, MPI_COMM_WORLD, &request));
-
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
-            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-	    // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
-            #endif
-        }
-
-	if(msg_size >= 8388608)
-	    iterations = 20;
-
-        MPI_Barrier(MPI_COMM_WORLD);
-        start_time = MPI_Wtime();
-	for (int i = 0; i < iterations; ++i) {
-            #ifdef USE_MPI
-            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
-                CUSTOM_SUM, MPI_COMM_WORLD, &request));
-
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
-            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-            // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
-            #endif
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-        total_time = MPI_Wtime() - start_time;
-	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
-    }
-
-    // Cleanup
-    free(local_data);
-    free(global_data);
-    CUDA_CHECK(cudaFree(d_local_data));
-    CUDA_CHECK(cudaFree(d_global_data));
-
-    #ifdef USE_NCCL
-    ncclCommDestroy(nccl_comm);
-    #elif defined(USE_RCCL)
-    rcclCommDestroy(rccl_comm);
-    #endif
-
-    MPI_Finalize();
-    return EXIT_SUCCESS;
-}
-

From 7ff3fb503e4549ca170ea9ee08d763a90eb55584 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 30 Mar 2024 16:47:59 -0700
Subject: [PATCH 05/52] fix Makefile

---
 Makefile | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 973364d..7c01696 100644
--- a/Makefile
+++ b/Makefile
@@ -9,16 +9,13 @@ CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA
 LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 
-all: allgather.x allreduce.x reduce_scatter.x
+all: allgather.x reduce_scatter.x
 
 allgather.x: allgather.cu 
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
-allreduce.x: allreduce.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu
-
 reduce_scatter.x: reduce_scatter.cu
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu
 
 clean: 
-	rm -f allgather.x allreduce.x reduce_scatter.x
+	rm -f allgather.x reduce_scatter.x

From 982ccaf7c6d265bf9ed21962432f8d462621dfbe Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Sun, 31 Mar 2024 23:43:59 -0400
Subject: [PATCH 06/52] add code for ROCm and RCCL

---
 Makefile     |  16 +++++---
 allgather.cu | 101 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/Makefile b/Makefile
index df453b4..9f6d40a 100644
--- a/Makefile
+++ b/Makefile
@@ -4,15 +4,21 @@
 # SPDX-License-Identifier: MIT
 
 CC	= cc
-INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
+# perlmutter flags
+# INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
+# CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+# LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags
+INC = -L${ROCM_PATH}/lib -lamdhip64
+CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl
 
 all: allgather.x
 
-allgather.x: allgather.cu 
+allgather.x: allgather.cu
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
 clean: 
-	rm -f allgather.x 
+	rm -f allgather.x
diff --git a/allgather.cu b/allgather.cu
index 5953041..b6fd992 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -8,16 +8,21 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
-
+#include <iostream>
 #ifdef USE_CUDA
-  #include <cuda_runtime.h>
   #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
 #endif
 
 #ifdef USE_NCCL
   #include "nccl.h"
-#elif defined(USE_RCCL)
-  #include "rccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
 #endif
 
 #define NUM_WARMUP_ITERATIONS		5
@@ -40,6 +45,16 @@
   }                                                 \
 } while(0)
 
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
 #define NCCL_CHECK(cmd) do {                        \
   ncclResult_t e = cmd;                             \
   if (e != ncclSuccess) {                           \
@@ -49,9 +64,14 @@
   }                                                 \
 } while(0)
 
-void initializeData(nv_bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+        #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
     }
 }
 
@@ -86,33 +106,44 @@ int main(int argc, char *argv[]) {
     }
 
     // Initialize GPU context
+    #if USE_CUDA
     cudaGetDeviceCount(&num_gpus_per_node);
     cudaSetDevice((my_rank % num_gpus_per_node));
+    #elif USE_ROCM
+    hipGetDeviceCount(&num_gpus_per_node);
+    hipSetDevice((my_rank % num_gpus_per_node));
+    #endif
 
     int local_data_size = max_msg_size; // Size of local data
     int global_data_size = local_data_size * num_gpus; // Size of global data
 
-    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
-    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
 
     // Initialize local data
     initializeData(local_data, local_data_size);
 
     // Allocate memory on GPU
-    nv_bfloat16 *d_local_data, *d_global_data;
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
     CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-
     // Copy local data to GPU
     CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
 
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
     #ifdef USE_MPI
     // create 2-byte datatype (send raw, un-interpreted bytes)
     MPI_Datatype mpi_type_bfloat16;
     MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
     MPI_Type_commit(&mpi_type_bfloat16);
 
-    #elif USE_NCCL
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
     ncclUniqueId nccl_comm_id;
     ncclComm_t nccl_comm;
 
@@ -125,13 +156,8 @@ int main(int argc, char *argv[]) {
     MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
                         0, MPI_COMM_WORLD));
 
-    /* Create a new NCCL communicator */
+    /* Create a new NCCL/RCCL communicator */
     NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-
-    #elif defined(USE_RCCL)
-    // TODO: fix later
-    rcclComm_t rccl_comm;
-    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
     #endif
 
     // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
@@ -148,7 +174,7 @@ int main(int argc, char *argv[]) {
     fflush(NULL);
 
     for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(nv_bfloat16);
+	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
@@ -156,12 +182,14 @@ int main(int argc, char *argv[]) {
 		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
                 
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-	    // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
             #endif
         }
 
@@ -172,16 +200,18 @@ int main(int argc, char *argv[]) {
         start_time = MPI_Wtime();
 	for (int i = 0; i < iterations; ++i) {
             #ifdef USE_MPI
-            MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
-                d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
-
+	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
+                
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-            // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
             #endif
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -193,13 +223,16 @@ int main(int argc, char *argv[]) {
     // Cleanup
     free(local_data);
     free(global_data);
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaFree(d_local_data));
     CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
 
-    #ifdef USE_NCCL
+    #if defined(USE_NCCL) || defined(USE_RCCL)
     ncclCommDestroy(nccl_comm);
-    #elif defined(USE_RCCL)
-    rcclCommDestroy(rccl_comm);
     #endif
 
     MPI_Finalize();

From f70e65c096433273840f836e067450d1d1a760af Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Sun, 31 Mar 2024 23:56:46 -0400
Subject: [PATCH 07/52] add flags for ROCm and RCCL

---
 Makefile | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index df453b4..9f6d40a 100644
--- a/Makefile
+++ b/Makefile
@@ -4,15 +4,21 @@
 # SPDX-License-Identifier: MIT
 
 CC	= cc
-INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
+# perlmutter flags
+# INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
+# CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+# LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags
+INC = -L${ROCM_PATH}/lib -lamdhip64
+CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl
 
 all: allgather.x
 
-allgather.x: allgather.cu 
+allgather.x: allgather.cu
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
 clean: 
-	rm -f allgather.x 
+	rm -f allgather.x

From 8ab25d10e8dce0cce796837d5813b47a8425ebd7 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Sun, 31 Mar 2024 23:59:33 -0400
Subject: [PATCH 08/52] revert Makefile to original

---
 Makefile | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index 9f6d40a..11f5145 100644
--- a/Makefile
+++ b/Makefile
@@ -4,20 +4,14 @@
 # SPDX-License-Identifier: MIT
 
 CC	= cc
+INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
-# perlmutter flags
-# INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-# CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-# LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
-# frontier flags
-INC = -L${ROCM_PATH}/lib -lamdhip64
-CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
-LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl
 
 all: allgather.x
 
-allgather.x: allgather.cu
+allgather.x: allgather.cu 
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
 clean: 

From ef65ccde73efcf113fa3799462638dc3c8fcbf5e Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 00:12:38 -0400
Subject: [PATCH 09/52] remove unneeded import

---
 allgather.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/allgather.cu b/allgather.cu
index b6fd992..698e425 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -8,7 +8,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
-#include <iostream>
 #ifdef USE_CUDA
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16

From 795a6d3e4f5323b3fa8e706ae4f159b58c0dbfca Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 00:28:31 -0400
Subject: [PATCH 10/52] add ROCm and RCCL code for all-reduce

---
 allreduce.cu | 95 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 63 insertions(+), 32 deletions(-)

diff --git a/allreduce.cu b/allreduce.cu
index 062b120..4394249 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -8,16 +8,20 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
-
 #ifdef USE_CUDA
-  #include <cuda_runtime.h>
   #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
 #endif
 
 #ifdef USE_NCCL
   #include "nccl.h"
-#elif defined(USE_RCCL)
-  #include "rccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
 #endif
 
 #define NUM_WARMUP_ITERATIONS		5
@@ -40,6 +44,16 @@
   }                                                 \
 } while(0)
 
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
 #define NCCL_CHECK(cmd) do {                        \
   ncclResult_t e = cmd;                             \
   if (e != ncclSuccess) {                           \
@@ -49,9 +63,14 @@
   }                                                 \
 } while(0)
 
-void initializeData(nv_bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+        #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
     }
 }
 
@@ -93,26 +112,36 @@ int main(int argc, char *argv[]) {
     }
 
     // Initialize GPU context
+    #if USE_CUDA
     cudaGetDeviceCount(&num_gpus_per_node);
     cudaSetDevice((my_rank % num_gpus_per_node));
+    #elif USE_ROCM
+    hipGetDeviceCount(&num_gpus_per_node);
+    hipSetDevice((my_rank % num_gpus_per_node));
+    #endif
 
     int local_data_size = max_msg_size; // Size of local data
     int global_data_size = local_data_size; // Size of global data
 
-    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
-    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
 
     // Initialize local data
     initializeData(local_data, local_data_size);
 
-    // Allocate memory on GPU
-    nv_bfloat16 *d_local_data, *d_global_data;
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
     CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-
     // Copy local data to GPU
     CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
 
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
     #ifdef USE_MPI
     // create 2-byte datatype (send raw, un-interpreted bytes)
     MPI_Datatype mpi_type_bfloat16;
@@ -123,7 +152,7 @@ int main(int argc, char *argv[]) {
     MPI_Op CUSTOM_SUM;
     MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
 
-    #elif USE_NCCL
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
     ncclUniqueId nccl_comm_id;
     ncclComm_t nccl_comm;
 
@@ -136,13 +165,8 @@ int main(int argc, char *argv[]) {
     MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
                         0, MPI_COMM_WORLD));
 
-    /* Create a new NCCL communicator */
+    /* Create a new NCCL/RCCL communicator */
     NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-
-    #elif defined(USE_RCCL)
-    // TODO: fix later
-    rcclComm_t rccl_comm;
-    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
     #endif
 
     // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
@@ -159,7 +183,7 @@ int main(int argc, char *argv[]) {
     fflush(NULL);
 
     for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(nv_bfloat16);
+	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
@@ -167,12 +191,14 @@ int main(int argc, char *argv[]) {
                 CUSTOM_SUM, MPI_COMM_WORLD, &request));
 
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-	    // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
             #endif
         }
 
@@ -187,12 +213,14 @@ int main(int argc, char *argv[]) {
                 CUSTOM_SUM, MPI_COMM_WORLD, &request));
 
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-            // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
             #endif
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -204,13 +232,16 @@ int main(int argc, char *argv[]) {
     // Cleanup
     free(local_data);
     free(global_data);
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaFree(d_local_data));
     CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
 
-    #ifdef USE_NCCL
+    #if defined(USE_NCCL) || defined(USE_RCCL)
     ncclCommDestroy(nccl_comm);
-    #elif defined(USE_RCCL)
-    rcclCommDestroy(rccl_comm);
     #endif
 
     MPI_Finalize();

From b9e882437578691448c9748aa67caae643db422a Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 00:38:59 -0400
Subject: [PATCH 11/52] add ROCm and RCCL code for reduce-scatter

---
 reduce_scatter.cu | 98 +++++++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 34 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 9ed9e53..8f851d4 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -8,16 +8,20 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
-
 #ifdef USE_CUDA
-  #include <cuda_runtime.h>
   #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
 #endif
 
 #ifdef USE_NCCL
   #include "nccl.h"
-#elif defined(USE_RCCL)
-  #include "rccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
 #endif
 
 #define NUM_WARMUP_ITERATIONS		5
@@ -40,6 +44,16 @@
   }                                                 \
 } while(0)
 
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
 #define NCCL_CHECK(cmd) do {                        \
   ncclResult_t e = cmd;                             \
   if (e != ncclSuccess) {                           \
@@ -49,9 +63,14 @@
   }                                                 \
 } while(0)
 
-void initializeData(nv_bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+        #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
     }
 }
 
@@ -93,26 +112,36 @@ int main(int argc, char *argv[]) {
     }
 
     // Initialize GPU context
+    #if USE_CUDA
     cudaGetDeviceCount(&num_gpus_per_node);
     cudaSetDevice((my_rank % num_gpus_per_node));
+    #elif USE_ROCM
+    hipGetDeviceCount(&num_gpus_per_node);
+    hipSetDevice((my_rank % num_gpus_per_node));
+    #endif
 
     int local_data_size = max_msg_size; // Size of local data
     int global_data_size = local_data_size; // Size of global data
 
-    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
-    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
 
     // Initialize local data
     initializeData(local_data, local_data_size);
 
-    // Allocate memory on GPU
-    nv_bfloat16 *d_local_data, *d_global_data;
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
     CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-
     // Copy local data to GPU
     CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
 
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
     #ifdef USE_MPI
     // create 2-byte datatype (send raw, un-interpreted bytes)
     MPI_Datatype mpi_type_bfloat16;
@@ -123,7 +152,7 @@ int main(int argc, char *argv[]) {
     MPI_Op CUSTOM_SUM;
     MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
 
-    #elif USE_NCCL
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
     ncclUniqueId nccl_comm_id;
     ncclComm_t nccl_comm;
 
@@ -136,17 +165,12 @@ int main(int argc, char *argv[]) {
     MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
                         0, MPI_COMM_WORLD));
 
-    /* Create a new NCCL communicator */
+    /* Create a new NCCL/RCCL communicator */
     NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-
-    #elif defined(USE_RCCL)
-    // TODO: fix later
-    rcclComm_t rccl_comm;
-    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
     #endif
 
     // init recvcounts to send an equal portion of data from the reduce operation
-    int num_elements = local_data_size / sizeof(nv_bfloat16);
+    int num_elements = local_data_size / sizeof(bfloat16);
     int portion = num_elements / num_pes;
     int *recvcounts = (int*) malloc(sizeof(int) * num_pes);
     for (int i = 0; i < num_pes; i++) 
@@ -166,7 +190,7 @@ int main(int argc, char *argv[]) {
     fflush(NULL);
 
     for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(nv_bfloat16);
+	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
@@ -174,12 +198,14 @@ int main(int argc, char *argv[]) {
                 CUSTOM_SUM, MPI_COMM_WORLD, &request));
 
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-	    // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
             #endif
         }
 
@@ -194,12 +220,14 @@ int main(int argc, char *argv[]) {
                 CUSTOM_SUM, MPI_COMM_WORLD, &request));
 
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-            // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
             #endif
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -211,16 +239,18 @@ int main(int argc, char *argv[]) {
     // Cleanup
     free(local_data);
     free(global_data);
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaFree(d_local_data));
     CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
 
-    #ifdef USE_NCCL
+    #if defined(USE_NCCL) || defined(USE_RCCL)
     ncclCommDestroy(nccl_comm);
-    #elif defined(USE_RCCL)
-    rcclCommDestroy(rccl_comm);
     #endif
 
     MPI_Finalize();
     return EXIT_SUCCESS;
 }
-

From e077503742f70d62e1c26043eb755f25f9b61358 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <59426357+RoastSea8@users.noreply.github.com>
Date: Sun, 31 Mar 2024 21:48:27 -0700
Subject: [PATCH 12/52] Update and rename README to README.md

---
 README => README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
 rename README => README.md (52%)

diff --git a/README b/README.md
similarity index 52%
rename from README
rename to README.md
index eba2046..396231b 100644
--- a/README
+++ b/README.md
@@ -1,9 +1,13 @@
 Before compiling do these:
 
+### Perlmutter
+```sh
 module load PrgEnv-cray cudatoolkit craype-accel-nvidia80
 export CRAY_ACCEL_TARGET=nvidia80
-
-When running do these:
-
-module load cudatoolkit
 export MPICH_GPU_SUPPORT_ENABLED=1
+```
+### Frontier
+```sh
+module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a
+export MPICH_GPU_SUPPORT_ENABLED=1
+```

From 686be82807a62cb66f3ba91fe055c9644b2d4442 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 00:57:19 -0400
Subject: [PATCH 13/52] revert Makefile to original

---
 Makefile | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 7c01696..11f5145 100644
--- a/Makefile
+++ b/Makefile
@@ -5,17 +5,14 @@
 
 CC	= cc
 INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
 LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 
-all: allgather.x reduce_scatter.x
+all: allgather.x
 
 allgather.x: allgather.cu 
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
-reduce_scatter.x: reduce_scatter.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu
-
 clean: 
-	rm -f allgather.x reduce_scatter.x
+	rm -f allgather.x

From ec4fdedfe0e26a4344a778e0b9ec7fd0ed8985ab Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 00:58:42 -0400
Subject: [PATCH 14/52] revert Makefile to original

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 11f5145..df453b4 100644
--- a/Makefile
+++ b/Makefile
@@ -15,4 +15,4 @@ allgather.x: allgather.cu
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
 clean: 
-	rm -f allgather.x
+	rm -f allgather.x 

From 4a87dfc7a3084f57f1727865ab4889b3441fcdfd Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 00:59:47 -0400
Subject: [PATCH 15/52] revert Makefile to original

---
 Makefile | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 231e499..df453b4 100644
--- a/Makefile
+++ b/Makefile
@@ -5,17 +5,14 @@
 
 CC	= cc
 INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
 LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 
-all: allgather.x allreduce.x
+all: allgather.x
 
 allgather.x: allgather.cu 
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
-allreduce.x: allreduce.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu
-
 clean: 
-	rm -f allgather.x allreduce.x
+	rm -f allgather.x 

From b6083d1cea284f619120d27e1a9c62018cf1464a Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 01:00:41 -0400
Subject: [PATCH 16/52] revert Makefile to original

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 11f5145..df453b4 100644
--- a/Makefile
+++ b/Makefile
@@ -15,4 +15,4 @@ allgather.x: allgather.cu
 	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
 
 clean: 
-	rm -f allgather.x
+	rm -f allgather.x 

From 79b2fb96a335ff798f63160ab36d530753665114 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 01:34:35 -0400
Subject: [PATCH 17/52] update custom bf16 sum function

---
 allreduce.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/allreduce.cu b/allreduce.cu
index 4394249..7bf1031 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -75,10 +75,14 @@ void initializeData(bfloat16 *data, int size) {
 }
 
 void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-	nv_bfloat16* in = (nv_bfloat16*) invec;
-	nv_bfloat16* inout = (nv_bfloat16*) inoutvec;
+	bfloat16* in = (bfloat16*) invec;
+	bfloat16* inout = (bfloat16*) inoutvec;
 	for (int i = 0; i < *len; i++)
+        #ifdef USE_CUDA
 	    inout[i] = __hadd(in[i], inout[i]);
+        #elif USE_ROCM
+        inout[i] = in[i] + inout[i];
+        #endif
 }
 
 int main(int argc, char *argv[]) {

From 60e6911eb66fed80565a0fba1f59f932163fd194 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 01:40:18 -0400
Subject: [PATCH 18/52] update custom bf16 sum function

---
 reduce_scatter.cu | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 8f851d4..5db2b60 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -75,10 +75,15 @@ void initializeData(bfloat16 *data, int size) {
 }
 
 void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-	nv_bfloat16* in = (nv_bfloat16*) invec;
-	nv_bfloat16* inout = (nv_bfloat16*) inoutvec;
-	for (int i = 0; i < *len; i++)
+    bfloat16* in = (bfloat16*) invec;
+	bfloat16* inout = (bfloat16*) inoutvec;
+	for (int i = 0; i < *len; i++) {
+        #ifdef USE_CUDA
 	    inout[i] = __hadd(in[i], inout[i]);
+        #elif USE_ROCM
+        inout[i] = in[i] + inout[i];
+        #endif
+    }
 }
 
 int main(int argc, char *argv[]) {

From ef6fb0d6d22e9e0dac426da032cd678df12875e3 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 01:41:28 -0400
Subject: [PATCH 19/52] fix custom bf16 sum function

---
 allreduce.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/allreduce.cu b/allreduce.cu
index 7bf1031..9f017db 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -77,12 +77,13 @@ void initializeData(bfloat16 *data, int size) {
 void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
 	bfloat16* in = (bfloat16*) invec;
 	bfloat16* inout = (bfloat16*) inoutvec;
-	for (int i = 0; i < *len; i++)
+	for (int i = 0; i < *len; i++) {
         #ifdef USE_CUDA
 	    inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
         inout[i] = in[i] + inout[i];
         #endif
+    }
 }
 
 int main(int argc, char *argv[]) {

From 8052ca765fc87da0cb8f9de17bcc8d252b4dad56 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 01:45:14 -0400
Subject: [PATCH 20/52] fix indents

---
 allreduce.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/allreduce.cu b/allreduce.cu
index 9f017db..2ffac86 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -75,11 +75,11 @@ void initializeData(bfloat16 *data, int size) {
 }
 
 void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-	bfloat16* in = (bfloat16*) invec;
-	bfloat16* inout = (bfloat16*) inoutvec;
-	for (int i = 0; i < *len; i++) {
+    bfloat16* in = (bfloat16*) invec;
+    bfloat16* inout = (bfloat16*) inoutvec;
+    for (int i = 0; i < *len; i++) {
         #ifdef USE_CUDA
-	    inout[i] = __hadd(in[i], inout[i]);
+        inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
         inout[i] = in[i] + inout[i];
         #endif

From a67570e98862f1e62ac850d295cffa9a3fc79206 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 1 Apr 2024 01:46:47 -0400
Subject: [PATCH 21/52] fix indents

---
 reduce_scatter.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 5db2b60..b667c01 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int size) {
 
 void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
-	bfloat16* inout = (bfloat16*) inoutvec;
-	for (int i = 0; i < *len; i++) {
+    bfloat16* inout = (bfloat16*) inoutvec;
+    for (int i = 0; i < *len; i++) {
         #ifdef USE_CUDA
-	    inout[i] = __hadd(in[i], inout[i]);
+        inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
         inout[i] = in[i] + inout[i];
         #endif

From fdb324ad8fb90fb6c9ec78c7551c8324e2655fa7 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Tue, 9 Apr 2024 06:05:06 -0700
Subject: [PATCH 22/52] update Makefile

---
 Makefile | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 9f6d40a..52c0235 100644
--- a/Makefile
+++ b/Makefile
@@ -3,17 +3,17 @@
 # 
 # SPDX-License-Identifier: MIT
 
-CC	= cc
+CC = cc
 
 # perlmutter flags
-# INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-# CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-# LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+# INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 # frontier flags
-INC = -L${ROCM_PATH}/lib -lamdhip64
-CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
-LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl
+INC = -I${ROCM_PATH}/include
+CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
 
 all: allgather.x
 

From 405f09084247c87e318804074fdc2e0a0ac0f296 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <59426357+RoastSea8@users.noreply.github.com>
Date: Thu, 11 Apr 2024 15:05:03 -0700
Subject: [PATCH 23/52] Create allreduce.cu

---
 allreduce.cu | 254 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 allreduce.cu

diff --git a/allreduce.cu b/allreduce.cu
new file mode 100644
index 0000000..51b6248
--- /dev/null
+++ b/allreduce.cu
@@ -0,0 +1,254 @@
+/* \file allreduce.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#ifdef USE_CUDA
+  #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do {                         \
+  int e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%d'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
+#define NCCL_CHECK(cmd) do {                        \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(bfloat16 *data, int size) {
+    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+        #ifdef USE_CUDA
+        data[i] = __float2bfloat16((float)i);
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
+    }
+}
+
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+    bfloat16* in = (bfloat16*) invec;
+    bfloat16* inout = (bfloat16*) inoutvec;
+    for (int i = 0; i < *len; i++) {
+        #ifdef USE_CUDA
+        inout[i] = __hadd(in[i], inout[i]);
+        #elif USE_ROCM
+        inout[i] = in[i] + inout[i];
+        #endif
+    }
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int min_msg_size = atoi(argv[2]);
+    int max_msg_size = atoi(argv[3]);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Bind this rank to a GPU: rank modulo the number of devices visible on the node
+    #ifdef USE_CUDA
+    cudaGetDeviceCount(&num_gpus_per_node);
+    cudaSetDevice((my_rank % num_gpus_per_node));
+    #elif USE_ROCM
+    hipGetDeviceCount(&num_gpus_per_node);
+    hipSetDevice((my_rank % num_gpus_per_node));
+    #endif
+
+    int local_data_size = max_msg_size; // Size of local data
+    int global_data_size = local_data_size; // Size of global data
+
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    // define custom sum operation for bfloat16 elements (registered as commutative: commute = 1)
+    MPI_Op CUSTOM_SUM;
+    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
+
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        NCCL_CHECK(ncclGetUniqueId(&nccl_comm_id));
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL/RCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+    #endif
+
+    // Time MPI_Iallreduce or NCCL/RCCL ncclAllReduce for each message size
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark results
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(bfloat16);
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
+            #endif
+        }
+	// Messages of 8 MiB and up use 20 timed iterations; the cap persists for every later size
+	if(msg_size >= 8388608)
+	    iterations = 20;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+	if (my_rank == 0)
+	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
+
+    #if defined(USE_NCCL) || defined(USE_RCCL)
+    NCCL_CHECK(ncclCommDestroy(nccl_comm));
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}

From 63bb696da18e45195885cd465cb54af3930b77b9 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 13:47:23 -0700
Subject: [PATCH 24/52] change to int64_t for global/local data size

---
 reduce_scatter.cu | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index b667c01..820cf4f 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -8,6 +8,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
+#include <stdint.h>
+
 #ifdef USE_CUDA
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
@@ -27,9 +29,9 @@
 #define NUM_WARMUP_ITERATIONS		5
 
 #define MPI_CHECK(cmd) do {                         \
-  int e = cmd;                                      \
+  int64_t e = cmd;                                      \
   if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%d'\n",        \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
         __FILE__,__LINE__, e);                      \
     exit(EXIT_FAILURE);                             \
   }                                                 \
@@ -63,8 +65,8 @@
   }                                                 \
 } while(0)
 
-void initializeData(bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int64_t size) {
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
         #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
         #elif USE_ROCM
@@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) {
     }
 }
 
-void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
     bfloat16* inout = (bfloat16*) inoutvec;
-    for (int i = 0; i < *len; i++) {
+    for (int64_t i = 0; i < *len; i++) {
         #ifdef USE_CUDA
         inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
@@ -93,8 +95,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int min_msg_size = atoi(argv[2]);
-    int max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = atoi(argv[2]);
+    int64_t max_msg_size = atoi(argv[3]);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
@@ -125,8 +127,13 @@ int main(int argc, char *argv[]) {
     hipSetDevice((my_rank % num_gpus_per_node));
     #endif
 
-    int local_data_size = max_msg_size; // Size of local data
-    int global_data_size = local_data_size; // Size of global data
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size; // Size of global data
+
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
 
     bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
     bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
@@ -189,12 +196,12 @@ int main(int argc, char *argv[]) {
     // Print benchmark results
     if (my_rank == 0) {
         printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
         printf("Number of iterations: %d\n", iterations);
     }
     fflush(NULL);
 
-    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
 	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
@@ -238,7 +245,7 @@ int main(int argc, char *argv[]) {
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
 	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
     }
 
     // Cleanup

From 3082c980f49b0ab7a88f2af24139a069c54adb95 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 13:57:03 -0700
Subject: [PATCH 25/52] change to int64_t for global/local data size

---
 allgather.cu | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/allgather.cu b/allgather.cu
index 698e425..8ae7481 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -8,6 +8,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
+#include <stdint.h>
+
 #ifdef USE_CUDA
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
@@ -27,9 +29,9 @@
 #define NUM_WARMUP_ITERATIONS		5
 
 #define MPI_CHECK(cmd) do {                         \
-  int e = cmd;                                      \
+  int64_t e = cmd;                                      \
   if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%d'\n",        \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
         __FILE__,__LINE__, e);                      \
     exit(EXIT_FAILURE);                             \
   }                                                 \
@@ -63,8 +65,8 @@
   }                                                 \
 } while(0)
 
-void initializeData(bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int64_t size) {
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
         #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
         #elif USE_ROCM
@@ -81,8 +83,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int min_msg_size = atoi(argv[2]);
-    int max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = atoi(argv[2]);
+    int64_t max_msg_size = atoi(argv[3]);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
@@ -113,8 +115,13 @@ int main(int argc, char *argv[]) {
     hipSetDevice((my_rank % num_gpus_per_node));
     #endif
 
-    int local_data_size = max_msg_size; // Size of local data
-    int global_data_size = local_data_size * num_gpus; // Size of global data
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size * num_gpus; // Size of global data
+
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
 
     bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
     bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
@@ -167,12 +174,12 @@ int main(int argc, char *argv[]) {
     // Print benchmark results
     if (my_rank == 0) {
         printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
         printf("Number of iterations: %d\n", iterations);
     }
     fflush(NULL);
 
-    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
 	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
@@ -216,7 +223,7 @@ int main(int argc, char *argv[]) {
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
 	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
     }
 
     // Cleanup

From 3c91d01234babf777f467428306eb78665368639 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 14:02:49 -0700
Subject: [PATCH 26/52] change to int64_t for global/local data size

---
 allreduce.cu      | 33 ++++++++++++++++++++-------------
 reduce_scatter.cu | 33 ++++++++++++++++++++-------------
 2 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/allreduce.cu b/allreduce.cu
index 51b6248..63e1635 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -8,6 +8,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
+#include <stdint.h>
+
 #ifdef USE_CUDA
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
@@ -27,9 +29,9 @@
 #define NUM_WARMUP_ITERATIONS		5
 
 #define MPI_CHECK(cmd) do {                         \
-  int e = cmd;                                      \
+  int64_t e = cmd;                                      \
   if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%d'\n",        \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
         __FILE__,__LINE__, e);                      \
     exit(EXIT_FAILURE);                             \
   }                                                 \
@@ -63,8 +65,8 @@
   }                                                 \
 } while(0)
 
-void initializeData(bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int64_t size) {
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
         #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
         #elif USE_ROCM
@@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) {
     }
 }
 
-void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
     bfloat16* inout = (bfloat16*) inoutvec;
-    for (int i = 0; i < *len; i++) {
+    for (int64_t i = 0; i < *len; i++) {
         #ifdef USE_CUDA
         inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
@@ -93,8 +95,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int min_msg_size = atoi(argv[2]);
-    int max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = atoi(argv[2]);
+    int64_t max_msg_size = atoi(argv[3]);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
@@ -125,8 +127,13 @@ int main(int argc, char *argv[]) {
     hipSetDevice((my_rank % num_gpus_per_node));
     #endif
 
-    int local_data_size = max_msg_size; // Size of local data
-    int global_data_size = local_data_size; // Size of global data
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size; // Size of global data 
+    
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
 
     bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
     bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
@@ -182,12 +189,12 @@ int main(int argc, char *argv[]) {
     // Print benchmark results
     if (my_rank == 0) {
         printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
         printf("Number of iterations: %d\n", iterations);
     }
     fflush(NULL);
 
-    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
 	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
@@ -231,7 +238,7 @@ int main(int argc, char *argv[]) {
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
 	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
     }
 
     // Cleanup
diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index b667c01..820cf4f 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -8,6 +8,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
+#include <stdint.h>
+
 #ifdef USE_CUDA
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
@@ -27,9 +29,9 @@
 #define NUM_WARMUP_ITERATIONS		5
 
 #define MPI_CHECK(cmd) do {                         \
-  int e = cmd;                                      \
+  int64_t e = cmd;                                      \
   if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%d'\n",        \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
         __FILE__,__LINE__, e);                      \
     exit(EXIT_FAILURE);                             \
   }                                                 \
@@ -63,8 +65,8 @@
   }                                                 \
 } while(0)
 
-void initializeData(bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int64_t size) {
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
         #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
         #elif USE_ROCM
@@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) {
     }
 }
 
-void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
     bfloat16* inout = (bfloat16*) inoutvec;
-    for (int i = 0; i < *len; i++) {
+    for (int64_t i = 0; i < *len; i++) {
         #ifdef USE_CUDA
         inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
@@ -93,8 +95,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int min_msg_size = atoi(argv[2]);
-    int max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = atoi(argv[2]);
+    int64_t max_msg_size = atoi(argv[3]);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
@@ -125,8 +127,13 @@ int main(int argc, char *argv[]) {
     hipSetDevice((my_rank % num_gpus_per_node));
     #endif
 
-    int local_data_size = max_msg_size; // Size of local data
-    int global_data_size = local_data_size; // Size of global data
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size; // Size of global data
+
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
 
     bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
     bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
@@ -189,12 +196,12 @@ int main(int argc, char *argv[]) {
     // Print benchmark results
     if (my_rank == 0) {
         printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
         printf("Number of iterations: %d\n", iterations);
     }
     fflush(NULL);
 
-    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
 	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
@@ -238,7 +245,7 @@ int main(int argc, char *argv[]) {
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
 	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
     }
 
     // Cleanup

From 0a33166c8ed7059a422d3bdda3c4804604a9f849 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 14:33:25 -0700
Subject: [PATCH 27/52] revert type change for custom sum

---
 reduce_scatter.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 820cf4f..f824072 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) {
     }
 }
 
-void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) {
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
     bfloat16* inout = (bfloat16*) inoutvec;
-    for (int64_t i = 0; i < *len; i++) {
+    for (int i = 0; i < *len; i++) {
         #ifdef USE_CUDA
         inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM

From 8be09db3867b60b275bd587c16aaba1a4eb5b40c Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 16:41:03 -0700
Subject: [PATCH 28/52] setup benchmarks rig and add results so far

---
 allreduce.cu                         |  4 +--
 mpi/Makefile                         | 30 ++++++++++++++++++++++
 mpi/all-gather/128_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 mpi/all-gather/16_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 mpi/all-gather/32_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 mpi/all-gather/64_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 mpi/all-gather/8_gpu_run.sh          | 37 ++++++++++++++++++++++++++++
 mpi/all-gather/benchmarks/16_gpu.txt | 12 +++++++++
 mpi/all-gather/benchmarks/8_gpu.txt  | 13 ++++++++++
 mpi/all-reduce/128_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 mpi/all-reduce/16_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 mpi/all-reduce/32_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 mpi/all-reduce/64_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 mpi/reduce-scatter/128_gpu_run.sh    | 37 ++++++++++++++++++++++++++++
 mpi/reduce-scatter/16_gpu_run.sh     | 37 ++++++++++++++++++++++++++++
 mpi/reduce-scatter/32_gpu_run.sh     | 37 ++++++++++++++++++++++++++++
 mpi/reduce-scatter/64_gpu_run.sh     | 37 ++++++++++++++++++++++++++++
 mpi/reduce-scatter/8_gpu_run.sh      | 37 ++++++++++++++++++++++++++++
 nccl/Makefile                        | 30 ++++++++++++++++++++++
 nccl/all-gather/128_gpu_run.sh       | 37 ++++++++++++++++++++++++++++
 nccl/all-gather/16_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 nccl/all-gather/32_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 nccl/all-gather/64_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 nccl/all-gather/8_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 nccl/all-reduce/128_gpu_run.sh       | 37 ++++++++++++++++++++++++++++
 nccl/all-reduce/16_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 nccl/all-reduce/32_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 nccl/all-reduce/64_gpu_run.sh        | 37 ++++++++++++++++++++++++++++
 nccl/all-reduce/8_gpu_run.sh         | 37 ++++++++++++++++++++++++++++
 nccl/reduce-scatter/128_gpu_run.sh   | 37 ++++++++++++++++++++++++++++
 nccl/reduce-scatter/16_gpu_run.sh    | 37 ++++++++++++++++++++++++++++
 nccl/reduce-scatter/32_gpu_run.sh    | 37 ++++++++++++++++++++++++++++
 nccl/reduce-scatter/64_gpu_run.sh    | 37 ++++++++++++++++++++++++++++
 nccl/reduce-scatter/8_gpu_run.sh     | 37 ++++++++++++++++++++++++++++
 Makefile => rccl/Makefile            | 14 ++++++++---
 reduce_scatter.cu                    |  4 +--
 36 files changed, 1172 insertions(+), 8 deletions(-)
 create mode 100644 mpi/Makefile
 create mode 100644 mpi/all-gather/128_gpu_run.sh
 create mode 100644 mpi/all-gather/16_gpu_run.sh
 create mode 100644 mpi/all-gather/32_gpu_run.sh
 create mode 100644 mpi/all-gather/64_gpu_run.sh
 create mode 100644 mpi/all-gather/8_gpu_run.sh
 create mode 100644 mpi/all-gather/benchmarks/16_gpu.txt
 create mode 100644 mpi/all-gather/benchmarks/8_gpu.txt
 create mode 100644 mpi/all-reduce/128_gpu_run.sh
 create mode 100644 mpi/all-reduce/16_gpu_run.sh
 create mode 100644 mpi/all-reduce/32_gpu_run.sh
 create mode 100644 mpi/all-reduce/64_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/128_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/16_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/32_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/64_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/8_gpu_run.sh
 create mode 100644 nccl/Makefile
 create mode 100644 nccl/all-gather/128_gpu_run.sh
 create mode 100644 nccl/all-gather/16_gpu_run.sh
 create mode 100644 nccl/all-gather/32_gpu_run.sh
 create mode 100644 nccl/all-gather/64_gpu_run.sh
 create mode 100644 nccl/all-gather/8_gpu_run.sh
 create mode 100644 nccl/all-reduce/128_gpu_run.sh
 create mode 100644 nccl/all-reduce/16_gpu_run.sh
 create mode 100644 nccl/all-reduce/32_gpu_run.sh
 create mode 100644 nccl/all-reduce/64_gpu_run.sh
 create mode 100644 nccl/all-reduce/8_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/128_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/16_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/32_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/64_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/8_gpu_run.sh
 rename Makefile => rccl/Makefile (57%)

diff --git a/allreduce.cu b/allreduce.cu
index 63e1635..ddbfb97 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) {
     }
 }
 
-void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) {
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
     bfloat16* inout = (bfloat16*) inoutvec;
-    for (int64_t i = 0; i < *len; i++) {
+    for (int i = 0; i < *len; i++) {
         #ifdef USE_CUDA
         inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM
diff --git a/mpi/Makefile b/mpi/Makefile
new file mode 100644
index 0000000..782a6bf
--- /dev/null
+++ b/mpi/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC = cc
+
+# perlmutter flags
+INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags
+# INC = -I${ROCM_PATH}/include
+# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+
+all: allgather.x allreduce.x reduce_scatter.x
+
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
+
+clean: 
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/mpi/all-gather/128_gpu_run.sh b/mpi/all-gather/128_gpu_run.sh
new file mode 100644
index 0000000..3af373c
--- /dev/null
+++ b/mpi/all-gather/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 16))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/16_gpu_run.sh b/mpi/all-gather/16_gpu_run.sh
new file mode 100644
index 0000000..25d7b92
--- /dev/null
+++ b/mpi/all-gather/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 128))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/32_gpu_run.sh b/mpi/all-gather/32_gpu_run.sh
new file mode 100644
index 0000000..3a03ef0
--- /dev/null
+++ b/mpi/all-gather/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/64_gpu_run.sh b/mpi/all-gather/64_gpu_run.sh
new file mode 100644
index 0000000..37ba334
--- /dev/null
+++ b/mpi/all-gather/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/8_gpu_run.sh b/mpi/all-gather/8_gpu_run.sh
new file mode 100644
index 0000000..aa3e3a8
--- /dev/null
+++ b/mpi/all-gather/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/benchmarks/16_gpu.txt b/mpi/all-gather/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..b69654b
--- /dev/null
+++ b/mpi/all-gather/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 128
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 2097152 - 134217728
+Number of iterations: 10
+2097152 0.002391 seconds
+4194304 0.003558 seconds
+8388608 0.007162 seconds
+16777216 0.014929 seconds
+33554432 0.030427 seconds
+67108864 0.062092 seconds
+134217728 0.151508 seconds
diff --git a/mpi/all-gather/benchmarks/8_gpu.txt b/mpi/all-gather/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..de3a837
--- /dev/null
+++ b/mpi/all-gather/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000838 seconds
+4194304 0.001719 seconds
+8388608 0.003172 seconds
+16777216 0.006797 seconds
+33554432 0.013860 seconds
+67108864 0.027938 seconds
+134217728 0.055353 seconds
+268435456 0.104310 seconds
diff --git a/mpi/all-reduce/128_gpu_run.sh b/mpi/all-reduce/128_gpu_run.sh
new file mode 100644
index 0000000..6a5ccff
--- /dev/null
+++ b/mpi/all-reduce/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/16_gpu_run.sh b/mpi/all-reduce/16_gpu_run.sh
new file mode 100644
index 0000000..4158fe0
--- /dev/null
+++ b/mpi/all-reduce/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/32_gpu_run.sh b/mpi/all-reduce/32_gpu_run.sh
new file mode 100644
index 0000000..8990167
--- /dev/null
+++ b/mpi/all-reduce/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/64_gpu_run.sh b/mpi/all-reduce/64_gpu_run.sh
new file mode 100644
index 0000000..314f852
--- /dev/null
+++ b/mpi/all-reduce/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/128_gpu_run.sh b/mpi/reduce-scatter/128_gpu_run.sh
new file mode 100644
index 0000000..e0a9db1
--- /dev/null
+++ b/mpi/reduce-scatter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/16_gpu_run.sh b/mpi/reduce-scatter/16_gpu_run.sh
new file mode 100644
index 0000000..be576de
--- /dev/null
+++ b/mpi/reduce-scatter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/32_gpu_run.sh b/mpi/reduce-scatter/32_gpu_run.sh
new file mode 100644
index 0000000..04a7f0a
--- /dev/null
+++ b/mpi/reduce-scatter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/64_gpu_run.sh b/mpi/reduce-scatter/64_gpu_run.sh
new file mode 100644
index 0000000..48c7645
--- /dev/null
+++ b/mpi/reduce-scatter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/8_gpu_run.sh b/mpi/reduce-scatter/8_gpu_run.sh
new file mode 100644
index 0000000..5f8f10e
--- /dev/null
+++ b/mpi/reduce-scatter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/Makefile b/nccl/Makefile
new file mode 100644
index 0000000..5652112
--- /dev/null
+++ b/nccl/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC = cc
+
+# perlmutter flags
+INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags
+# INC = -I${ROCM_PATH}/include
+# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+
+all: allgather.x allreduce.x reduce_scatter.x
+
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
+
+clean: 
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh
new file mode 100644
index 0000000..e9fc3ae
--- /dev/null
+++ b/nccl/all-gather/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh
new file mode 100644
index 0000000..a94a523
--- /dev/null
+++ b/nccl/all-gather/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh
new file mode 100644
index 0000000..f1ecd9f
--- /dev/null
+++ b/nccl/all-gather/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh
new file mode 100644
index 0000000..357da9e
--- /dev/null
+++ b/nccl/all-gather/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh
new file mode 100644
index 0000000..4bd249d
--- /dev/null
+++ b/nccl/all-gather/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh
new file mode 100644
index 0000000..0e1358b
--- /dev/null
+++ b/nccl/all-reduce/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 4096))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh
new file mode 100644
index 0000000..6553e02
--- /dev/null
+++ b/nccl/all-reduce/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 4096))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh
new file mode 100644
index 0000000..b672e7c
--- /dev/null
+++ b/nccl/all-reduce/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh
new file mode 100644
index 0000000..fc0416c
--- /dev/null
+++ b/nccl/all-reduce/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh
new file mode 100644
index 0000000..d9c0ef6
--- /dev/null
+++ b/nccl/all-reduce/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh
new file mode 100644
index 0000000..fa2199a
--- /dev/null
+++ b/nccl/reduce-scatter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 4096))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh
new file mode 100644
index 0000000..2edffa6
--- /dev/null
+++ b/nccl/reduce-scatter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 4096))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh
new file mode 100644
index 0000000..3d297ff
--- /dev/null
+++ b/nccl/reduce-scatter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh
new file mode 100644
index 0000000..6bbf97a
--- /dev/null
+++ b/nccl/reduce-scatter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh
new file mode 100644
index 0000000..21c0dc4
--- /dev/null
+++ b/nccl/reduce-scatter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/Makefile b/rccl/Makefile
similarity index 57%
rename from Makefile
rename to rccl/Makefile
index 52c0235..590dee7 100644
--- a/Makefile
+++ b/rccl/Makefile
@@ -15,10 +15,16 @@ INC = -I${ROCM_PATH}/include
 CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
 LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
 
-all: allgather.x
+all: allgather.x allreduce.x reduce_scatter.x
 
-allgather.x: allgather.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
 
 clean: 
-	rm -f allgather.x
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 820cf4f..f824072 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) {
     }
 }
 
-void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) {
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
     bfloat16* in = (bfloat16*) invec;
     bfloat16* inout = (bfloat16*) inoutvec;
-    for (int64_t i = 0; i < *len; i++) {
+    for (int i = 0; i < *len; i++) {
         #ifdef USE_CUDA
         inout[i] = __hadd(in[i], inout[i]);
         #elif USE_ROCM

From c7bb21719e15613e832c4cb57bad340a54eef9e6 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 16:57:43 -0700
Subject: [PATCH 29/52] add results so far

---
 nccl/all-gather/benchmarks/16_gpu.txt | 13 +++++++++++++
 nccl/all-gather/benchmarks/8_gpu.txt  | 13 +++++++++++++
 2 files changed, 26 insertions(+)
 create mode 100644 nccl/all-gather/benchmarks/16_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/8_gpu.txt

diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..73e83d9
--- /dev/null
+++ b/nccl/all-gather/benchmarks/16_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 4096
+Number of GPUs: 16
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000532 seconds
+4194304 0.000982 seconds
+8388608 0.001976 seconds
+16777216 0.003447 seconds
+33554432 0.006826 seconds
+67108864 0.013190 seconds
+134217728 0.026196 seconds
+268435456 0.052567 seconds
diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..1c654f3
--- /dev/null
+++ b/nccl/all-gather/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000286 seconds
+4194304 0.000523 seconds
+8388608 0.000954 seconds
+16777216 0.001696 seconds
+33554432 0.003150 seconds
+67108864 0.006500 seconds
+134217728 0.012278 seconds
+268435456 0.024449 seconds

From cb99cadc5f0674137b3225f923061337c2ab8002 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 17:13:57 -0700
Subject: [PATCH 30/52] add results so far

---
 mpi/all-gather/benchmarks/128_gpu.txt  | 12 ++++++++++++
 mpi/all-gather/benchmarks/32_gpu.txt   | 14 ++++++++++++++
 mpi/all-gather/benchmarks/64_gpu.txt   | 13 +++++++++++++
 nccl/all-gather/benchmarks/128_gpu.txt | 13 +++++++++++++
 nccl/all-gather/benchmarks/32_gpu.txt  | 14 ++++++++++++++
 nccl/all-gather/benchmarks/64_gpu.txt  | 13 +++++++++++++
 6 files changed, 79 insertions(+)
 create mode 100644 mpi/all-gather/benchmarks/128_gpu.txt
 create mode 100644 mpi/all-gather/benchmarks/32_gpu.txt
 create mode 100644 mpi/all-gather/benchmarks/64_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/128_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/32_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/64_gpu.txt

diff --git a/mpi/all-gather/benchmarks/128_gpu.txt b/mpi/all-gather/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..3787302
--- /dev/null
+++ b/mpi/all-gather/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 16
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 262144 - 16777216
+Number of iterations: 10
+262144 0.003218 seconds
+524288 0.005101 seconds
+1048576 0.008701 seconds
+2097152 0.015526 seconds
+4194304 0.030239 seconds
+8388608 0.060280 seconds
+16777216 0.189415 seconds
diff --git a/mpi/all-gather/benchmarks/32_gpu.txt b/mpi/all-gather/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..0e15475
--- /dev/null
+++ b/mpi/all-gather/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 64
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 262144 - 67108864
+Number of iterations: 10
+262144 0.000730 seconds
+524288 0.001367 seconds
+1048576 0.002650 seconds
+2097152 0.003740 seconds
+4194304 0.007503 seconds
+8388608 0.014208 seconds
+16777216 0.029923 seconds
+33554432 0.061970 seconds
+67108864 0.168545 seconds
diff --git a/mpi/all-gather/benchmarks/64_gpu.txt b/mpi/all-gather/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..ed700b9
--- /dev/null
+++ b/mpi/all-gather/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.001561 seconds
+524288 0.002915 seconds
+1048576 0.004163 seconds
+2097152 0.007885 seconds
+4194304 0.014989 seconds
+8388608 0.029413 seconds
+16777216 0.063034 seconds
+33554432 0.183096 seconds
diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..c84792c
--- /dev/null
+++ b/nccl/all-gather/benchmarks/128_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 4096
+Number of GPUs: 128
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.002247 seconds
+524288 0.002277 seconds
+1048576 0.002775 seconds
+2097152 0.004497 seconds
+4194304 0.007477 seconds
+8388608 0.015057 seconds
+16777216 0.028550 seconds
+33554432 0.056270 seconds
diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..72f0d07
--- /dev/null
+++ b/nccl/all-gather/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 64
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 262144 - 67108864
+Number of iterations: 10
+262144 0.000622 seconds
+524288 0.000577 seconds
+1048576 0.000780 seconds
+2097152 0.001190 seconds
+4194304 0.002041 seconds
+8388608 0.003571 seconds
+16777216 0.006995 seconds
+33554432 0.013830 seconds
+67108864 0.027698 seconds
diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..db7919c
--- /dev/null
+++ b/nccl/all-gather/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.001077 seconds
+524288 0.001154 seconds
+1048576 0.001399 seconds
+2097152 0.002078 seconds
+4194304 0.003777 seconds
+8388608 0.007711 seconds
+16777216 0.014418 seconds
+33554432 0.028471 seconds

From d2d2bbc6b5b7a079e8af857852904d19196b5230 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 17:56:42 -0700
Subject: [PATCH 31/52] change atoi to strtoll

---
 allreduce.cu      | 4 ++--
 reduce_scatter.cu | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/allreduce.cu b/allreduce.cu
index ddbfb97..7fdf2b9 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -95,8 +95,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int64_t min_msg_size = atoi(argv[2]);
-    int64_t max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index f824072..1853aed 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -95,8 +95,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int64_t min_msg_size = atoi(argv[2]);
-    int64_t max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {

From 74cfdd8c32a705f2a74d82a9ae5ec230060aa317 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sat, 13 Apr 2024 17:59:18 -0700
Subject: [PATCH 32/52] change atoi to strtoll

---
 reduce_scatter.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index f824072..1853aed 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -95,8 +95,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int64_t min_msg_size = atoi(argv[2]);
-    int64_t max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {

From ccb73ae591a813dd10cacc35be6cb6b44bd3ef60 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Sun, 14 Apr 2024 13:04:35 -0700
Subject: [PATCH 33/52] add all perlmutter code and benchmark data

---
 Makefile                                      |  18 --
 README                                        |   9 -
 README.md                                     |  15 +
 allgather.cu                                  | 123 ++++++---
 allreduce.cu                                  | 261 ++++++++++++++++++
 mpi/Makefile                                  |  30 ++
 mpi/all-gather/perlmutter/128_gpu_run.sh      |  37 +++
 mpi/all-gather/perlmutter/16_gpu_run.sh       |  37 +++
 mpi/all-gather/perlmutter/32_gpu_run.sh       |  37 +++
 mpi/all-gather/perlmutter/64_gpu_run.sh       |  37 +++
 mpi/all-gather/perlmutter/8_gpu_run.sh        |  37 +++
 .../perlmutter/benchmarks/128_gpu.txt         |  12 +
 .../perlmutter/benchmarks/16_gpu.txt          |  12 +
 .../perlmutter/benchmarks/32_gpu.txt          |  14 +
 .../perlmutter/benchmarks/64_gpu.txt          |  13 +
 .../perlmutter/benchmarks/8_gpu.txt           |  13 +
 mpi/all-reduce/perlmutter/128_gpu_run.sh      |  37 +++
 mpi/all-reduce/perlmutter/16_gpu_run.sh       |  37 +++
 mpi/all-reduce/perlmutter/32_gpu_run.sh       |  37 +++
 mpi/all-reduce/perlmutter/64_gpu_run.sh       |  37 +++
 mpi/all-reduce/perlmutter/8_gpu_run.sh        |  37 +++
 .../perlmutter/benchmarks/128_gpu.txt         |  11 +
 .../perlmutter/benchmarks/16_gpu.txt          |  11 +
 .../perlmutter/benchmarks/32_gpu.txt          |  13 +
 .../perlmutter/benchmarks/64_gpu.txt          |  12 +
 .../perlmutter/benchmarks/8_gpu.txt           |  12 +
 mpi/reduce-scatter/perlmutter/128_gpu_run.sh  |  37 +++
 mpi/reduce-scatter/perlmutter/16_gpu_run.sh   |  37 +++
 mpi/reduce-scatter/perlmutter/32_gpu_run.sh   |  37 +++
 mpi/reduce-scatter/perlmutter/64_gpu_run.sh   |  37 +++
 mpi/reduce-scatter/perlmutter/8_gpu_run.sh    |  37 +++
 .../perlmutter/benchmarks/128_gpu.txt         |  12 +
 .../perlmutter/benchmarks/16_gpu.txt          |  12 +
 .../perlmutter/benchmarks/32_gpu.txt          |  14 +
 .../perlmutter/benchmarks/64_gpu.txt          |  13 +
 .../perlmutter/benchmarks/8_gpu.txt           |  13 +
 nccl/Makefile                                 |  30 ++
 nccl/all-gather/128_gpu_run.sh                |  37 +++
 nccl/all-gather/16_gpu_run.sh                 |  37 +++
 nccl/all-gather/32_gpu_run.sh                 |  37 +++
 nccl/all-gather/64_gpu_run.sh                 |  37 +++
 nccl/all-gather/8_gpu_run.sh                  |  37 +++
 nccl/all-gather/benchmarks/128_gpu.txt        |  13 +
 nccl/all-gather/benchmarks/16_gpu.txt         |  13 +
 nccl/all-gather/benchmarks/32_gpu.txt         |  14 +
 nccl/all-gather/benchmarks/64_gpu.txt         |  13 +
 nccl/all-gather/benchmarks/8_gpu.txt          |  13 +
 nccl/all-reduce/128_gpu_run.sh                |  37 +++
 nccl/all-reduce/16_gpu_run.sh                 |  37 +++
 nccl/all-reduce/32_gpu_run.sh                 |  37 +++
 nccl/all-reduce/64_gpu_run.sh                 |  37 +++
 nccl/all-reduce/8_gpu_run.sh                  |  37 +++
 nccl/all-reduce/benchmarks/128_gpu.txt        |  12 +
 nccl/all-reduce/benchmarks/16_gpu.txt         |  12 +
 nccl/all-reduce/benchmarks/32_gpu.txt         |  14 +
 nccl/all-reduce/benchmarks/64_gpu.txt         |  13 +
 nccl/all-reduce/benchmarks/8_gpu.txt          |  13 +
 nccl/reduce-scatter/128_gpu_run.sh            |  37 +++
 nccl/reduce-scatter/16_gpu_run.sh             |  37 +++
 nccl/reduce-scatter/32_gpu_run.sh             |  37 +++
 nccl/reduce-scatter/64_gpu_run.sh             |  37 +++
 nccl/reduce-scatter/8_gpu_run.sh              |  37 +++
 nccl/reduce-scatter/benchmarks/128_gpu.txt    |  12 +
 nccl/reduce-scatter/benchmarks/16_gpu.txt     |  12 +
 nccl/reduce-scatter/benchmarks/32_gpu.txt     |  14 +
 nccl/reduce-scatter/benchmarks/64_gpu.txt     |  13 +
 nccl/reduce-scatter/benchmarks/8_gpu.txt      |  13 +
 67 files changed, 1908 insertions(+), 69 deletions(-)
 delete mode 100644 Makefile
 delete mode 100644 README
 create mode 100644 README.md
 create mode 100644 allreduce.cu
 create mode 100644 mpi/Makefile
 create mode 100644 mpi/all-gather/perlmutter/128_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/16_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/32_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/64_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/8_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/128_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/16_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/32_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/64_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/8_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/128_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/16_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/32_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/64_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/8_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 nccl/Makefile
 create mode 100644 nccl/all-gather/128_gpu_run.sh
 create mode 100644 nccl/all-gather/16_gpu_run.sh
 create mode 100644 nccl/all-gather/32_gpu_run.sh
 create mode 100644 nccl/all-gather/64_gpu_run.sh
 create mode 100644 nccl/all-gather/8_gpu_run.sh
 create mode 100644 nccl/all-gather/benchmarks/128_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/16_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/32_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/64_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/8_gpu.txt
 create mode 100644 nccl/all-reduce/128_gpu_run.sh
 create mode 100644 nccl/all-reduce/16_gpu_run.sh
 create mode 100644 nccl/all-reduce/32_gpu_run.sh
 create mode 100644 nccl/all-reduce/64_gpu_run.sh
 create mode 100644 nccl/all-reduce/8_gpu_run.sh
 create mode 100644 nccl/all-reduce/benchmarks/128_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/16_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/8_gpu.txt
 create mode 100644 nccl/reduce-scatter/128_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/16_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/32_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/64_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/8_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt

diff --git a/Makefile b/Makefile
deleted file mode 100644
index df453b4..0000000
--- a/Makefile
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
-# See the top-level LICENSE file for details.
-# 
-# SPDX-License-Identifier: MIT
-
-CC	= cc
-INC	= -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS	= -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-LDFLAGS	= -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
-
-all: allgather.x
-
-allgather.x: allgather.cu 
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu
-
-clean: 
-	rm -f allgather.x 
diff --git a/README b/README
deleted file mode 100644
index eba2046..0000000
--- a/README
+++ /dev/null
@@ -1,9 +0,0 @@
-Before compiling do these:
-
-module load PrgEnv-cray cudatoolkit craype-accel-nvidia80
-export CRAY_ACCEL_TARGET=nvidia80
-
-When running do these:
-
-module load cudatoolkit
-export MPICH_GPU_SUPPORT_ENABLED=1
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a1fdcdb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+Before compiling do these:
+
+### Perlmutter
+```sh
+module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl/2.19.4
+export CRAY_ACCEL_TARGET=nvidia80
+export MPICH_GPU_SUPPORT_ENABLED=1
+```
+### Frontier
+```sh
+module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05
+export MPICH_GPU_SUPPORT_ENABLED=1
+export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+```
+
diff --git a/allgather.cu b/allgather.cu
index 5953041..8ae7481 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -8,24 +8,30 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <mpi.h>
+#include <stdint.h>
 
 #ifdef USE_CUDA
-  #include <cuda_runtime.h>
   #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
 #endif
 
 #ifdef USE_NCCL
   #include "nccl.h"
-#elif defined(USE_RCCL)
-  #include "rccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
 #endif
 
 #define NUM_WARMUP_ITERATIONS		5
 
 #define MPI_CHECK(cmd) do {                         \
-  int e = cmd;                                      \
+  int64_t e = cmd;                                      \
   if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%d'\n",        \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
         __FILE__,__LINE__, e);                      \
     exit(EXIT_FAILURE);                             \
   }                                                 \
@@ -40,6 +46,16 @@
   }                                                 \
 } while(0)
 
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
 #define NCCL_CHECK(cmd) do {                        \
   ncclResult_t e = cmd;                             \
   if (e != ncclSuccess) {                           \
@@ -49,9 +65,14 @@
   }                                                 \
 } while(0)
 
-void initializeData(nv_bfloat16 *data, int size) {
-    for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
+void initializeData(bfloat16 *data, int64_t size) {
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
+        #ifdef USE_CUDA
         data[i] = __float2bfloat16((float)i);
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
     }
 }
 
@@ -62,8 +83,8 @@ int main(int argc, char *argv[]) {
     }
 
     int num_gpus = atoi(argv[1]);
-    int min_msg_size = atoi(argv[2]);
-    int max_msg_size = atoi(argv[3]);
+    int64_t min_msg_size = atoi(argv[2]);
+    int64_t max_msg_size = atoi(argv[3]);
     int iterations = atoi(argv[4]);
 
     if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
@@ -86,33 +107,49 @@ int main(int argc, char *argv[]) {
     }
 
     // Initialize GPU context
+    #if USE_CUDA
     cudaGetDeviceCount(&num_gpus_per_node);
     cudaSetDevice((my_rank % num_gpus_per_node));
+    #elif USE_ROCM
+    hipGetDeviceCount(&num_gpus_per_node);
+    hipSetDevice((my_rank % num_gpus_per_node));
+    #endif
 
-    int local_data_size = max_msg_size; // Size of local data
-    int global_data_size = local_data_size * num_gpus; // Size of global data
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size * num_gpus; // Size of global data
 
-    nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
-    nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
+
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
 
     // Initialize local data
     initializeData(local_data, local_data_size);
 
     // Allocate memory on GPU
-    nv_bfloat16 *d_local_data, *d_global_data;
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
     CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-
     // Copy local data to GPU
     CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
 
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
     #ifdef USE_MPI
     // create 2-byte datatype (send raw, un-interpreted bytes)
     MPI_Datatype mpi_type_bfloat16;
     MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
     MPI_Type_commit(&mpi_type_bfloat16);
 
-    #elif USE_NCCL
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
     ncclUniqueId nccl_comm_id;
     ncclComm_t nccl_comm;
 
@@ -125,13 +162,8 @@ int main(int argc, char *argv[]) {
     MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
                         0, MPI_COMM_WORLD));
 
-    /* Create a new NCCL communicator */
+    /* Create a new NCCL/RCCL communicator */
     NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-
-    #elif defined(USE_RCCL)
-    // TODO: fix later
-    rcclComm_t rccl_comm;
-    rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
     #endif
 
     // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
@@ -142,13 +174,13 @@ int main(int argc, char *argv[]) {
     // Print benchmark results
     if (my_rank == 0) {
         printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
         printf("Number of iterations: %d\n", iterations);
     }
     fflush(NULL);
 
-    for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(nv_bfloat16);
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(bfloat16);
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI
@@ -156,12 +188,14 @@ int main(int argc, char *argv[]) {
 		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
                 
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-	    // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        
+            #ifdef USE_CUDA
+            CUDA_CHECK(cudaDeviceSynchronize()); // also reports errors from earlier asynchronous GPU work
+            #elif USE_ROCM
+            HIP_CHECK(hipDeviceSynchronize());
             #endif
         }
 
@@ -172,34 +206,39 @@ int main(int argc, char *argv[]) {
         start_time = MPI_Wtime();
 	for (int i = 0; i < iterations; ++i) {
             #ifdef USE_MPI
-            MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
-                d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
-
+	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
+                
             MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL)
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
             NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
-	    cudaDeviceSynchronize();
-            #elif defined(USE_RCCL)
-            // TODO: fix later
-            rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
+            #endif
+        
+            #ifdef USE_CUDA
+            CUDA_CHECK(cudaDeviceSynchronize()); // also reports errors from earlier asynchronous GPU work
+            #elif USE_ROCM
+            HIP_CHECK(hipDeviceSynchronize());
             #endif
         }
         MPI_Barrier(MPI_COMM_WORLD);
         total_time = MPI_Wtime() - start_time;
 	if (my_rank == 0)
-	    printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
     }
 
     // Cleanup
     free(local_data);
     free(global_data);
+    #ifdef USE_CUDA
     CUDA_CHECK(cudaFree(d_local_data));
     CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
 
-    #ifdef USE_NCCL
+    #if defined(USE_NCCL) || defined(USE_RCCL)
     ncclCommDestroy(nccl_comm);
-    #elif defined(USE_RCCL)
-    rcclCommDestroy(rccl_comm);
     #endif
 
     MPI_Finalize();
diff --git a/allreduce.cu b/allreduce.cu
new file mode 100644
index 0000000..7fdf2b9
--- /dev/null
+++ b/allreduce.cu
@@ -0,0 +1,261 @@
+/* \file allreduce.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <stdint.h>
+
+#ifdef USE_CUDA
+  #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do {                         \
+  int64_t e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
+#define NCCL_CHECK(cmd) do {                        \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(bfloat16 *data, int64_t size) { // size is a byte count, not an element count
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { // one iteration per bfloat16 element
+        #ifdef USE_CUDA
+        data[i] = __float2bfloat16((float)i); // element i holds its own index converted to bfloat16
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
+    }
+}
+
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { // reduction callback passed to MPI_Op_create in main
+    bfloat16* in = (bfloat16*) invec;
+    bfloat16* inout = (bfloat16*) inoutvec;
+    for (int i = 0; i < *len; i++) { // *len is the element count; datatype is not consulted
+        #ifdef USE_CUDA
+        inout[i] = __hadd(in[i], inout[i]); // element-wise sum, accumulated in place into inoutvec
+        #elif USE_ROCM
+        inout[i] = in[i] + inout[i];
+        #endif
+    }
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Initialize GPU context
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaGetDeviceCount(&num_gpus_per_node)); // abort here rather than use an unset count in the modulo below
+    CUDA_CHECK(cudaSetDevice((my_rank % num_gpus_per_node))); // map ranks onto the visible devices round-robin
+    #elif USE_ROCM
+    HIP_CHECK(hipGetDeviceCount(&num_gpus_per_node));
+    HIP_CHECK(hipSetDevice((my_rank % num_gpus_per_node)));
+    #endif
+
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size; // Size of global data 
+    
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
+
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    // define custom reduce operation for nv_bfloat16 types
+    MPI_Op CUSTOM_SUM;
+    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
+
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        ncclGetUniqueId(&nccl_comm_id);
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL/RCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+    #endif
+
+    // Perform MPI_Iallreduce, NCCL allreduce, or RCCL allreduce
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark results
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(bfloat16);
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            
+            #ifdef USE_CUDA
+            CUDA_CHECK(cudaDeviceSynchronize()); // also reports errors from earlier asynchronous GPU work
+            #elif USE_ROCM
+            HIP_CHECK(hipDeviceSynchronize());
+            #endif
+        }
+
+	if(msg_size >= 8388608)
+	    iterations = 20;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            
+            #ifdef USE_CUDA
+            CUDA_CHECK(cudaDeviceSynchronize()); // also reports errors from earlier asynchronous GPU work
+            #elif USE_ROCM
+            HIP_CHECK(hipDeviceSynchronize());
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+	if (my_rank == 0)
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
+
+    #if defined(USE_NCCL) || defined(USE_RCCL)
+    NCCL_CHECK(ncclCommDestroy(nccl_comm)); // release the communicator created by ncclCommInitRank
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
diff --git a/mpi/Makefile b/mpi/Makefile
new file mode 100644
index 0000000..782a6bf
--- /dev/null
+++ b/mpi/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC = cc
+
+# perlmutter flags
+INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags
+# INC = -I${ROCM_PATH}/include
+# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+
+all: allgather.x allreduce.x reduce_scatter.x
+
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
+
+clean: 
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh
new file mode 100644
index 0000000..710a399
--- /dev/null
+++ b/mpi/all-gather/perlmutter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 16))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh
new file mode 100644
index 0000000..d4d984e
--- /dev/null
+++ b/mpi/all-gather/perlmutter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 128))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh
new file mode 100644
index 0000000..d2f1b0d
--- /dev/null
+++ b/mpi/all-gather/perlmutter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh
new file mode 100644
index 0000000..515d667
--- /dev/null
+++ b/mpi/all-gather/perlmutter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh
new file mode 100644
index 0000000..210ea3d
--- /dev/null
+++ b/mpi/all-gather/perlmutter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..3787302
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 16
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 262144 - 16777216
+Number of iterations: 10
+262144 0.003218 seconds
+524288 0.005101 seconds
+1048576 0.008701 seconds
+2097152 0.015526 seconds
+4194304 0.030239 seconds
+8388608 0.060280 seconds
+16777216 0.189415 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..b69654b
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 128
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 2097152 - 134217728
+Number of iterations: 10
+2097152 0.002391 seconds
+4194304 0.003558 seconds
+8388608 0.007162 seconds
+16777216 0.014929 seconds
+33554432 0.030427 seconds
+67108864 0.062092 seconds
+134217728 0.151508 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..0e15475
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 64
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 262144 - 67108864
+Number of iterations: 10
+262144 0.000730 seconds
+524288 0.001367 seconds
+1048576 0.002650 seconds
+2097152 0.003740 seconds
+4194304 0.007503 seconds
+8388608 0.014208 seconds
+16777216 0.029923 seconds
+33554432 0.061970 seconds
+67108864 0.168545 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..ed700b9
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.001561 seconds
+524288 0.002915 seconds
+1048576 0.004163 seconds
+2097152 0.007885 seconds
+4194304 0.014989 seconds
+8388608 0.029413 seconds
+16777216 0.063034 seconds
+33554432 0.183096 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..de3a837
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000838 seconds
+4194304 0.001719 seconds
+8388608 0.003172 seconds
+16777216 0.006797 seconds
+33554432 0.013860 seconds
+67108864 0.027938 seconds
+134217728 0.055353 seconds
+268435456 0.104310 seconds
diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh
new file mode 100644
index 0000000..33729eb
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 20:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh
new file mode 100644
index 0000000..dc30279
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 15:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh
new file mode 100644
index 0000000..be73564
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 20:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh
new file mode 100644
index 0000000..cf714da
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 20:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh
new file mode 100644
index 0000000..49ff135
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/8_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the MPI all-reduce benchmark (allreduce.x) on 2 nodes
+# with 4 tasks per node (8 ranks); stdout and stderr go to benchmarks/8_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 15:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..4e3e17d
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,11 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 128
+Message size range: 33554432 - 1073741824
+Number of iterations: 10
+33554432 0.264543 seconds
+67108864 0.527909 seconds
+134217728 1.092095 seconds
+268435456 3.194094 seconds
+536870912 6.415718 seconds
+1073741824 12.819154 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..b377ec2
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,11 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 16
+Message size range: 33554432 - 1073741824
+Number of iterations: 10
+33554432 0.142677 seconds
+67108864 0.324897 seconds
+134217728 0.673650 seconds
+268435456 2.140369 seconds
+536870912 4.318430 seconds
+1073741824 8.632880 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..cda53bf
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 32
+Message size range: 8388608 - 1073741824
+Number of iterations: 10
+8388608 0.049975 seconds
+16777216 0.092395 seconds
+33554432 0.181888 seconds
+67108864 0.368241 seconds
+134217728 0.774021 seconds
+268435456 2.362729 seconds
+536870912 4.760279 seconds
+1073741824 9.524390 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..341fc93
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 64
+Message size range: 16777216 - 1073741824
+Number of iterations: 10
+16777216 0.111867 seconds
+33554432 0.230462 seconds
+67108864 0.465838 seconds
+134217728 0.970915 seconds
+268435456 2.875694 seconds
+536870912 5.771569 seconds
+1073741824 11.522959 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..05fd1e8
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 8
+Message size range: 16777216 - 1073741824
+Number of iterations: 10
+16777216 0.058292 seconds
+33554432 0.107128 seconds
+67108864 0.211506 seconds
+134217728 0.491929 seconds
+268435456 1.508757 seconds
+536870912 3.052047 seconds
+1073741824 6.103450 seconds
diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
new file mode 100644
index 0000000..469aeaf
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the MPI reduce-scatter benchmark (reduce_scatter.x) on 32 nodes
+# with 4 tasks per node (128 ranks); stdout and stderr go to benchmarks/128_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
new file mode 100644
index 0000000..e66b9f4
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the MPI reduce-scatter benchmark (reduce_scatter.x) on 4 nodes
+# with 4 tasks per node (16 ranks); stdout and stderr go to benchmarks/16_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
new file mode 100644
index 0000000..07d6020
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the MPI reduce-scatter benchmark (reduce_scatter.x) on 8 nodes
+# with 4 tasks per node (32 ranks); stdout and stderr go to benchmarks/32_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 30:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
new file mode 100644
index 0000000..e51945a
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the MPI reduce-scatter benchmark (reduce_scatter.x) on 16 nodes
+# with 4 tasks per node (64 ranks); stdout and stderr go to benchmarks/64_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
new file mode 100644
index 0000000..1b51537
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the MPI reduce-scatter benchmark (reduce_scatter.x) on 2 nodes
+# with 4 tasks per node (8 ranks); stdout and stderr go to benchmarks/8_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 30:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..d696072
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 3.352414 seconds
+67108864 3.323000 seconds
+134217728 3.331817 seconds
+268435456 3.327162 seconds
+536870912 3.345694 seconds
+1073741824 3.326455 seconds
+2147483648 3.321790 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..b71477d
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 3.368300 seconds
+67108864 3.361940 seconds
+134217728 3.367816 seconds
+268435456 3.360722 seconds
+536870912 3.363088 seconds
+1073741824 3.392373 seconds
+2147483648 3.375325 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..38e09b1
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 3.368554 seconds
+16777216 3.367485 seconds
+33554432 3.376475 seconds
+67108864 3.381592 seconds
+134217728 3.384111 seconds
+268435456 3.375780 seconds
+536870912 3.371542 seconds
+1073741824 3.379895 seconds
+2147483648 3.381470 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..d982100
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 2.220629 seconds
+33554432 2.201147 seconds
+67108864 2.196879 seconds
+134217728 2.199449 seconds
+268435456 2.194973 seconds
+536870912 2.196809 seconds
+1073741824 2.196212 seconds
+2147483648 2.201029 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..d2bdd9a
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 3.558431 seconds
+33554432 3.553477 seconds
+67108864 3.562137 seconds
+134217728 3.556267 seconds
+268435456 3.551567 seconds
+536870912 3.599067 seconds
+1073741824 3.608635 seconds
+2147483648 3.624090 seconds
diff --git a/nccl/Makefile b/nccl/Makefile
new file mode 100644
index 0000000..5652112
--- /dev/null
+++ b/nccl/Makefile
@@ -0,0 +1,39 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: MIT
+
+# Builds the NCCL variants (-DUSE_NCCL) of the collective benchmarks from the
+# sources in the parent directory; each binary is written into its own subdirectory.
+
+CC = cc
+
+# perlmutter flags
+INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags
+# INC = -I${ROCM_PATH}/include
+# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+
+# None of the targets below creates a file with the target's own name (the
+# binaries land in subdirectories), so declare them phony.
+.PHONY: all clean allgather.x allreduce.x reduce_scatter.x
+
+all: allgather.x allreduce.x reduce_scatter.x
+
+# ${LDFLAGS} goes last: the linker resolves -l libraries against the objects
+# that precede them on the command line.
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} -o all-gather/allgather.x $< ${LDFLAGS}
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} -o all-reduce/allreduce.x $< ${LDFLAGS}
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} -o reduce-scatter/reduce_scatter.x $< ${LDFLAGS}
+
+clean:
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh
new file mode 100644
index 0000000..e9fc3ae
--- /dev/null
+++ b/nccl/all-gather/128_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-gather benchmark (allgather.x) on 32 nodes
+# with 4 tasks per node (128 ranks); stdout and stderr go to benchmarks/128_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh
new file mode 100644
index 0000000..a94a523
--- /dev/null
+++ b/nccl/all-gather/16_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-gather benchmark (allgather.x) on 4 nodes
+# with 4 tasks per node (16 ranks); stdout and stderr go to benchmarks/16_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh
new file mode 100644
index 0000000..f1ecd9f
--- /dev/null
+++ b/nccl/all-gather/32_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-gather benchmark (allgather.x) on 8 nodes
+# with 4 tasks per node (32 ranks); stdout and stderr go to benchmarks/32_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh
new file mode 100644
index 0000000..357da9e
--- /dev/null
+++ b/nccl/all-gather/64_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-gather benchmark (allgather.x) on 16 nodes
+# with 4 tasks per node (64 ranks); stdout and stderr go to benchmarks/64_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh
new file mode 100644
index 0000000..4bd249d
--- /dev/null
+++ b/nccl/all-gather/8_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-gather benchmark (allgather.x) on 2 nodes
+# with 4 tasks per node (8 ranks); stdout and stderr go to benchmarks/8_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..c84792c
--- /dev/null
+++ b/nccl/all-gather/benchmarks/128_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 4096
+Number of GPUs: 128
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.002247 seconds
+524288 0.002277 seconds
+1048576 0.002775 seconds
+2097152 0.004497 seconds
+4194304 0.007477 seconds
+8388608 0.015057 seconds
+16777216 0.028550 seconds
+33554432 0.056270 seconds
diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..73e83d9
--- /dev/null
+++ b/nccl/all-gather/benchmarks/16_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 4096
+Number of GPUs: 16
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000532 seconds
+4194304 0.000982 seconds
+8388608 0.001976 seconds
+16777216 0.003447 seconds
+33554432 0.006826 seconds
+67108864 0.013190 seconds
+134217728 0.026196 seconds
+268435456 0.052567 seconds
diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..72f0d07
--- /dev/null
+++ b/nccl/all-gather/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 64
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 262144 - 67108864
+Number of iterations: 10
+262144 0.000622 seconds
+524288 0.000577 seconds
+1048576 0.000780 seconds
+2097152 0.001190 seconds
+4194304 0.002041 seconds
+8388608 0.003571 seconds
+16777216 0.006995 seconds
+33554432 0.013830 seconds
+67108864 0.027698 seconds
diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..db7919c
--- /dev/null
+++ b/nccl/all-gather/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.001077 seconds
+524288 0.001154 seconds
+1048576 0.001399 seconds
+2097152 0.002078 seconds
+4194304 0.003777 seconds
+8388608 0.007711 seconds
+16777216 0.014418 seconds
+33554432 0.028471 seconds
diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..1c654f3
--- /dev/null
+++ b/nccl/all-gather/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000286 seconds
+4194304 0.000523 seconds
+8388608 0.000954 seconds
+16777216 0.001696 seconds
+33554432 0.003150 seconds
+67108864 0.006500 seconds
+134217728 0.012278 seconds
+268435456 0.024449 seconds
diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh
new file mode 100644
index 0000000..623f0c2
--- /dev/null
+++ b/nccl/all-reduce/128_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-reduce benchmark (allreduce.x) on 32 nodes
+# with 4 tasks per node (128 ranks); stdout and stderr go to benchmarks/128_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh
new file mode 100644
index 0000000..af689e9
--- /dev/null
+++ b/nccl/all-reduce/16_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-reduce benchmark (allreduce.x) on 4 nodes
+# with 4 tasks per node (16 ranks); stdout and stderr go to benchmarks/16_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh
new file mode 100644
index 0000000..b672e7c
--- /dev/null
+++ b/nccl/all-reduce/32_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-reduce benchmark (allreduce.x) on 8 nodes
+# with 4 tasks per node (32 ranks); stdout and stderr go to benchmarks/32_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh
new file mode 100644
index 0000000..fc0416c
--- /dev/null
+++ b/nccl/all-reduce/64_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-reduce benchmark (allreduce.x) on 16 nodes
+# with 4 tasks per node (64 ranks); stdout and stderr go to benchmarks/64_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh
new file mode 100644
index 0000000..d9c0ef6
--- /dev/null
+++ b/nccl/all-reduce/8_gpu_run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# SLURM batch job: runs the NCCL all-reduce benchmark (allreduce.x) on 2 nodes
+# with 4 tasks per node (8 ranks); stdout and stderr go to benchmarks/8_gpu.txt.
+# Arguments (per the benchmark's log header): GPU count, min/max message size, iterations.
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt"
+
+# Log the exact command line, then execute it. eval is needed so that the ">&"
+# stored inside run_cmd is treated as a redirection.
+echo "$run_cmd"
+eval "$run_cmd"
+# Finish with srun's exit status; a trailing no-op would make the job exit 0.
+exit $?
diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..c8bc5f3
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.002252 seconds
+67108864 0.003958 seconds
+134217728 0.005696 seconds
+268435456 0.008861 seconds
+536870912 0.016701 seconds
+1073741824 0.035052 seconds
+2147483648 0.069582 seconds
diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..8199a8f
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.000971 seconds
+67108864 0.001813 seconds
+134217728 0.003415 seconds
+268435456 0.007049 seconds
+536870912 0.013323 seconds
+1073741824 0.026322 seconds
+2147483648 0.052252 seconds
diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..fa6e736
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.000589 seconds
+16777216 0.001015 seconds
+33554432 0.001352 seconds
+67108864 0.002146 seconds
+134217728 0.003621 seconds
+268435456 0.006997 seconds
+536870912 0.013742 seconds
+1073741824 0.027021 seconds
+2147483648 0.054364 seconds
diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..a773bf1
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.001196 seconds
+33554432 0.001740 seconds
+67108864 0.002970 seconds
+134217728 0.004544 seconds
+268435456 0.008213 seconds
+536870912 0.017505 seconds
+1073741824 0.035188 seconds
+2147483648 0.069951 seconds
diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..4d60f0f
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.000511 seconds
+33554432 0.000916 seconds
+67108864 0.001663 seconds
+134217728 0.003137 seconds
+268435456 0.006408 seconds
+536870912 0.012493 seconds
+1073741824 0.024300 seconds
+2147483648 0.048155 seconds
diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh
new file mode 100644
index 0000000..8590821
--- /dev/null
+++ b/nccl/reduce-scatter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh
new file mode 100644
index 0000000..7a20fa6
--- /dev/null
+++ b/nccl/reduce-scatter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh
new file mode 100644
index 0000000..3d297ff
--- /dev/null
+++ b/nccl/reduce-scatter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh
new file mode 100644
index 0000000..6bbf97a
--- /dev/null
+++ b/nccl/reduce-scatter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh
new file mode 100644
index 0000000..21c0dc4
--- /dev/null
+++ b/nccl/reduce-scatter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt"
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..7c1c8f9
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.028300 seconds
+67108864 0.028351 seconds
+134217728 0.028351 seconds
+268435456 0.028502 seconds
+536870912 0.028579 seconds
+1073741824 0.028650 seconds
+2147483648 0.028506 seconds
diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..14acf87
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.033170 seconds
+67108864 0.033280 seconds
+134217728 0.033220 seconds
+268435456 0.033291 seconds
+536870912 0.033217 seconds
+1073741824 0.033158 seconds
+2147483648 0.033275 seconds
diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..7eecc67
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.027121 seconds
+16777216 0.027661 seconds
+33554432 0.027766 seconds
+67108864 0.027992 seconds
+134217728 0.027914 seconds
+268435456 0.027912 seconds
+536870912 0.027777 seconds
+1073741824 0.027861 seconds
+2147483648 0.027551 seconds
diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..8f8ddd0
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.028306 seconds
+33554432 0.028511 seconds
+67108864 0.028175 seconds
+134217728 0.027998 seconds
+268435456 0.027883 seconds
+536870912 0.027802 seconds
+1073741824 0.027954 seconds
+2147483648 0.028085 seconds
diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..26c22b6
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.024231 seconds
+33554432 0.024389 seconds
+67108864 0.024167 seconds
+134217728 0.024047 seconds
+268435456 0.024293 seconds
+536870912 0.024031 seconds
+1073741824 0.024048 seconds
+2147483648 0.024241 seconds

From fd73957a5b1685a72da8dd5433583f11c1d8e7e2 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Sun, 14 Apr 2024 16:13:32 -0400
Subject: [PATCH 34/52] add frontier code and benchmark results so far

---
 README.md                                     |   6 ++-
 allgather.cu                                  |   1 +
 allreduce.cu                                  |   1 +
 mpi/Makefile                                  |  12 +++---
 mpi/all-gather/128_gpu_run.sh                 |  37 ------------------
 mpi/all-gather/16_gpu_run.sh                  |  37 ------------------
 mpi/all-gather/32_gpu_run.sh                  |  37 ------------------
 mpi/all-gather/64_gpu_run.sh                  |  37 ------------------
 mpi/all-gather/8_gpu_run.sh                   |  37 ------------------
 mpi/all-gather/allgather.x                    | Bin 0 -> 25696 bytes
 mpi/all-gather/benchmarks/128_gpu.txt         |  12 ------
 mpi/all-gather/benchmarks/16_gpu.txt          |  12 ------
 mpi/all-gather/benchmarks/32_gpu.txt          |  14 -------
 mpi/all-gather/benchmarks/64_gpu.txt          |  13 ------
 mpi/all-gather/benchmarks/8_gpu.txt           |  13 ------
 mpi/all-gather/frontier/128_gcd_run.sh        |  21 ++++++++++
 mpi/all-gather/frontier/16_gcd_run.sh         |  21 ++++++++++
 mpi/all-gather/frontier/32_gcd_run.sh         |  21 ++++++++++
 mpi/all-gather/frontier/64_gcd_run.sh         |  21 ++++++++++
 mpi/all-gather/frontier/8_gcd_run.sh          |  21 ++++++++++
 mpi/all-gather/frontier/benchmarks/16_gcd.txt |  13 ++++++
 mpi/all-gather/frontier/benchmarks/32_gcd.txt |  15 +++++++
 mpi/all-gather/frontier/benchmarks/8_gcd.txt  |  14 +++++++
 mpi/all-reduce/128_gpu_run.sh                 |  37 ------------------
 mpi/all-reduce/16_gpu_run.sh                  |  37 ------------------
 mpi/all-reduce/32_gpu_run.sh                  |  37 ------------------
 mpi/all-reduce/64_gpu_run.sh                  |  37 ------------------
 mpi/all-reduce/allreduce.x                    | Bin 0 -> 25832 bytes
 mpi/all-reduce/frontier/8_gcd_run.sh          |  21 ++++++++++
 mpi/all-reduce/frontier/benchmarks/8_gcd.txt  |  13 ++++++
 mpi/reduce-scatter/128_gpu_run.sh             |  37 ------------------
 mpi/reduce-scatter/16_gpu_run.sh              |  37 ------------------
 mpi/reduce-scatter/32_gpu_run.sh              |  37 ------------------
 mpi/reduce-scatter/64_gpu_run.sh              |  37 ------------------
 mpi/reduce-scatter/8_gpu_run.sh               |  37 ------------------
 mpi/reduce-scatter/frontier/8_gcd_run.sh      |  21 ++++++++++
 .../frontier/benchmarks/8_gcd.txt             |  14 +++++++
 mpi/reduce-scatter/reduce_scatter.x           | Bin 0 -> 25888 bytes
 nccl/Makefile                                 |  30 --------------
 nccl/all-gather/128_gpu_run.sh                |  37 ------------------
 nccl/all-gather/16_gpu_run.sh                 |  37 ------------------
 nccl/all-gather/32_gpu_run.sh                 |  37 ------------------
 nccl/all-gather/64_gpu_run.sh                 |  37 ------------------
 nccl/all-gather/8_gpu_run.sh                  |  37 ------------------
 nccl/all-gather/benchmarks/128_gpu.txt        |  13 ------
 nccl/all-gather/benchmarks/16_gpu.txt         |  13 ------
 nccl/all-gather/benchmarks/32_gpu.txt         |  14 -------
 nccl/all-gather/benchmarks/64_gpu.txt         |  13 ------
 nccl/all-gather/benchmarks/8_gpu.txt          |  13 ------
 nccl/all-reduce/128_gpu_run.sh                |  37 ------------------
 nccl/all-reduce/16_gpu_run.sh                 |  37 ------------------
 nccl/all-reduce/32_gpu_run.sh                 |  37 ------------------
 nccl/all-reduce/64_gpu_run.sh                 |  37 ------------------
 nccl/all-reduce/8_gpu_run.sh                  |  37 ------------------
 nccl/reduce-scatter/128_gpu_run.sh            |  37 ------------------
 nccl/reduce-scatter/16_gpu_run.sh             |  37 ------------------
 nccl/reduce-scatter/32_gpu_run.sh             |  37 ------------------
 nccl/reduce-scatter/64_gpu_run.sh             |  37 ------------------
 nccl/reduce-scatter/8_gpu_run.sh              |  37 ------------------
 rccl/all-gather/allgather.x                   | Bin 0 -> 25736 bytes
 rccl/all-reduce/allreduce.x                   | Bin 0 -> 25840 bytes
 rccl/reduce-scatter/reduce_scatter.x          | Bin 0 -> 25848 bytes
 reduce_scatter.cu                             |   1 +
 63 files changed, 229 insertions(+), 1241 deletions(-)
 delete mode 100644 mpi/all-gather/128_gpu_run.sh
 delete mode 100644 mpi/all-gather/16_gpu_run.sh
 delete mode 100644 mpi/all-gather/32_gpu_run.sh
 delete mode 100644 mpi/all-gather/64_gpu_run.sh
 delete mode 100644 mpi/all-gather/8_gpu_run.sh
 create mode 100755 mpi/all-gather/allgather.x
 delete mode 100644 mpi/all-gather/benchmarks/128_gpu.txt
 delete mode 100644 mpi/all-gather/benchmarks/16_gpu.txt
 delete mode 100644 mpi/all-gather/benchmarks/32_gpu.txt
 delete mode 100644 mpi/all-gather/benchmarks/64_gpu.txt
 delete mode 100644 mpi/all-gather/benchmarks/8_gpu.txt
 create mode 100644 mpi/all-gather/frontier/128_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/16_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/32_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/64_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/8_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/benchmarks/16_gcd.txt
 create mode 100644 mpi/all-gather/frontier/benchmarks/32_gcd.txt
 create mode 100644 mpi/all-gather/frontier/benchmarks/8_gcd.txt
 delete mode 100644 mpi/all-reduce/128_gpu_run.sh
 delete mode 100644 mpi/all-reduce/16_gpu_run.sh
 delete mode 100644 mpi/all-reduce/32_gpu_run.sh
 delete mode 100644 mpi/all-reduce/64_gpu_run.sh
 create mode 100755 mpi/all-reduce/allreduce.x
 create mode 100644 mpi/all-reduce/frontier/8_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/benchmarks/8_gcd.txt
 delete mode 100644 mpi/reduce-scatter/128_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/16_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/32_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/64_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/8_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/8_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
 create mode 100755 mpi/reduce-scatter/reduce_scatter.x
 delete mode 100644 nccl/Makefile
 delete mode 100644 nccl/all-gather/128_gpu_run.sh
 delete mode 100644 nccl/all-gather/16_gpu_run.sh
 delete mode 100644 nccl/all-gather/32_gpu_run.sh
 delete mode 100644 nccl/all-gather/64_gpu_run.sh
 delete mode 100644 nccl/all-gather/8_gpu_run.sh
 delete mode 100644 nccl/all-gather/benchmarks/128_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/16_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/32_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/64_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/8_gpu.txt
 delete mode 100644 nccl/all-reduce/128_gpu_run.sh
 delete mode 100644 nccl/all-reduce/16_gpu_run.sh
 delete mode 100644 nccl/all-reduce/32_gpu_run.sh
 delete mode 100644 nccl/all-reduce/64_gpu_run.sh
 delete mode 100644 nccl/all-reduce/8_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/128_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/16_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/32_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/64_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/8_gpu_run.sh
 create mode 100755 rccl/all-gather/allgather.x
 create mode 100755 rccl/all-reduce/allreduce.x
 create mode 100755 rccl/reduce-scatter/reduce_scatter.x

diff --git a/README.md b/README.md
index 396231b..a1fdcdb 100644
--- a/README.md
+++ b/README.md
@@ -2,12 +2,14 @@ Before compiling do these:
 
 ### Perlmutter
 ```sh
-module load PrgEnv-cray cudatoolkit craype-accel-nvidia80
+module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl/2.19.4
 export CRAY_ACCEL_TARGET=nvidia80
 export MPICH_GPU_SUPPORT_ENABLED=1
 ```
 ### Frontier
 ```sh
-module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a
+module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05
 export MPICH_GPU_SUPPORT_ENABLED=1
+export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
 ```
+
diff --git a/allgather.cu b/allgather.cu
index 8ae7481..8c357bb 100644
--- a/allgather.cu
+++ b/allgather.cu
@@ -14,6 +14,7 @@
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
 #elif USE_ROCM
+  #define __HIP_PLATFORM_AMD__
   #include <hip/hip_bfloat16.h>
   #include <hip/hip_runtime.h>
   #include <hip/hip_runtime_api.h>
diff --git a/allreduce.cu b/allreduce.cu
index 7fdf2b9..111b254 100644
--- a/allreduce.cu
+++ b/allreduce.cu
@@ -14,6 +14,7 @@
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
 #elif USE_ROCM
+  #define __HIP_PLATFORM_AMD__
   #include <hip/hip_bfloat16.h>
   #include <hip/hip_runtime.h>
   #include <hip/hip_runtime_api.h>
diff --git a/mpi/Makefile b/mpi/Makefile
index 782a6bf..ba9d72b 100644
--- a/mpi/Makefile
+++ b/mpi/Makefile
@@ -6,14 +6,14 @@
 CC = cc
 
 # perlmutter flags
-INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
-LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+# INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 # frontier flags
-# INC = -I${ROCM_PATH}/include
-# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
-# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+INC = -I${ROCM_PATH}/include
+CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
+LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl 
 
 all: allgather.x allreduce.x reduce_scatter.x
 
diff --git a/mpi/all-gather/128_gpu_run.sh b/mpi/all-gather/128_gpu_run.sh
deleted file mode 100644
index 3af373c..0000000
--- a/mpi/all-gather/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 16))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/16_gpu_run.sh b/mpi/all-gather/16_gpu_run.sh
deleted file mode 100644
index 25d7b92..0000000
--- a/mpi/all-gather/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 128))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/32_gpu_run.sh b/mpi/all-gather/32_gpu_run.sh
deleted file mode 100644
index 3a03ef0..0000000
--- a/mpi/all-gather/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 64))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/64_gpu_run.sh b/mpi/all-gather/64_gpu_run.sh
deleted file mode 100644
index 37ba334..0000000
--- a/mpi/all-gather/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/8_gpu_run.sh b/mpi/all-gather/8_gpu_run.sh
deleted file mode 100644
index aa3e3a8..0000000
--- a/mpi/all-gather/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/allgather.x b/mpi/all-gather/allgather.x
new file mode 100755
index 0000000000000000000000000000000000000000..03793882f6c87b1c4fbedcb5062d4db7921d7a3f
GIT binary patch
literal 25696
zcmeHP3v^UPny%ZOmjQB{r{H6}V1THk^MJ&FfaxR+9jBua67Yd`I{io*(+~TB!Bs#b
z&a?q#*gcNx%+9dR?yP6lnPGQl9md@OA_|N<55UK8bY^76S2xH*MPPi8{r<XDotrj2
zI=koW*)u1V-1_VP>i_?G+^V`&w{O*YyK{+2QJ8clwwMw3gBco!>bcL@ky}Inq@LBW
zV)(OIDJuY<$1y#=o@qQFX{AZWB`wlfW(Fm_h=`AvG#(@AC7feSQkg@dq&MU=5!C5p
z6LeNmX3=@l%g41%#BUSjlXSPL%Smd~qcV2P(BnHAG#-$2|Ce;U`!bzMJt`LI^$NXS
zp(p7!QEo|Py)1hx?xphOO7Do&8>goy>!nrbJtfNflttqLk~(#9CaJ8$8=*(}{Bs@Y
z)3r(DXOp;pl4`=Bq%ysVo<L{qyow%gc~2me=q>N9t1YjcXN`re)!ZIspUSlK`gU%?
z3|}O{`CK09h+8{MWFk0i7@{*XK<LVLbpS@fm*&7-z(-0qkON<n17DW||7s5W@f`S5
zIq-oT`2HOD!5sMOIq=gt@K18!#kgT3)!X<S_?L3v)j9Bn9C&jMyfp{jo&)#hz=JvP
zwK?#<9Qd{z_>(#C<EZEH>>~E0TSJI3Ii8pyPxx-PMsNoIDsT&%#>(6p!5Q}m$1h^_
zZY_gR;2Q$p@79PlI1~D-f<CHXOgH}NxQrvJ{~2);^yp8<DQr8^(O_L42Nd=*!Bc|K
z*##J-&1?V|xx__*3uttBQSgLOaJrxufujH12e6p1vP(BzQuluUB@OnM-#vWiO_%(~
z8(Z&TE*C~5k1G~;N8_%bI}ifW6%2<2;sP7>bp>K^U)1G~x`RGfAmk4N_4K-3{y@mx
z6S&(4*6SNdtUC~C@dZ7RwG5%<zIc;wO~B(LW=Yh?X^Xq3C+s0)sf=z6Cqi+7*rU;K
zba^})2z3#n>n~p$@^nYTAxb$GkH*72tfjTt)f@`Md7v>I47#H3&>cfuOqcbB6LIE`
zz+BwVf_l=iIPxB4{=1@qxQ}Zrk;NjbZEGVwmnR&G2f7mBM2rgvEEq(B!kFLRlZbUQ
zKNX8-2FAPG@ory~vn$;qZw>BfG$6rvAn0ShUPQTkUU%G$dhCqF^r{CG@>20ao?t}J
zYb4Hav)n;1YC?Z#)KI0k=HcN%caa_=wQQ-gxuMZjZLPM}u!?XbUg3$l*H%P)6&{bT
zqN>(fX|3czJbGO8C<=5gsEFNF;YWqWk+X_e#O=AG+!tCCtLSvcd>Iqvu^wNns;2yI
zUuZ5FD6gvFw!H3WtlJ$bkK(SOJmqt(^Q@KS@mgO`bxm*dt}cI9&Fb2ks5jajjm^Dt
zb!1+6bu1Rq)6Jc|a!-#t)KxK*Nf9hhgzgB1?+S_MXICh(Ad|z-7V?W*p$d$8JUvv0
zRm6y55noCC-I<tRB;cYt<p8gFLb~ETuI`vSBk$#=cyw<h=nG~d?o7D7L9~nv5Da@0
zs9J80Qt}5fK}tr-@?7~dQHe~*6H7#NhOw!34Lx0-dl`lWrWtVKe4@;N(_BNkN&}uJ
zLC`t_PV-6W<hV;1jWyEAahGu8yrQl{<06Idjp^1KaAUd-18z*W)qor4aW@%oN<+F1
z11_Ht6xVIQF{ox-#DEuPP|Rx$_!tAe!GMo7;9oW1V(Q9cw;1pegZyI#TujY*>^1`~
zrn($|(twXQ*x7Etjq{Bi2At-I((N|jlOzbb-+)gx;0Fx&6a#+PfM009j~MW&20UfJ
zry1~52ArN}(w#Hl7fX<+TO*&*J(p`I;)~oG5sqKVL%L^iYdk>mVkyJ<scwx^!sXJ2
z0L`rt;db;8XEPqn1V(a-00##*tT$d#TK0j#^^6%W*#~5sCp*S#IHzRhH?nQZJ~7k>
zHXwzeYj6<h8LsOS!A@UP3;VT5H0<%kV!oIbOvK_EMj=nP7Rp2~ZEcTPi&+aq++9A6
zp064nxm~^m+RPrWR!)8~TjCD%_`C}=JP|cKWW!NyW^BPsuQnU8vy0hKYGe+YP-zaW
z!JNXY1wxTTT#LAA&Vk7XC1$l&WS**4Pl8cGGp)6LE#|`m+8Zlo?UWhViD?UwRaaLe
z5nHS+3<g53V64kUnIzWTYh>}%kGkW5a45F8m@VxIce;BtdKmLE@zNACXV^0W>S%5i
zrP7N<2^$MFWLYm-i_Pi0q0!aSVrzA^*luzyVghMf*4*N1b1btjciiA?0^YdHb~8jE
z-rDB6Vabw3)O?hS&DS_U>E4>qnZ2MDpuM0$(8oZZ1Kk1IhEDo(&>d*(-DofqF2h_9
zbSvlqP}#SYyO%Mg*P>iJwy<ama2ivn-#m&zq#Tqr6k3R{#Pvg9iwU!pSROPrjw@IT
zC9o9Vit87MUj~|qH??p9vKhgZhrB%?8a(Co0bCo95ic2m&4!Yx4JBGbNm)ZlrLCl{
zsbss-SGcKQbN<75kC-1cnJ4C#)Y(cZxd6l<Y_XLT*&w#AWQSre*{w7qy!Ouwn>HF0
zkP*E+8tTG<g4Kg&%45Dpp#n7lBK&^TZ5Jstl~^7zAwv(F?IqggJY;ZFzN4h_!2(A~
zU0-2KNqv-O+(so68E+^l;`e~=$uqDc`#G62x-G+;e@oTz=H$WK9mzj9`rbU%+UDGx
z-;2kIWAnK8@U(GkUNi>-$>8M0c+{S5cO>6ZpU%Gpw@iI{w`1e(MA_c_FN0GY{rPhs
z0VqDjvAO0ZU{FmT9~?~ez}hio%WZpv_B}(|3-{&+h4z{0y0!{S&n`qbxi8f~`RLDo
zBNMfG(bvi10%`H*<Q#qHP3rw?(Fz>NQ;vfv@RnF%)z8wGb7;WP_xp2xHJsmcY;X{C
z+$Tplo&1tcFMSHM>X2GCb1np$)Ur$enIf9h@JthF*wvGV{BTzFGHP9n`5o%I-`Ulb
zyFYNMSMK$zkz*$aI-#yS{((cik~6QUE6)&f2GJjJ<h;6akdX5v`%9>wA@ws&>ZIq@
zNoOIJ{;ZX~8J#9Ik~-mcA{A&JC(RR0>Xg0E{?MjQ`V`@_Hnr@G+IAWwrMA86SJ%*m
z0-rnydK&bs9Z`Op8hzOUq4Rz<`fCdnbJ53X6w{`@MrD2NB&5z0JnlJy?dthIf;&G5
zm2(gR2k%odg{m57K~>q+wlw6{9EIGm<B&UHBg96CdJIh2DH8ohn6s%X-lLR0h3M-i
zNd6?E&v3YjU_xyfwCw}RdXw6QYJB}eihyWY3g(Yd`4I6uz}JscObX?B%L4Chcz2xm
zKwUp)rK9iJBE-G)8}eQv@8{$_=a_#`-B@#eaL_S-f4q_IByP-rUtRZX@|Cl%oZU-L
zv&1<6#zXNb_q{-m!(+wjGk>t$cZeRNel_$gi4GjK_>+@va10I{+p)*kKH6<7Z7Xc;
zwzlPtzIS^a$q$nEl+JS`<E5>AgZb*l_2^EH{=X`!I`a(8{!+)`Hz)T2vPkGv4m}TO
zs)Y7&XfGhmnfzFy{{qyJY%10IJ~pYFXx3}rcwBw>EE>Lj;~Dkg6BL;Lq574#5a_$7
zRAcJHuTrnrOj+8DETz_w353S(LS~v$>jhh>vm3z9UsYPEZcIU~C3(7e;7zT?bDj*$
zNtqDa-&8vGZ_k2D{yOz6rn$-AHmg58iL}PD#5_m;qDL1n=1hL%7&tm<=WL6{s&=QY
zp)~pz-GdlM@+gn7=rPj~<4g{wuHf|dj)C-a^?@Tu8KC3f$>xD~OB2+yAEGiO|0sL*
z>UYpm(PC%lbn-`UbN<pppuA__j0o!42PuMjb{R=h&sHz{%j?fqQ7J#$pHIKV%ls$%
zb3IhQV1IrY=Kk;cvmgDr^>h34v9P0m+`>A>c5PKPwv%S}&){juQYlPUcTgz(SMbsJ
zE`*Syf8x$xK@Lsw$xQsMJbv8St6{`KMk+E!z5}7u4>FQfBsu?A|H}cy!3PlP#C6oE
zEY9Tl)Z>_9Z``eJe2j()fKwgG6RC{?{01LCAnE9jm)4<c{e;Susuj_HgXk8|htADC
zrA52YP1w$H3UfM>M^kyoxifh(H5OFwpH;iop@hlbp#lm4?y3jpOrC*FARofuuI1>L
zl#T}AO#oGcspB9m^Utqqa`fL*T9-NjmO_@)OIT1)uCEjsoSJ%8AeBH;2~ywf=zFgB
zR@*AuZMNHOuG{uFH+xH0Igr<mV-w}`wM9A8oP51`{_ocRT}$%A=7Yz%`Tn-jB1gZo
zbZS%bggu$EB@f!#x>3BQ)IF4gzUM5Ib_E|R-l6)U^a!G#$x^+@$>*4<F_=lAN}Wk=
z#wyID0i~)w-^>XT5xM#MAv5oYs(6CT9L3b^b8_=f1~az|nb{`HP%BM+=kv{U7|cxM
zW<D0>S|KdauLG%@VabUnl%x5+^LY3%C*ATLG@)-&TTa=NXZ5E+HFQ#o;Q7^8no8?Q
zL&I}s-HpTMN1s!&Ur?@j1tnIAyu1Jib4hpVAR3Bo@K=fd)YE$wc#7nCbEQl<)Q!LW
zba3!}brsToU%k><s=cq?KCtVjdduCeq!vRw{S(MIJo|UKr~`i=-ZL+vEXNiM$6h9E
z)^O~ngiRQZQD&(9BXhe@TP$@Q(!lg|A_0rgL-ZsbCe_=4e~cDqz#k!e3Gg?D<68)~
z0{^e!_;JFg0ROk)_zB>S<Usn)!NEbAN@I3WnY{O1M>285k!(M_&;AJn*gpF?ot)Om
zDI$)(dk-`9fkOzUe@uM8{jj4Sb9kD@_nm8u2l~#ns2eFqefOSbi7SP`Gvk5m4ALNg
z{CH4a+@^8+?5C+3`s|-5$2@)ao??mHq&&Ts*|}cG&(-DY_t{Tn<mdL?dyXY4r98L)
zbs=xo<tL&D8To=O$MUvd_2B5UA1>;<_Y>24jC(xuSZb&n=b|pS#jAL})tyM6?fL|0
z>Uq>2&zyxoZ5Ob?)OLVdw)|`lUpNSF*m4LAP2ZZ6zor?SbMwrHvBGS3Y%V9d+?M>b
z8RO#>RQE-Wfltk;<*@D8cs#CEy(Y&m44)W8o02DOgOh&a=-Z<>=KnTv6ivS9*4yOB
zhY4e5J&~RM2Ycv-!G3@KL3BY{Qke!1kp3PW=OHSn?gnfA70;-_nLmQBURI9~@iaEF
z^bsoc%<mCTRhmA)`6q}+O|bNC&Oc5(DxamdbABuFXyYusjq?u?k8aP>TR49|@fa6b
zdIRV0CLRwEmX2^fN<6K=(;b{&O+38<O}BEslXy(=Sh}9`w-8USJkyn&Uq(E=>P%~#
zzm9mU`B>V*`3B-kh-aK%K)g!)DNO1IXI2w$A^r&GuOyzDTKWLzXAqD3iSl#)BI4<l
zWqLd3ClF7gUwRwoi;1UKl<6&;HxW;-Ces@@|B)3uHT`si^Y0TsmG}<MA0?h%L8e<d
z{|Dk}a+0p+{Hw&%E5~#t=MNE2uRzio=U*TmD|(b4{A)8b1R0xlxlX-SovH-Fy-s%;
zP7wb4pOG)7I}In}FU;a+XYo~8d}9{BGK=rb;v-r7eOdh1viNUi@!!wlf1Je+WbyR-
zCOP9fW#XxUk~>hWaY_x8+><(C@*O?hIC6AD$@PxWIHe9l?tlo0WSQ2Xahw_qIhhW1
z5OUj18mID|zO+vrin~IM6Y3D;eklxpcz!TU{DA_E2-AJM_iE5X(+Px=<&f{V<##ml
z9kz_821l-2=u-nDC*$RB3r&?8$Em}STP65YgL;1IMSjo}dHyyD9zBoqvYvG9QGc`s
zMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60N
zV6+BCYhbhn{xWKyNc=9iy+Y#!suaJY$^GitrF>AtKPu>V1$|o3y@I|X=-Yyx5wxI6
zPiKmtvjnXbbcvuV1oaELM$iWZeN@oz3i`C5dj)+((6<FWBWOXjNMF!dg4PPUM9>w2
zHew^1#<%2YYp@vuyCpPzE1{;9)wq2%)s=Pgz4Pa~E8W#q^E$8g&Gq}eHGa3dwytts
z<-D4@T3=1w75Hu`5W2$`U7&GuPG^%a!j?8RF3`%b>HP|=s?s{wTB%i6R#!o_PAlWG
z%^{EV3L&<<eYstOr9|%)+SOKULNwf*ISA{&Y3jd}%JU7bF~}H$^t-Bw+2PAyR*kWY
zE#&bJWa8!Tt@J?xIr)1leSk5N)ARFPMCtZr``S8^Ffcv6+ff8veuZc+^7rSCEI!+e
z4S*=WLUFSE@;ij55$VPC?}$}4m>xk;k!M{<8Ag?r$G7nnDIY>&>^FqgEo6N<Jmtdq
z@Gmg8mS~{Z!k3qiEwwDKnwT%)GO;%JMaq2yz|9gC^mKoO)bp4hc#%>Me=L1NMA^!&
zo;MI_D#|w_%6uC<)8Qr{dGk12NZ)AX>nTHMEX7;$E&?gI0|ImvvZA6pK`(d^Uh$X7
z<00}8wTb(5FHA}SHKQ@~MVeC9LBS_rNV$>+=_>+d4i8!orC2FgZ~z`%l*lWfW5>`3
zgG%M|2#?uA!72*!pXL?UVsYnWOHsaALSHT_jXTAV(*;>_$~T_^u3k%NHd2<!sgH7&
z^6ez{C9X>BW4J7I$CMS<!haLj`1gd#nPA9$3(^w;Oj$`4R|sa}IFXx4O1tS^Hc4qS
zt%HA~X*~ocOLfWzn^FUhF6C-*XV(G7orPZyxCt2>YEjB~#ZoX{nV^_UE-sl_G7;_8
zWM$@(YsV5uR>muU$56BdA8nZCA#%Z(YsNHBymp!iUt5?<*_bhl$C%Ol^JcN;iCl$Q
z3fc>XEE<i#Y#}!!^h#i2O2%f!NdA!70!3+_sK9XHbQmvEksc*;!DMCjG=husd9q`S
zIa`SDHO5Y6vq8pL*|pR3JkOp+sn<{adIRtht1<#k2G#sbeoQE`WymnG<Hw*%CKQt3
z1jH2Sl|eBRllLo#!)C3A?#I5usb6bA{Y(bP#w%42pF%lVI4xA7=Vrv3S^?C=cXBDp
z$0fx~fg?d|rHieNd3}+X*jLT4)f`6vSeNUc8eL$LBZqmua@uUU>RRoZ>!weS#>|#G
zeI}e&pj-|!W=0=6k>zn2%ivi6JMpa^L@{<dE)%0+@N+ZZiN*s%>C2JUJiwIZ=z=*W
z9Fw513rupvHNVIdQy#F#VcX1Z5-AMJNJSzR6(_EAc|39lc`On0>D$LEuC54#JUt1o
zk1=15zkDd)I26F^^IN-RZOBe)-oq60g#G4F+&n?!e%u^dXr6H0(pAiqXMT&keOpas
z)6B(Io?|{_e&O<mhRQETe)DX_!7ChPm!rS=DuId98?XbM&SD7Q`~WI~3ug?t*YfiN
zT+KJ&H~}v<K%&rI=IM4v8JA)FL<Br>wZ^I{{V}SQ2jhv5Z&8;o<ckJ8VBJwq_agm>
z8m+vG|KCupmBZj7Y`lkFxgWG{J}M5!+4#T)_5GmbJ-$#^yn9h4phW1dK*(DjcXwiw
zJ9u|Y9N&P`6tJ6rcqAGj4j;foBi3Dx9jI}ihjwOi!_h8ZIktb6W2+~mLUFPc#ZeI`
zTo<w$?*^}{3&n60ImM|4$hbd3+gB+Elpk&rCmh71YZq13&Z(M5_F<qLTU0|~FrhFs
zDI*z609%_I;b-APDa3nX<=DyZ>sds%AmHvP_x1WbxFd99#2tZhG)`yxXtw2AwRK(<
z<}Sko=*S10ybz(z7!P>Lqdpvtfu3Zz&t%=$SbL~|^zh|@;T(>;K#^(R?C{QxBO!QK
zb4R;8Bgaxhq!!|evx@F;&{yH^Oaywo712mg4>YtlJDV!Ha9n|>yxKaqsse40iIXc}
zc8weZT&_sem(?ggn;%{=NOD;F)oY8FC=rSUx<WqM6|C3aXrujNjdtW#$y>7>wLSda
z=oOBEbjZ;syW}JyM`bxW$<bJjOj1r#U3*Ah>fI{lA5zaJc&R5TO}EI^bDy3q?J^k2
zK)1<YLO^-!1Eu}jg2|B2+G?v8kx@YvcDoE3ZMA&?TQ^Eu@r*5W9^Dyj-S`<>HX+rM
zQKYB#FegPE0yVM9Z_BV>U<zA4Oy^HBGzrLK?`7_*6sA?jNI0e=BjPiFD{L$~XVDNE
za(sr4sjpm9EEZ8j6`aRZ@76d%`5<>aF2Xz98bU+&`(}hO-(m7QS;`-|4qPVKm)}({
zBM@#iF0!-Dtsz8V-SzVDgv|G}89NYAOjDR#|4V#5<VUIpS}}}-Kg#V-%zW=F?L3x4
z{yRBv`aWAR4b`VyEj&f`F)6fkXb25CKASU5%Y4@?V`w`EGY!?7#Fq$Mepf8<B*!PQ
z-UdxVoVKBUOr{bh`@M{$ZB8bWnpyu!JP11@<?{syOu+rhUIo1>aJdF87lwW>a5?Xi
zi=+1h&Sq&`lAj%b&>37Bn}UX+<2(}Hg~;c4oYp88s<J+30iPg+473ioWdu8R;F>{B
z2I;&r>I2!XF9*Ig2mU?aBjtxy$|K<eIdJ-(ekA!9bKq~~z|ZEuX^~C$F><-ai~In1
za<rFkq;zX@;Iz6ONxmfqek<^i&Ufp~A-_5Yey6ar+pSTyGjDx*bI7mHfz!E<BjtZv
z4xIKfjU@l$9Qe*0_?{g2p&a;YIq-LK;P2(Y&j26kyu432J~W>5N`-d;#sLe);r(t+
zLY$TWr}iWF!PKLjk-MDZdf4b+l>=v1%mjVW2(yN;NU+)(nz6v>Vpi|kP;70GS+Oj!
ziWil`7*|whyRpE~rQ<*_&32Qj2t|702!;8HY*t^lIQOpG3rWc_YuwihDLxyw^0k+h
zs|`(c8He?mKeG;-Y4McZ!2l-TU13<!Qwj$ot}xD#3%hAaW2N;;z!#%gnq0iN<l+Tu
zJ3W&0^@MS79J6|G@*SdJ(bt*iLfUj94{)93C-=#y&d#WBjf|nyn+#*kB!k1%>6|_*
zY8NXx&~7?;kB4+BC;V9}7pycoaeiQkFJkNt*OFzn7Q4&Yyu8gtXA646`f604_+!<C
zHHR0+*|}Wd&ed4_;Vd>iN|zOjTo55C*H3KN!9by~D;RV~kY>cVR6<^`zyfL=xvE_}
zr$~ubHLjsWwLUL4!2GWB!xHJv$@BV_I&WyOIqA5b<@PpLo2|iVM-hiFz=aWRQ69Hw
zw2^i8B2P#z?Y*(E3kU6av7-Bbekh~sma2J_b^U%ep^jRss_CS{PHNrps@k5v^m7|s
zF3|>E{%{mGx`#SX_F;}AtagV%^ftid8s2H_*EgZ#<Z8fNqv_`BZ7t1>#$oh-9H!x!
zE)4VB$@8fr0+9RCL^PL`VWPB8uMn5i-(=Fj4V{~;a~$HHoS#XA)*WMTNqxCLkDjOG
zw&3|dj?$of%6T{5lyY|&e5^skQop`H<9K}mCz<o0g_%q`Ql4I1(%w1A%l&hkgnqql
zCR4`j{L_p1GX{ORKk$IiuN3++eVKm&^IxVYPNvTaH9d+73@7&=>YQE&GEB4u7v)Ln
z%l(uG#0JJENeC`GeOZ4i5l8wBBm^h-SN76PfumnW$dNu?f`6v|-N5L+RO2Ei=i#k*
zPfzY<nK7NkoME^NfHk&1d7igC=i514oafaD-Ib*;*Qw`3Lfgo=!DZL~eTX9wal10V
zr)?9HxhMFqJ3|Jt^Z%7BeYuWm(sUha-$s|+el{c0n7(}9vP#pnsDBw<w*Fru(x@L1
z?^z<6uEjU|8w_OUpVqH*t7Q9->+p^lx>9z1NnX-_fS78~m+Q{%FQ~sAx|AoGzT8h6
z`GWeppiB2o>dW=5JePi(Xn&$RW!kT-hyBo{`jh(deATJqr?oP~8J#S*#D0N*Q9rXj
zt=A=FI#NDc|HUkQx&F5ZeW@oYrDb%-z!26az-)b)zHG+X=@&uSn18K9<Ce5~-5|vq
zUA8{`d_|b7zk0XE#mj7*(D~HEoHmTZZ(Pu)Br3CTssF7koMg#KdYRDI(INkUO@yVO
zU}b$sNoy|s23=x<Xg^Xu%N$#fL%-FoYtWgCMwi7hM-KfXOLQ%Het@)*txxZ72+OX&
zT@GENEo;D#`jWbYzO*l?)}n_tNI=l+`lI(F(w0$lo6ujJ6`94dUI1BD%*`C37lv?$
w#lTG6@nT%E{mJLo2I$MSNAl9`XxBsZKE~*>SoTm3{p;w#2)8xMc((q30~vNPh5!Hn

literal 0
HcmV?d00001

diff --git a/mpi/all-gather/benchmarks/128_gpu.txt b/mpi/all-gather/benchmarks/128_gpu.txt
deleted file mode 100644
index 3787302..0000000
--- a/mpi/all-gather/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 16
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 262144 - 16777216
-Number of iterations: 10
-262144 0.003218 seconds
-524288 0.005101 seconds
-1048576 0.008701 seconds
-2097152 0.015526 seconds
-4194304 0.030239 seconds
-8388608 0.060280 seconds
-16777216 0.189415 seconds
diff --git a/mpi/all-gather/benchmarks/16_gpu.txt b/mpi/all-gather/benchmarks/16_gpu.txt
deleted file mode 100644
index b69654b..0000000
--- a/mpi/all-gather/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 128
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 2097152 - 134217728
-Number of iterations: 10
-2097152 0.002391 seconds
-4194304 0.003558 seconds
-8388608 0.007162 seconds
-16777216 0.014929 seconds
-33554432 0.030427 seconds
-67108864 0.062092 seconds
-134217728 0.151508 seconds
diff --git a/mpi/all-gather/benchmarks/32_gpu.txt b/mpi/all-gather/benchmarks/32_gpu.txt
deleted file mode 100644
index 0e15475..0000000
--- a/mpi/all-gather/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 64
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 262144 - 67108864
-Number of iterations: 10
-262144 0.000730 seconds
-524288 0.001367 seconds
-1048576 0.002650 seconds
-2097152 0.003740 seconds
-4194304 0.007503 seconds
-8388608 0.014208 seconds
-16777216 0.029923 seconds
-33554432 0.061970 seconds
-67108864 0.168545 seconds
diff --git a/mpi/all-gather/benchmarks/64_gpu.txt b/mpi/all-gather/benchmarks/64_gpu.txt
deleted file mode 100644
index ed700b9..0000000
--- a/mpi/all-gather/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 32
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 262144 - 33554432
-Number of iterations: 10
-262144 0.001561 seconds
-524288 0.002915 seconds
-1048576 0.004163 seconds
-2097152 0.007885 seconds
-4194304 0.014989 seconds
-8388608 0.029413 seconds
-16777216 0.063034 seconds
-33554432 0.183096 seconds
diff --git a/mpi/all-gather/benchmarks/8_gpu.txt b/mpi/all-gather/benchmarks/8_gpu.txt
deleted file mode 100644
index de3a837..0000000
--- a/mpi/all-gather/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 256
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 2097152 - 268435456
-Number of iterations: 10
-2097152 0.000838 seconds
-4194304 0.001719 seconds
-8388608 0.003172 seconds
-16777216 0.006797 seconds
-33554432 0.013860 seconds
-67108864 0.027938 seconds
-134217728 0.055353 seconds
-268435456 0.104310 seconds
diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh
new file mode 100644
index 0000000..4e8c955
--- /dev/null
+++ b/mpi/all-gather/frontier/128_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 15:00
+#SBATCH -N 16
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 16))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh
new file mode 100644
index 0000000..bb2429f
--- /dev/null
+++ b/mpi/all-gather/frontier/16_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 128))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh
new file mode 100644
index 0000000..e630b97
--- /dev/null
+++ b/mpi/all-gather/frontier/32_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 15:00
+#SBATCH -N 4
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh
new file mode 100644
index 0000000..e7c707f
--- /dev/null
+++ b/mpi/all-gather/frontier/64_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 15:00
+#SBATCH -N 8
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh
new file mode 100644
index 0000000..563f933
--- /dev/null
+++ b/mpi/all-gather/frontier/8_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 10:00
+#SBATCH -N 1
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt
new file mode 100644
index 0000000..35a9e26
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/16_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10
+ 0: Local data size: 128
+ 0: Global data size: 2048
+ 0: Number of GPUs: 16
+ 0: Message size range: 2097152 - 134217728
+ 0: Number of iterations: 10
+ 0: 2097152 0.002249 seconds
+ 0: 4194304 0.003148 seconds
+ 0: 8388608 0.006062 seconds
+ 0: 16777216 0.011871 seconds
+ 0: 33554432 0.023485 seconds
+ 0: 67108864 0.046822 seconds
+ 0: 134217728 0.139763 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt
new file mode 100644
index 0000000..f758360
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/32_gcd.txt
@@ -0,0 +1,15 @@
+srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10
+ 0: Local data size: 64
+ 0: Global data size: 2048
+ 0: Number of GPUs: 32
+ 0: Message size range: 262144 - 67108864
+ 0: Number of iterations: 10
+ 0: 262144 0.000783 seconds
+ 0: 524288 0.001513 seconds
+ 0: 1048576 0.002953 seconds
+ 0: 2097152 0.003404 seconds
+ 0: 4194304 0.006485 seconds
+ 0: 8388608 0.012489 seconds
+ 0: 16777216 0.024484 seconds
+ 0: 33554432 0.048460 seconds
+ 0: 67108864 0.185884 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt
new file mode 100644
index 0000000..7856a16
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/8_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10
+0: Local data size: 256
+0: Global data size: 2048
+0: Number of GPUs: 8
+0: Message size range: 2097152 - 268435456
+0: Number of iterations: 10
+0: 2097152 0.000505 seconds
+0: 4194304 0.000856 seconds
+0: 8388608 0.001645 seconds
+0: 16777216 0.003223 seconds
+0: 33554432 0.006379 seconds
+0: 67108864 0.012691 seconds
+0: 134217728 0.025316 seconds
+0: 268435456 0.053944 seconds
diff --git a/mpi/all-reduce/128_gpu_run.sh b/mpi/all-reduce/128_gpu_run.sh
deleted file mode 100644
index 6a5ccff..0000000
--- a/mpi/all-reduce/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/16_gpu_run.sh b/mpi/all-reduce/16_gpu_run.sh
deleted file mode 100644
index 4158fe0..0000000
--- a/mpi/all-reduce/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/32_gpu_run.sh b/mpi/all-reduce/32_gpu_run.sh
deleted file mode 100644
index 8990167..0000000
--- a/mpi/all-reduce/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/64_gpu_run.sh b/mpi/all-reduce/64_gpu_run.sh
deleted file mode 100644
index 314f852..0000000
--- a/mpi/all-reduce/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/allreduce.x b/mpi/all-reduce/allreduce.x
new file mode 100755
index 0000000000000000000000000000000000000000..283e31cfd4ec10983f8a8159c5c70bb949af53dd
GIT binary patch
literal 25832
zcmeHP3v^t?d7itg7qX?he&E=~#Fro&@z~W$wv23S(CT5Y(iJF{j3EK9ulFG>-WU5Y
zHUVQJOtJ`rLz`DiTbHIOCnrtaoRm_UgXI{E$w`PIVBEZL5+J(>KfuPO33&Vcb7yw1
z*4jc(&gtoCHTIo<{%`*OpPB#8ojY?^Gxv0RR$3H=$zWmEGUASZO5;#Fj~N^KoJfGQ
zvo=<Xcp+<GRp2W)X5_asjVB~^S`1v$8iQq4P|}Nw{J2HqIg-xf9AlEo8WJVFV$ej=
zphu<~tfb6l@T6CX>!`>-D%vNhQ^-kb)}uDsE;a1fIyIh<wDFS${-mH%kD5h#jY6+c
z=t;U;v|Cc?m(_2i{M5d3<sB7zC#M?aO2708y?N6NeBLyTCnOy(U?!>b;b)*n_5AM!
z(x>Y|QJ)9J{gYG^1|^l{HHAYPmMm@x`|aUSG`-orxowGk$zn$`=4j#e=r&NBR$ae_
zTd*P)MR2}cL^|TujuV*yP8)|9%%?!;Dh+J_Cc;;h!M(sIDmPRH-&6+QS_c1O8T`-7
z;NLET?<<2JEQ22^ga3OO{C8#W56a*bxM36dZF(7eP8ob*8GKn8{L^Ldo66wcGI+2I
zzOf8`XBm9B4E}H#{Oe`#C(7VYm%-n|eVNAQvd47|DaPdZGX?U5kLnu11^lmo+t?h|
zrfUQj!ryUxE^F7d0!E2{68J$~Bi0lwc)k@p(+b`J=6@EBQJ#@&q)K^`GX53w51=d!
zB^YCe!d??R6`HuLN@XbLRbUj-b3+%<h>{aLkxOwAor??4+jcOugf(9L;JmhXN9J{O
zzw-8x+i#lpZ@+!yF6Q-O1oL^5DLs+$M)Xh=$Y3NE6^IvXA}|<ArUD6XFrh~R-cU3c
z1M1tXdxN2<9uD0Z0P7D-BsLU^_XZ-q_!fp#e<0Nr*c9>wh*_BkaN4Vf!!aKrt7LX(
zEFDb=q&ty_CHhl|P;`(GL%)AZ)HjrfMXBUuDv^qX1qrj>z8-H+G?e0r&R8VkP3X~$
zMJ{RR_+#l53&vqK6=V^k3dt1OkYK?(5}{OpYpj&bBdY^j;sLKO7EOf))3J1t3kWO{
z5e425_xch69qPh#Fc?lJhggsr%d3W>5&?f2J|K3jE)+ZTL?RSO7+@+C39!IsWO)OA
zJ*C6D8<I)mh5(BCskKpGByQ9^o?^I#dc+U^7_XjIa+P;6UO{wk>1DKpt@8AAbb4DH
zEsj>!6pN>td<lI^Q#{b*^97okmpGgbCr{!9=A{=}Xv4Cm<Q+{xxH*NIH6`P^Z=*dB
z-IQ$FpeF+b6ZT{{kZf+X-x-K5A_Mm3R&L9$Cz3;Y)Ske-M0@Ou9E%-JdumA_+|s%^
zamQeAu=TbjtqFf(D3M%r`)%>XvD=c#xKVET>e+o^Jv!J_tfWZV)6tF5*d0;v79NbI
zmlbMwej`7=6>?z0=L?e$n~4$4BHl^-orRo8JcOICZ{Yy<JRyUruy-h_7v%li6wlrq
zj|3uxjN8+?Kcc6G3P2?0Ps6p`9F-Id6_Qkpl;yPw7P8WXlrNc%8w?(OcCcF`!sl>e
zsNnn|6K<ZD95LZmE@?zZO?ZU~&zf+Wm&zc=W5Q?*l3`TWc#`l2o-(4g^%_r9BQ}@Y
zZo<vwx=pyb+&&X-o)6w+!l?`y)|+to?x4IO6OMtl5aK4hwtzCW*@RCq;oD5OnCkM}
zFPiW=ll(3dUT?y`X2S6zD1_Z6oZcHUJZ{3pRGMe)G2!NU%abOY=9)5$n(&zt1U+cN
zFErtYO!zDle#C@-!h|0+;j>M6)`ZV7;U`Tvz2{_j--KTzL87`wF{67f*I>lg=o%4@
zU(8cRWYaaCAbGhIB4D<z5#jRldCG`1UE>LoH&UEsjk>`Reu)9`*r{ur5^kQK(_CSq
zpa?LZ-*%7rlG5^7$=}16`I66yd^X6A`5MnDh53<u2IR9t^)$jZlu*3#d6Z|mu1`lc
z1QJ>-sKpa8Um%$bB(+F7nbI&``G&M;A$wKdnxvza^+H4+3~2OT*6^wx3@p<Yg#DVG
z;##&+4}}B%Wg1?V8eYz^gtj2LY=K|947r!pvSMju4w_KuiEhH2#IJ>-@pMXy>ojM<
z<b?`zIGPIYUxzQvsGtRoB|$A2z>D3VtYvGcGO&}>mZPfP!FW1(t+qT8ih3i-K`&L3
zSbejZ#cUy=r$Vu4^4eOqDjeIOhc$X_^EUC;)G|-ZHv#JI=@YFonneYh8&$MyG_BX=
z@!Zhq?d^5-d3#+qc~>xj46N?y^$xgKclWz*@N@z1T<y9UA`tHz@ZPX;<qEoY%0<>|
z`ao$MdkKR<7&H&M4Ri$tf!&~g209AbfQR{I;z6^Zui)WmL&s^E&ls&Oz6-h=RE`D8
zovWF$*`{1HwYp{(a2kW?Ia>$&c2L$)Z6n@^>keSo66UJ6-EZlvtJ(r3u#`{r`!e!Z
zgBJ2Fy<C87#&P`!`S*#APuP99+^9$ll!6D>*KmCUn2QXfoR0e09rapAePc(x(^cQL
zs(w@%u70p;XXQf`4_oiISZ7q$x4G(_TmWJawz=wSTo5zrMLzg4D%cLbP%rB>3RSEN
zPz~^x;G;pLSy#R7VGE2rWbLlkc2>aXgO%=j=lxag`nKWf-um_gFU&=?5ruWs*Kl9<
z!A>m(9QoXmIdj-GE(CWqAM44yxXzvVjeGd@lYIl8os|di8gcKeb8)ibQ49?E3%`RQ
z=JXnO<_-0U%DHg8`oySv$7s6oK;;y0ihHE;QAhwv&2sN-oej*rbHydc^ZD%eVP;I(
zb?bhiy}YQs{6M8$Xa|M13QPAchj!+<>?2fxk;?u;*3K0_IR=XlNQ*zB;2u6_QSaSi
zA)S-%7qj4P$?E3kbC+^xpL_V%?+4XbWoj&+2d%rxpchsc^x|K<mCrZ7q&6<_0q9a2
zFa9}zV3!(O&_WX3>WP<v2o7~V^%BN{ZguOgy4AI#r#$Kv2ZCyR>^MQk)wRb?xz#H;
zb68z_hL|(R{yj&|scZ9uoFmy6pnitb&vdCXpH*j`g;?%<D+dZXU1~ggJm^6w&^$((
z$Gg;72cZ3)OP%>4(q~<2;~90}G)Pt*csr<Wq6-Z^aRT%-=-F;$1zl?56&r-k1=Yl>
zHfrXIE2+IM_2<;qpPzu#IfCm>=XmSS{Rg;nd8oV(A#jL3BvWuzT{~2})qxx|H@yY9
zv15=s?jpoRh&l$Q@g#}90&_0)#&@Ws4<Y*6agslQ>@ythBA8HD0UbC+Rqs*<;KtY9
zqYQ{PW?_De+J}s10ls#Oa<XX88#Y95BHDQTlzPuqYu&>?s6pP#zo6)4ihe}VGw!7?
zsykZG<@4^P2UDGNCvju;1=X!T$Q(X<`0N3CJEiM_J6=l7+Wtd&y^YnXPyNQW{Uv%W
z2G!^fNOa#@wqR!F4etEDu_yPN`^WGywl&l|I(DTxe;f4|^{HRmw!bJ;sg!;Da!{SR
z(w!f5`)W~#yO*i|GIiCyU!&&H=aF1#B=?<CZ{qT<%t6=LL$0%<ykh?t-0^(s$?ebc
zihX<kt~IW;t{YuzTm${??Qee(1-dh*GIup}xihJT^~3o}bw?OauzTe5HO*(9qKRhq
zhlqPJ&jPYZ=s6BO18BB{zQ>{G0BI8X5+HY`tHJ3S{?MX6cmfYt?~d2ihu-9gy!y~D
zDY5he^$RZ}F??5plc^6qkEDAiRcI$Fl#P%Lq~`5KRl2fK!L}Q03~blZa6`Mg<FL?M
zo_!0in~|=Dd0#(!SHtqmtJ!aXhw*tVy~sVX;`CCC*O@=K_l?bb`l~h#75rKAXm$zN
z9a-@zGN0a7KsBJAOg`I4xtaYv!+8uHuc07M=6wtu;EkapLuF?QL&wtJ;ZeXN@y?r^
zUiiF0Fa8@)K6ErffQF9UltDwsSxWGsW7q$=@!}6}pl#=m7tavfO(R74@xlhTe9ZCU
z?_lnKZoCL$yy*YP@nS6I9;tiuYR2|j(50THRl`&CHqT};xqXsSxe>(ZcXvR@Ju>6z
z>mY~T_;?|IBhRl}OGdV-Fw#^o^0yGmey1SmBFR5`fcg_YK+*H3c{1m+4`VjIV^rPo
z01XKM8wtEqfIIm(0ZDh}SDwu8|CJh)wTsLzQfB6s2Df|XEe*5x;sIe#*D1_{$1QsS
zD(%j^mYqg)=Rm{Ez3mIpG_GF<%y?K^_8v!ryE4b&kZPcN_ki_e-b7JA-a|Qi_tTR)
zl4@`QcpX4<K6?xVL(0}w?vcA1+}Yz`DWzq<hXn~WrCQXwG5eT6>VRYeq(182{tOL?
zpL5;fy4AJL<-K*kXJ@z}>_!wHt5fG=;?=9X4*U1)_GEtResPT3rYba!cp7GRVc41J
z$z0Wt`I+lqVW%s*6=tc_>3q;Q;m#b-eqKN`0A<&q4j*M>nNWBE4)SEUjjILp6F}Ly
zAH&8O(bflwHcknMx?uJ&rhgw*kM9bFl|>u>D4<nj<B^YHBPtXuMH?FhWF;Hd!-gm0
zZ@8&v`v<g6@QnByoKMo6zf0{s+MRjR8248}C%YO^P(9q$&_^21?j5gh?F{bvs8+m<
z5-=0JMR@X<!RpzU(F<Jp7t(u;65arwJ&A~SMyke5>JElUz`N=#DDhqO3QvRcU3J~Q
zy>@g$bjv+TR)=`*7awAx;5)eYC-k&ljJ&6QgW8WR8;`wC*uwGHj|iJS9(w^8^?z92
z3;(ce1S!mQXQ8%x{{hyGBEC+l>wup_r!wL6BiGnU;PlEh;eSE61NaN$arm5#%>w?!
zc>F`)c){gv&*$?rFLw{W?abW$wmXwP>dve=@?1A5vgf+rH^^y&oFw8NzWWGM?|TW!
z+`kY%(tX4|f&~Gs35MVAOofKu?^So~Le}uzr&;<6A@I~RAW#1iO{BT~{b(=lTHSNq
zr^yY&-5)4pzTvx1vh=M|{u3mBLdf4}$hSY&eX=0GX!!2;S=uS(x&7*ChW&+x{0uaq
zAYZj>tYR0IBJSbtBQ?W!e_*-CCUmiGQFknYFSx}kdA-%AQ9j%I0nls~-s6?C5vc72
zme2ka;I3WI@8_$KVl-}f1%{?)J(*W&hUVG1;5%3yuW|3R6YY0pKJ3Yy@mx;suW|4D
z(3<UsdH0TEDXsbEa=n4!(UUpTl{w+c&-{gZc)#La`pfiN?%`)@K6k4e)-VArtWQeI
zKeZnw$^J;?J9zGBiDt<kBK`da&QsJ@Bjj5@g?CbZK?7oSemhdc4^d|BC^dY+T;j>A
zxkH?vPCT5!a-*EDB_94~xjme>5RY!oa=SVIdk1(t>nyj6^Y0Rm0g&akasDmh@q%Ev
zIOl&ud=2sIIsY@_=@Ue*kMl1Pj~N}ywR8T5#M7tKoRjlEAf7&%<}}VfMLgD!ENA2V
zcZshjo^k$L#H+-g#Q2b3@EGy*nILzR^M6h}^|st0&Obst?kC#M`3H!nPoKFxoWGZN
z8uD_xIe#bd^r<tqi}MNM=@Vye8|QB$o_c>S&iM_*(<jZ`dd`2A_&LP)aeg)NG#SaY
zbN<uB)2GXvlk*+K&m&&r{4(P4r2*~dd<*fPB%X2p3gYJze-dNgMVD$wGIq0|?>G#e
z97N$g>In#fF#7G$%xlyW5M;i+gkMp@uPWhhF5x33{N@s#Miz5<UoGMPzJ&j73BSLD
z|8WWbY6(xjtx_<5yC$AGFooGuG)}1lQ+V7U{2Ll2<+p0O;S@%NJUvi6i1Gws+Io${
z)S)QIa_9k~u*aftDt|kW_UVD6;H=g-r3Z$>>%#DR=khTQh<uGR80|WZ)FpzK?Vt`w
zLH^wjbvOz#Uw%`cE%d1aQjq!dKvH<wsc}jVB!#yHe=={>Ctjs-99@&wZ&dJjGC5!H
z6PKEdCOt6efk_WcdSKE6lOCA#z@!HzJuvBkNe@hVVA2DV9+>pNqz5KFFzJCw4@`Ps
z(gTwonDoG;2PQo*u?K3z?`-Yj_oz`p<$m>2ty#?)hZYLDM9`Ij-Y95L&`pBgFX&eU
z{idK#2zo%!!-Bpk=ovw)T8wgM3A#|wC4#OL^hQC0f^HJ@enGz?=r;v@LeK+(9v1XX
zLC*+Uwa6&H6I;qOzPV1@gv}Dz&7c{Z0yV9%RS&ebINO%`moCzsdQ0=-4Oa&i1%v+9
zpsp`zb1rr+Zf#o<Xl=V3e=HP=ZVV)rY22L0(<O|sRh^y7v_@=#zg%l}Iu<#cT8p!#
z8LDksBbV)o`W%-FvHmsv-5M;VH(#z@?Z9?IerJs6DiGs0L*ti7dEUV_CK+>*ez&x+
zZp8AJOLHz`%X$8Ng?#yYCjCJG1^Ig>{Q<;8!KlwSk!9GI{q2#7gn=35twR$G`8P$s
zl)vwuEa6MdxB!UuE0ibOFMsE!zZCN0`e)=S+bj<wsmQZ0qzofxRq*XdHOhOCnEDq&
z>$7D25=6=;mLk5uI$E!RVk=%nCAP}4iWXw%j}0vR<b@h#I{^p_g$1MB@1gVx76e|S
zv?HEMf5t=ADs|6qk!h)^v?9xTD<aF0E+7?)Ib8i)AeBbR5SmK)wu-qRRU08dS2e4t
zxgGR^`w`Xt2}L|b5wafSG2IJ`QbpZp%C)$WO5=J;J_bX|6+C%}l2`Gh4OxnVl2wNg
z(M5$E2Aw+PF*HMQK8y5}{giB`B>(AK@xCDLoa`yeqxD&m>AYTK3__NI^60mLtJhJP
zom6EC>Q^{R_4W|^GFK(`Yq)H5$CMkdL;O0fY1@U#1z;%9ZuaRRrmQ8$RfC!FmqKEu
zvc_^Zo2d*~wj%zF<sJxJDAlPRY*s5Gx|FNMo!tr;cNTFw;4W0G*rSwbimhszGF`FO
zUsS)Keg^uj#lfug*G(mmtV~k?PoZoZ{wTt-7@5naTr;I-#&vTn_-h7h1Di7C+9_6a
z|B8jIX9ibcwyHH%MT=%5Fk8(H3B7ulm{qV@Fj84GTcs#HGZYxEz68c=RFp@>TyUXs
z*&KpvD|xX~%r#q%zlWH5A-fEu&cUvmW7PSwIaGT4?5}nJuXiXD;ABv(EY!z>CfkaJ
ziJdkDE}33Ug42;xV>p9yW@PSFkcUlF58R7=f3v^T0smYGkWEvXAwG+0vV2ao-l)xl
zp1KjJh3~LZlt0whG6jJIvDGcPCFu{ulVXQ7!=`a;@ptqaf68-#MUEWSrOIim?aJ%4
zYd(F+C5fcfc6$JS4yq`Z!i<&C9~zP6x`Ji!EQB5Q4j;0x!3&p#(J=Ut74Riep<?-R
zq_ys2N>5_hRTdnfps))pa>TX%lqr@9VDwj$R(6vpVO&L;(#eE4fo0I=lY7CF>14pz
z2;OvcQz+^Sr~LuO0^y*&SZ{3h_XmQGA?XeIkXrXM#X9|<HJY+c*La+=MweTse|ps|
z%u-=}gQDjiu{6%H);f5N`H=as+aE2qUyl6N%M>?vINB~pf9sV36Q?s^FE^dF5W-0S
z)C4b13(&XllK{LuH{f6ZKQ<_$(SGI|(i4o!Fn-bj-nd$4bF=Xp)$EZ}IvQ9p7>EWE
zAs<*h;Tu|E99g5;2l@XSS~NQhuD}+0=-GpyZA;-e9B&f<8!`54+QWh9U}|WE6Hq#O
zM=0vIr}Pcjt`1&LisKn@N&@!fkIzIW#4!SxXe5X1*ddw<_-N-OH=Gy@*s(p+j;)xG
zil)d`0!Klhaf7I8Y6!fcE)-)4)D-6vpyI(eZ7-!7P<^;foK=uYY+2E~<f`VyWFH3X
z*g_hOfr-YTNfjwr0@&BniMRwWmXHc3?bwqa2(O@95Yof;z~+DtcZ6<?xFb+br0A?4
z&DF29I2Jc!?lMk*4sXC23UPWEQz4%{5y0UWc#=%_nXH>TYnU2HFJGP*&*2ygG@16b
zj(^y3xC4LG^u(ZV;#}&8)I+=})-)801e){>=}_3;l!!-+M8}#QPgm0*4j%B?TO5m;
zo6rZDID-OaH_0);>y0M@C7ts8`f$gf$Z`GG@D^`TI+_d(Mgz2K*YMwDr~P=HcH&;i
zdviCuJ^tPp4#z-R4D=_n6eJ=?WjQ*@(O8a5QchAsyQnYq#>D(X>b)m;sV6B-w<xsp
znBFaGWHMfWZk5TjfGXH2WzAm;ra;cOHP9|H6M`yiolKf-4SWn+pOLmw1zUJ{j9|9)
zne%M9gjBeoNN??NL5jEqYGKa5mg%6t6xKgZ=f7lV5>Ub3Dcn~nOskHGa7;%g#6JaG
zVN+Rrnub&{@EJO$zH&`*t;iy(;N+opUE>JVgTnQ=2w$&jNEPq*%}8UuQ}{bnsvm{*
zxGb<Q|899Tfe5$ZB0IZv4Jk?+VKYxpXR|vs4ixM_K(WkX@_T^9?}5}rexMb@MEF;L
zD{MyL?`fr-ua%Mi+cG$9-BK(?f6CRux5+*xg|_t?QpLb$bCx-UzZ;f0LD*rIqTeLG
zMc~p868}EOXR^}o9|Q1@#Zq7RJ6<XOS&plP^{>Qt^ZK7(&mTfyI__8LD(GhdmupbF
zF!XDI%Xybv9K9p(JqtB1$@lLgbq1HlW}#ykIL}1%3Dk2MPH&V8Rq4-#z^6+g6Kw--
zo4`&taLptqlM&$b43vi9GWa89@NWU1s6Mn(o(SJp27kT`{?jt}8)fjbWpG+#(|z1?
zsm7apA9xB)0;jEk6!;%+gL*j5p>a`v8T@l)a9Uwb<cCBVd>inI&U^bp8F~5;GLd~+
zc29IZ++UWF|3(@7pUU9Rmcd^sgVQIEiR$@Q8T_3x_?a?zC7$1j?AMpU=a#`6%HWp+
zpXgjYC&!EL1=(4a3;dw2Nr+Pyw_nHPewB9gR|>R~Y@+(FD}yr!W|V<MoH?RcY&cvU
zJy@s=GKYUlG`S_h99Z5s#0SrDj5lGhLs+O7(kURAuIr>KQt@yKsTe<_%@G(9=i?3e
zAt^cLNCh@SiqGmDd=2N|YQ-rt<FK)UV2*J!HeRwG31JdH7=sm~q*x^Gjp1aum`+P0
z2d#5Lfh5iL<RZu`7eQDf8kwYT6pW+fn8SyY@sI_Jfeq<Fluak^05@2E!k^6AuptrH
zBy(sLDAQPb$>ex-I?vAm?_xy=Iz%V*@svU3!l7gZgcZvMoHH2Zi=l3}cjanVZ@1Ue
z(?8&)GYI`LV^wP`Ah7zv+Qp9p?!4aEhTE`+bTlK$oy=v$Vi`n8%5@kUcK}c{=8Z)3
zILeHhmt3d|7HU8p6T8~WYl;GCb>uBB&W-uC3FZ~zhbq#YlPCDC^4!qj^3bt7{oMoJ
z0au5o8%-R)kQYX{MR_cv*+$73jYi8wtNqEC7f0{;u`>K$et4tzv(1aC>c;)-f{*%|
zTj=b<4b;0+%}c@zBCS3gAQg)MY+m9`rXziu`n`Rjey3NQ`Y6c%m%|^uUeRs6!B_%)
zBTUa*>4}gNEVRc#^dZ9Q9sitlU*Cmitz4hy8eKPE@9OR8G{)jTCZ^$>F1)6AkmqE}
z^RwlCGm*_@Wtu2$Ag#tF^&hlo;ELxr8ytstB<F<^q4mxbTvA`|yW1mxQM^wmP#IKD
zIp4<@S{~*j#@aU~_2oXdc$JagXAmaL6v~nE^ud(&#7Ta%O5+-%Lcg7J#Wt4KpMDTH
zW722UnxR1zMsOOWP`<3cfcY<CR5w|^+*jxn`f^{PlozjG$+BLgs7_K}?vtENPDOa!
zAm^1Y{kInRq~9Ur<i5#Kp)VDwj`;X~p8s`VRHqhP6y*GW6u%EpxLH=rVEK7uBFRYz
zG3iS&d9JuTZ+yTkjC860?<moi>*M$|Ly_;WHrXik|8}7-Zg=5#yrW`r^Dh5Ysc1lc
z+mOf?O7!J-o0m01g8H{Pl-7SIGO7KteEFT{ZOzc4=gS;Q_5T!^W_{MI8RC~1#3mhU
zu;=-o*3ERQ<ntl-p~`dOOZ_EzNxuPI)ub=iyR$C^h#=(*uE<M#4>HK7vV5&m<D%Nf
z)ZYtf@}1O|>uPyE{%+BJ@i-OgBk$uu=#u}WzC54xp!i9z5qahy>n*V#BVpDrtc%+X
z30aPmFV+8PiN5^qU=#XMPf{w&9LB&9HY~tWeObQj#--)cd3omgYwI;`NozL@&ZbOL
zDAlK*%?OkJYu7a{-R9zi&b=NNv~e7MTZ29o;Vi+W{$G{gBuhck`9j}7i}6vHNJ~M%
zN`FX6M>+itLt>lgKT^KL99vdKe|@*1L1!?ULkY{=W%N(3G_>Ttcxj_lpS~LrR_ec>
zxDAbgk^w{NOX?N+(!Qk5UL&<l0)m$MkG{)DTV~NcLjT&5%o3Ju22kQ+ZssI?s6^1j
wgG-O&MYv@Dlkcx>(3gFW<YlmlPrLLT$Q(*oMkoJFr2iPb7!jT=$t>0XPcR{BMF0Q*

literal 0
HcmV?d00001

diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh
new file mode 100644
index 0000000..81ffbc4
--- /dev/null
+++ b/mpi/all-reduce/frontier/8_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 1
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
new file mode 100644
index 0000000..a9b69c1
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10
+0: Local data size: 1024
+0: Global data size: 1024
+0: Number of GPUs: 8
+0: Message size range: 16777216 - 1073741824
+0: Number of iterations: 10
+0: 16777216 0.049728 seconds
+0: 33554432 0.099497 seconds
+0: 67108864 0.202129 seconds
+0: 134217728 0.500335 seconds
+0: 268435456 1.560791 seconds
+0: 536870912 3.265382 seconds
+0: 1073741824 6.500534 seconds
diff --git a/mpi/reduce-scatter/128_gpu_run.sh b/mpi/reduce-scatter/128_gpu_run.sh
deleted file mode 100644
index e0a9db1..0000000
--- a/mpi/reduce-scatter/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/16_gpu_run.sh b/mpi/reduce-scatter/16_gpu_run.sh
deleted file mode 100644
index be576de..0000000
--- a/mpi/reduce-scatter/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/32_gpu_run.sh b/mpi/reduce-scatter/32_gpu_run.sh
deleted file mode 100644
index 04a7f0a..0000000
--- a/mpi/reduce-scatter/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/64_gpu_run.sh b/mpi/reduce-scatter/64_gpu_run.sh
deleted file mode 100644
index 48c7645..0000000
--- a/mpi/reduce-scatter/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/8_gpu_run.sh b/mpi/reduce-scatter/8_gpu_run.sh
deleted file mode 100644
index 5f8f10e..0000000
--- a/mpi/reduce-scatter/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh
new file mode 100644
index 0000000..9d4191c
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/8_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 1
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
new file mode 100644
index 0000000..493d5ee
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10
+0: Local data size: 2048
+0: Global data size: 2048
+0: Number of GPUs: 8
+0: Message size range: 16777216 - 2147483648
+0: Number of iterations: 10
+0: 16777216 5.130130 seconds
+0: 33554432 5.120491 seconds
+0: 67108864 5.115654 seconds
+0: 134217728 5.128319 seconds
+0: 268435456 5.111989 seconds
+0: 536870912 5.115996 seconds
+0: 1073741824 5.127237 seconds
+0: 2147483648 5.116940 seconds
diff --git a/mpi/reduce-scatter/reduce_scatter.x b/mpi/reduce-scatter/reduce_scatter.x
new file mode 100755
index 0000000000000000000000000000000000000000..d50ad5ac990357f4067a380d5a59a5e6b24a3805
GIT binary patch
literal 25888
zcmeHP3v^V~x!z|c2{Ay<@DRaAIcmVDWbz=vfPk4~0uwut2tn{U4D(1v=EXb^yvoBE
z>omkt+huL_TCUexdT-Y%z3Wyjz2zZ_=xQxmAGBJv*2l!~QV^&jbH9I|y(cF_rh4yP
z>#nYQv*GN2|KI-qfA9UDefHV=O!i*a?3`y(6efd-&1J;Ry;$Q=1CJToe4|K!G_ZPB
zf_N$`V@2Q#IHu<}FpVc9tupDjq{TYR%%G$f75Pz<#&aY+pL2{!Dr-oT^ag?^k~*y#
zr?ZkWi_Vi?AufqmU99m8NuxqeQllQVar4DooNaE>^|JH=9WVcyPNg0-i}aondQS;G
zNjH-Kf~3+f3*S$8seSp%dq?Ok8^zW6y`of`(2I`I@#q+hCnRmxVJ4~c;kD4Cdj3}f
z>C?4Q)Mul(f0Am#pro?A%3xsmjM~bew;~t_Cs$XjuAfmcqt+UaSZlaFx(#%%=3ldz
zTQDOQMQ}b}L^@-K$fWL|4S`<-jIP{J4`3*KejeNfe5i5*dGJ+v@U?mHd-LEw$b<hh
z556l8zBdoPFAx5QJotxs@Kbs40^G2n{5CcZJ}D1AH4i>34}NtXd}$thMIJns2Vav1
zUz-QNKM(#;9{k67@So?wU(AEQmIwb7_iGG0mu+!tNHHeIrIC;)e5YF@IEz04+`=ZY
z2De6VHvF37=bnK-FYvu?Ei0=hOnB}UJnIVH4aR>aj!~YTtEXhFYw39;IUr5rh{E1N
zc~p`<t|;t;;Hk{mB}FPjS%-m9NS+%wi$)v8FyK&D87`vd;==Q|0Za*D<ri!`zy6b+
z^BbFAJ+Ob}((`}ym;3KvE*C~Pk1L*V#}clPI}ira5sHKb;sP7<bp+xGU(Dr?xkElz
zAncC-^{jTg{DH7L7`WXB*6SNetTPa8^@TjqH4LdmzJ%SkD&X-EGcV@jwACF9Mm&Vf
zm)T8`WH=#^=2$EeTa<_e!X1R@`is_tJ)N;gm`aW(Vu?slkRWSqYjL%N0|}mJiiAS0
zm^-{;fQ#!o-bgaR{85-q_*qD=LOg*s#F+oKSRmo!8uMiH$ZGqVsL$nzgcE^|WF#5q
z0s;$#M1c#UE>FzohC2KcbFXp5<Guu3lkj;GOqAyL2b1wm=BM`a+Qxj|q{ruqd)x^)
znPW@b!eFC277Iu)5eWI1Z#A-9KCe6BMj^}Nas5UC3VX>5VNWQkw<wxmxV7$(7k<-U
zS#{(>S1n#$bkFG}HiOM~wlp@mYOFQZI#wBpCMv1kl~G@%$K$K4o?)%BR`DcWd@gzs
z2A0pNjNex2hx-$#S!F!x_N=Jzg;&KZm%HP>tci+v&=;?+tGL}4o=yfTs_VEduR9j+
zbcZWqxW{Nu#dK?}wW=a9!xyZnTOGTt!{1SN>x{aXH`W=8PhWX!v^H{UJRa4{&0oC=
zPtYCis2r%INLD1nE5ec6!s6}S5l+s^*6_?meswG4z?jDqBp+52Bbr5g74f%cb3)Mo
z?+YB@o+qRu5p;FN-C21rH^sA8M?=0)He+Sd?G3pTomn6h@h0I~ZjMUw2eL`JbyAks
z%Ad_jW>cPcGO9Be64=st8WBGC6T=GUml<%w@WMI`IL&orh#GJ?ccQ%22At-zGRX0o
zFdCa=kmEJsQzQslzf9xY2*k#68w|LyT!#TSmfL2)jq}K*2As-}VVMD!?-t7IG~gJ7
zvmt80OR^~ZV8BNj@bv~<Op$r+y#{==L4K0~FE!u~8gRTMvSG6Ur+11BTMW3Eg7d7c
z2HZH`dCGu`B@36|X~4%z<Dh#D_}K=0p8=m>!1o*Qa}4-927ICcPaE(_2K<Ntr}v)>
zCk^;{5+v%@C}woe<=Tz-Vz)+w<6_Cgd5c@)l;q_Sihzl3jR=>Y%u{-#xiy|3c|FBh
zR_@k0!Y|Yz9#^?FP6;>8>+22p#au#<XpS*dPz0FCtY2rmq_lj7GV2&KUh)}|&kWfy
zUV}L$JFk+@hJ2=|{(4xC5(chJ2IU#9Ym%YmzL*yAYtdN5<HP$hu7#5Egobg<)2W5C
z+4I{L$E_u-6(a5qpGNO-4X^eN-z;rP(5qEYT*BtL13{m6mWCIrhL?FHrcH^@n&Q<i
zLGC3bY@jqU2TiE7gjZn><JAJ;XfmNi-89F+dzcC%!&9s?{94?H7r!@N!WL8E(2Hxc
zQ8iabG#Q_(%?<^^u28(gMU^7fz1qlPHW70t0+DchZV8(oj4XEtHF|yXHt^b)FlWRw
z1nOvM6RpsjK?RpEtF?0Ql)>sr8d^3`oZh}xo71_V$<^9wYjd^Qmb&IJfwV7dX?3+b
z7B(+(EO6R^H!ZYX4-tsBwYwI~n>PnHP&v<hC9T0}jC&CSMHKXN(7QorVt{xIbQ9<^
zpc)?Z{ltSF03C$~WHvg_Et46$3G^+{$EY(Q$gx7XeIZj;Ta@!gjVRuPJQ}0uIlK;K
zRFGif2n+F5xFW#j5@sv4++%7QU9<*DU@5;1S1<Axf@bqgtz3X?MsYnu^%f5RVbq2O
zR3rjQ!HMfZTn_`ggAAjb#?pz6rCMWYd1Gmnt+alA=}x6<#KxlD!utxoWxmH`zP_-u
z-d0-01t11ti><WS1~I)}X_(oI3N}!?WxdT;DpWB$pkm;kVSwpCn%PS&-!j3-edgv;
zt+xP1Hx@cdtL`asl-75RXf17s@xp9W8&OzeX)*U@8|>7;-vwmH5H<}8{!P^fTT(CG
z;z+&g=z8x+Tf4Kj@FZR@j^5ELIhpe?28PVphcNUUTkJ@^uRc-u0=j|v#7;-|&Sd$X
z!e4<?96g0^LIO}?f}^)?FEB^%oJ$X7GU)>_)30p0dAHC$FrYnqPvM`0_TxfZg{7L=
z&`v#<ewr%KQ}~^1R_~nG55nR$Y4JA{99^eP>RoG0q;tgaQX0G^KBD^hzKc1u%hC1c
zlYTW)nCQ=BKu1s0>DdK3z2MD{GMVa^)$%DG0Cu(ff;Ry8?P_F74M{YshhO$1Sk=kY
zI~emj)U|Ilt4nqsb*h)`@vG7PLj)aCmmECmP%q=mYwD8Y#2iQVXB;`LF3Avbnq*&u
z`f*Y}Zdb=YtByYbvA#2{?8)lb)oA*V--%M7d5|;@+0_Yqp#7;$9sdQ=Cv0l@akc#z
zNLp<_;8$1Cg$5r!40;UoL^HDdHZ}ID1wyC&YV36jHFM78)LxtV2DSB#!;m^n@aSWG
zy!EI53hs0UDkmWX4$&863a%R60M%x-y$_nJK7w5TLC77l5n>}m?FUnSghXG3Ih%Uj
z$5hf65Pka)$sb1caSq!FCe)Ti+mBM!?P@#R`1Yri0nzd_%=c6Kknt?Qw+~WI8tr-C
zg6IQ8D-Io1*G*gE=z6*sd9S=l(JK`FhN5R2Ghb4>>rQ7fj+uKCO>`%5V|Mw~wNIyB
zJMr3yJ@j@;j`nxIoS3lT1$w>pm#Ev`wQP8qUW<M;{4|N~`pDu>jbGr%?CO7Nx3Pb8
z&0=c<)jRtyS0}Hh{-SRCvt`3eLX}F{wW|-Rqvkm>I~|@9)Zxxq>XW19@A@-p9)2Fl
zd3ti!QFSSox25*lPVBRt*vTvQGk^E<iKjL^&nx!R-J2HMme{VdEw;5Ua%?zoFA8i)
zok-nL=5VAEWy`uUg=%*QPjE}my5j2N+h_us{tR(T>RCV*37r7sNbLqRQ9|!=sXc%+
z3GD)8>-xf^Zv2#<t?oC}`wk${+I?KT?=MI=XC76*@kb<DQqMJaXVm*%B(YwqOfM>v
z{wFF0sq=TB8us*~g00ore+Fxx87!++yWbFc_Vm{P_SnnL|Ne<P%IvAv({JIm({)GL
z`7Alz(KBbuOq8BF*0Srs`0cM-H0DgbTfH-VE7|Rt(}T?IJF{pzP-p5y`X<np)H5x7
z*m$36&4&%}`mk{ym7U5C8*@IyV}Qru(+@a3`?os1;76c**q8<Z8a5uI3>r2*rvx81
z)X5mo|IdvVpTCdxojG1SLvS;V5c$Uo3taOR$BQ4q-2dKqf%dGM*8VpKj9LsB{Simc
z=&xVF*p7whSKDbhv5j8i>2okSc#2Ye`w*k^O@&ZP&pF%IK@MH=b<BSC{H;8HbP*ZZ
zr@}~O*2v2cO8+S<c@IfqgtGq+Mkrfqypx8gC+R8FoT=mKUt>Pry;JSpjX{CHassyq
za1Vh!T+&<C=1hIy#2`}dOnvf8YFqkRA@(H1oW0(%i97JDu<fH2=EReiu0hqEsSne$
zKpnm9W#e~jnu-Q;)ftrMNtD%I-8=giP-5v<V9b$v7K6vyi|nb79eB#?@%sM$iNB`o
zWlkC<oQ~9CN9u)i33T~0@9aTsl2tqA!m0MuA^0@|BRd#aN9rw96UaeACXOM2(_r5M
zTb)UN2!hdR?R;uoTRIJvw=?|+mNn##Q}om$uROg`ASN_2y@1qrIyO8*1M7{pn`}4R
zZn3#;-tFuSmIcWL(f&y^J21=-9LgJxD{m^RU-w;Jf&+Cf?{St*w4;IJovCSMj?_oC
zU&5R{y%N@`a9XiBQ-|U2bXY+21vhO&Rld~7G@<b;_*odaR6zd$D18bOz%MoOp=k7~
z0VD4T=r%I)<X18BsL-e$F!DnI)sT_cS21$4(4gglXwD4+I!N87f{bkV9Irz56fGI-
zYU|17)M@<{(g4Zy*Ae;E*X(6W@eW$s_@!;{heb?fuM?#m(AgW)^n}<lFDAF^w%-Gu
zK8%QWMYVf8#)D7Pn^4RrHubXKeWKp7Yez`$iCdMlAA)@!e}OrMXYY>5fVZ2`C%5&X
z#{IJfWBr6p9gO{+u(5-&SAn5NkAmeLaHsB1Op`~Vo;wa^i}*gN-U9q6x{v{<-`o1<
z0jHO%0e^^aEAZWeaWl&3p8)({2jld+7v7J3D>Io4P3aw72dYwc9&n_R?>JJ6_dnNs
z3I^G8%_ntoOeaT(IJ)lK&(ynLMzZfo;(MC+J9@B|pyfo@$)-f0>tw6iy$M-ecOGNO
z%Y?wTF+jH0z=5<Z=s|mNzeYdTe2m=C)qG0n_jKKPge7m5^5>AeUC7_4%QrmNd?YJB
zz3a}CELkPxx&51j{N=j*I5Z(EU$m*eU=vm`j;`kY#a(xvGOe=+T`YOj?&<Iaw|F_P
zx4IqWvmK{^rVqk<ymA%-wH?4R>HPpVZF+t;tzQN$$H35Zt|j$4&FGxHQ?_Fzz1Y!P
zL3EKV^+ikSxbsr-d9h>H7v}UL*mQIsOlZ|_$n}Xc^%;hAd+M+)GyY9S*KWlz^H0f-
z99_>8-*~eez%XIWu4i(~Kf0T46zumDDp<DDlFpRbNBX;UoTsRPddSp0YQ+k1%43Mt
z$qh&m-$|K$?@()}{D63JW#2x|-%mU|!1{J_{+q<Z>8x)n=kFpOy`1%J=KSr%<Dq7K
zn>ZgM9zz`KThIAhiN`yF^+h?qoOryDS>H0w-#|QluIOvy{6gX}3uJu_oWGiQ`UKlo
z#ra0!>C<bU#`#&qW6jC>ES#?)zLa>z`OAo>Ppo}MFeYTCTtvKu_;)yeF7edG`u1^t
zEb+LXXg}vmh#yD%R?eG<r?IVXGv`060FPUN_H+Id;^|Xq-+In}L_Br*z9{G4C7wQw
z_ATT5ABmqtd>iLqCZ6UleGQy{f%x-?uj2gE#M38~K8^F+h`)e%3+Mlhczjtv`#JwF
z#7`#v2nIadf8sUb-vPhkVoEah6G88&)Oq<$dNlO}1nM9Z==V+vlCP(pVc?r{_{BNA
zD~Dg1!*}QKn{xO^a`>O+@H=w&7jpPFa`^Xi_`^9o{hmvKZm=;>ho;arQsb05G=(iX
z;osjVNjI7TbpQ${g}iVpn<&Z?glWq(E=L`Vf-Hv~C<<Fm8mII?P>}ZN0i@72LgSPk
zAPT31;ZILz_-8Xs<ZC<^_L}t6g@Tvupbkkv{v8r^KngNneyiUi^r=Hqkook0QfRN%
zI33Mn#)5)BlF{pPZ;_sluFC88rr^<2IWPSr)Q8Ei2ZlW`?15nq40~YM1H&E|_Q0?Q
zhCMLsfng5}dtlfD!yXv+z_15~JuvKnVGj&@VAuo09vJq(--!o`#qU}>t2ADt2Jt(P
z+^?Qn%?CvO<AOdR=pI2|6Z8W?j|*B<quZY#=u|;x2s%&D>jd=+x=PS{1bslz#|3>t
z&^>~_Cg=x(9v8G|x+q`Jse;ZBbe^Es3F;Sgm7w<s`hcKK*dnI!&2ri*Y;wS^1Wn)M
zr)lMNZeLwZRsBrw%<1kbcTIKe@+*AP{eEwq-|e1JUsYRGTUS5BS66>2{#GXtUg3+)
z(zrRN(=Lp#`Ato;v~p~Qzf`NPvQD>FX*E?f)ljY1%DHSy*kip^h%H*Ys9A%h<myYc
zE3DWah?_PIm;M`_{>z^{@8C+{Lk0Ss(8QV%%U=oyOU>r_cW3kE?}uZBnEd^a{s3X9
zpx5VdWa;*0f4hGuVPJZBx1b5SykGPq`Fm<KhtD-*10dS3P@ZhR{9XMCWO{M^0=de1
z)3=aR<k=TehLN)h__m^Ah5mMT)I&n+2C{x3BITT!h|e-xR1Fkc>Iw?66_yp$5JNxq
zoA}8K#mWW(5T*(Xdb$6O(hHa$c(KxecohBZ3{@-FJ%2%_skqRLEc4BXO#AIX3Tin#
zg8quQP%jxmqbT1}a4tyE3JB0Of)y991U>5>L?vIRh^Ht*)<z!Fy)Y?7)O|+I#f?<T
zmr?Rj7*a0d$$gZZ#*-FgDOO4r?L$Nt74jPBsF9DN8A{c&NRQl2$!bdSpPCidHPsrJ
zfgKjg!=-7GX?jr%K^;6B$iqJcu3klDHc^!+s1I<K>g^=<6|PF`gSaeo$CT@?Li`@C
zF^VW_3K$BHKzeL|DND$4BfyNiKuC;N7Mt#5<CS*PTEy3y)<NKGsZRA^6Y3DrrCcHI
z>{`IMvxpl2+flKB9;J*?EJb6Kv5L9$ywWM9<Ir#M2P)>$t40w>R>mlRM^d&0e}rJF
zMdqxLSB`8Mchw{l{(8V%#zu~uJJO8qUoe%mjN>ZIQna{ez@pIz%#PrOgkC93Ovu{I
z8YvtwTcjv0;}jSkaUqNst0<3(IqPiYl1T)Y6!Kz68f!Kie}6FQY<3CAXe+yFl3wRa
zCQ<1P6TjC8yws`;fs;YCFk2rJnrs;`OzfDEaLL#aBsdm1#kw;nXI$zo1$o%)^v%1l
zPjBM48{wa`0n!bI_ynrS>`CELy*5L7>N=n%z7tAOJ})g{3IYjY6Igsr-0O?R#ZG62
z&EMF<Z(XGSDa~0XIdYh1D#y&0%dgU|y!yfmV{x-(rO$+u29%3o#?0sseaQ0YtYz>l
zfF1N!53;a%3zvxx%U@anPb?7_C|{1W=3Pu_iOrg3!a)TJJIf?TT=Va7a6thu`cp+S
zTZ;IMic}`!F>wM*hsPuLbjOo%pT42H@`}np*b_{8eT?~n{)&NmV>7?k=eKrBZ^(z#
zyqhWJv3t$ogn6vS<Agap+dTH_`8P3Bf%$!kp1a>vKFM5S<u&F*=2vcic%c1q<TqcU
zIJm>nb~*Z+FBh0NeF1y2=`4i+cJxydT-f!G!vknXzpG^dHu!t7!4Qr1GEb*F#<&dQ
zCmG;Pt2I?u>#t9(B9usmeRDc|VP7oZ0qc%=I_Ky|)@T(S{QrU)tpWz;U@JWID*T}J
zGvPQKZ{q_S()U+Z1byL-MCY6;K*{iJfv~qC;a-ky=iuFOar^>KLBRg{!I|iUI1T_4
zjCf}Sc68Fg2Q<eZ!?6xu1x{M1z*b5~g%e~ehNB+PxDHe`(FtBx7mAS>YKn6PP;q~h
zwu4d)s6N~#&KyX@*37A%F|E3m?886>wtR*oV8Rh-Qbn?s0JgO>A<n@EN=O9b71&$v
z3(lci5O4=8e5-vP+!4Al;*LN$mY}nKG}|Jr##&pAIm;jcx@9;EAxaNpBH*cr`Ea-e
zo+QJ4ChNw|8l(o&%a<nxb2z2~O{N{6gCBMr(!d`zcdWxRbS`y7>LIQKtL%(~e3kCy
z$w1Is8H<MWMC0NXr@gWR2MBm7YOK?%E71p;I7<R%SIIHJ<%-69Ii2#%`f$gf$U*&A
z_ZDwaG8_+dgnhKjR`=g<r~PW3cIaNodvi0qJ^0?}4#z+`5a>@<DM&<)%5rp)qp=*B
zq@1L>_JF?BTOsBjQZFfZsVAvLBpL)&*kYNCW}%y9GAW<}c2rsXL&0RpnYP*+L}pA-
zh20{PMqBM)!Pd3XRw8Q)504&<wyr(HmQ6?nvx@ZA9u%aAL!c&B^+TEV3rt~)2I>4w
zh9&_8?BncxmBO^@7z)R9WJvrX;0hbX_Kwkz8VG!bj;XI)Q_K}vL=~JY)Zo@QLiM0<
z4KBi$xizHpnI+Ndabdob{ku=9ABAPOOt3HiPIw`K2)E)QJDc4aQk2%iYMvg;wlrxR
zkam0!P)rk;{2n0jb&wj$58uv%KLlK1<FbFhD((DZ9{DYK@E@D0J_G)g>&l;#eM|~1
z%QU10LJ>94G%5Rcx-zE=c9?0vZxVk%;L;Bg{{zRzv)u0=*TcTaRGR&}St;-0xSIX@
zS&9Fe*Z<6V{tg0Taldj`L2nCOu0ip~;5>XFa5?Xii=*QLXP0PPj_<Qa>T_Hgn}Cj?
z<2)15`Kad@oZcuGs?wjA10O4e40H}~%Mf<x95u}#CzEt88a)HK;ody>_wwLB13pxJ
zXe-@N_?|p?zVpxa=aK&?4}K~SJ_f5jx{s_(<4wLBJcU|;(;hww{EwwUujM$021QHr
z;LGyhfjszadGK!FL!IkJ=Qj@J&+q2J>C4AZ=fKg&fuZdDBo9vCU51i>ArJn?Ja{?}
z{&^mp_9YHg?pb;8(meP@d2rhPG?e`-^5C{S_&nf4ovYW%@qzb+>_pcKe6L%R5U00r
z`=goMH`0K9Od-y3J#CEtArH>1m}&ZAQDzNe5n{D9wqOC%!K~gj;rN;mvtrp}6(2wc
zF|L@-c47geODBL}8t*1mk%|TrNJaP=Y*t^VI2W(e3rWc_Yr?l0Qhc^=<!d`DR~wj0
zGY;!33uYZOlj9}3Ljg?gJ0h^6mlO#_T@f4~7je^)$x7>=fG<w7K)E<_$;A=Yj(R5P
z>jmSeIA-<WL_B1{qHlS!17*_*I>2?7pX?{ImM@R_R>>S%9m+J;WHLEeozC&I!n;^0
zf_Bo$d_1L7xuD2d8DS-}9Ony$`68*=;hMM5*4pfHwk&FQ(b<FEh`!p@7ZzBRVNK)3
z0d_7|WcjUFTw1G<<WA<YVi64@B<1>yoq1$VIN}P0+))%9HLk}{A*|wnT8H+xi&qz=
z($dK_uyWTY-v*f1il3-Rw@)6^H{ZFS(dMM%brv<ZyV`Ay&Stc8@G4&z;TGkYj7A$d
z2Q=!<7ftuZBQBh;=f$$|@BHLO*A3ORRCWEP+To+N>KZzra5;7HMD>gygGikRr$I$R
z0IO%X;>l3kszt80z@jRbIPy`D|L-S1x?G|=yZn(D`b>}>#@yo|hgh8tg7^m~*Wib+
z`5HSO#C*>J*RWrIjjgq%$v7<kx6?G-(}mX|5Au9%dG5B{7bdc~tV|Q7jh`cMN&Sr`
z4cx%F&pO8;9?AKnL};Bf5|`AM`{T9>U_ahB6sQcUr(7rC%PtR-5o669k@|8!+Py`3
z{z9GLgQp&(oTT)76YX7-yxhO`rqFK?da{kV^`{>ajvMr|`}0Tvp-Ly&@@4%6%zsIv
zy2<k8{=YV%FZcgRdGUIdENexI>Xg+N`xD#9sR*=gq9EHZ%a{IJf;`f1Bq0R3zwxBd
zH|}r5NBcAU?*>M7s=-A;t{W_>&eJb*#vs2bNGyN^=}R$r9=SZHyj=<kn(P1Da`ff8
zdhZxrf$xkq*pTg)&<3F|Zg=+gKY0%PMfiQh7;@|XjU0XXy{G*`U5om+G33_27n#(4
zS-$)}6cqaOd>KQo{&$dR)L&n%aq;y+Uz3g%Msn-_AaH7;>>rz(G%hL6m(TT=<RyI!
z(sZk(zFa47xtN>D_7|NS=s!|_D>A4~QeW<8-TD>vcR-hXC-voeT%N1HS+rj~PTBfM
zKkS7r`A_Q0^I4Zp1&C0tlWZ*OErH)6VbssAryFz$S&o#?)&G5tzWiQc5&BY3QYy<B
z`oR#^CBR&LSw2>vJmi*N3}s{ewPhNY(;7Ho8f6+nu0H+TMws+pgInX$4K_~bJnTV1
z8^qx^Ip|XnRXMoSe<TMdSqhR)7Wz6m5Wiy+X(=dJ=?^Js&8Od}ORN|DN6P1zV|0#u
zE=ID;nstp$Ihi>ubL7!KGEdikJ4YdhW$i*g*MGlr==$wB1BTR>l)g*Jw63AYRrF#(
zSkL2uDA#|XJo;OO{@fhnIV@WZAjieLup{(=6QN!_xKofm50~tJ^8K|Q`m*nlybR^y
blQ5m1Zwxss`(_^fZS-P9csIv*uKs@lq3fIJ

literal 0
HcmV?d00001

diff --git a/nccl/Makefile b/nccl/Makefile
deleted file mode 100644
index 5652112..0000000
--- a/nccl/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
-# See the top-level LICENSE file for details.
-# 
-# SPDX-License-Identifier: MIT
-
-CC = cc
-
-# perlmutter flags
-INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
-# frontier flags
-# INC = -I${ROCM_PATH}/include
-# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
-# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
-
-all: allgather.x allreduce.x reduce_scatter.x
-
-allgather.x: ../allgather.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
-
-allreduce.x: ../allreduce.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
-
-reduce_scatter.x: ../reduce_scatter.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
-
-clean: 
-	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh
deleted file mode 100644
index e9fc3ae..0000000
--- a/nccl/all-gather/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh
deleted file mode 100644
index a94a523..0000000
--- a/nccl/all-gather/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh
deleted file mode 100644
index f1ecd9f..0000000
--- a/nccl/all-gather/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 64))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh
deleted file mode 100644
index 357da9e..0000000
--- a/nccl/all-gather/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh
deleted file mode 100644
index 4bd249d..0000000
--- a/nccl/all-gather/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt
deleted file mode 100644
index c84792c..0000000
--- a/nccl/all-gather/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 32
-Global data size: 4096
-Number of GPUs: 128
-Message size range: 262144 - 33554432
-Number of iterations: 10
-262144 0.002247 seconds
-524288 0.002277 seconds
-1048576 0.002775 seconds
-2097152 0.004497 seconds
-4194304 0.007477 seconds
-8388608 0.015057 seconds
-16777216 0.028550 seconds
-33554432 0.056270 seconds
diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt
deleted file mode 100644
index 73e83d9..0000000
--- a/nccl/all-gather/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 256
-Global data size: 4096
-Number of GPUs: 16
-Message size range: 2097152 - 268435456
-Number of iterations: 10
-2097152 0.000532 seconds
-4194304 0.000982 seconds
-8388608 0.001976 seconds
-16777216 0.003447 seconds
-33554432 0.006826 seconds
-67108864 0.013190 seconds
-134217728 0.026196 seconds
-268435456 0.052567 seconds
diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt
deleted file mode 100644
index 72f0d07..0000000
--- a/nccl/all-gather/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 64
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 262144 - 67108864
-Number of iterations: 10
-262144 0.000622 seconds
-524288 0.000577 seconds
-1048576 0.000780 seconds
-2097152 0.001190 seconds
-4194304 0.002041 seconds
-8388608 0.003571 seconds
-16777216 0.006995 seconds
-33554432 0.013830 seconds
-67108864 0.027698 seconds
diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt
deleted file mode 100644
index db7919c..0000000
--- a/nccl/all-gather/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 32
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 262144 - 33554432
-Number of iterations: 10
-262144 0.001077 seconds
-524288 0.001154 seconds
-1048576 0.001399 seconds
-2097152 0.002078 seconds
-4194304 0.003777 seconds
-8388608 0.007711 seconds
-16777216 0.014418 seconds
-33554432 0.028471 seconds
diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt
deleted file mode 100644
index 1c654f3..0000000
--- a/nccl/all-gather/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 256
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 2097152 - 268435456
-Number of iterations: 10
-2097152 0.000286 seconds
-4194304 0.000523 seconds
-8388608 0.000954 seconds
-16777216 0.001696 seconds
-33554432 0.003150 seconds
-67108864 0.006500 seconds
-134217728 0.012278 seconds
-268435456 0.024449 seconds
diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh
deleted file mode 100644
index 0e1358b..0000000
--- a/nccl/all-reduce/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 4096))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh
deleted file mode 100644
index 6553e02..0000000
--- a/nccl/all-reduce/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 4096))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh
deleted file mode 100644
index b672e7c..0000000
--- a/nccl/all-reduce/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh
deleted file mode 100644
index fc0416c..0000000
--- a/nccl/all-reduce/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh
deleted file mode 100644
index d9c0ef6..0000000
--- a/nccl/all-reduce/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh
deleted file mode 100644
index fa2199a..0000000
--- a/nccl/reduce-scatter/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 4096))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh
deleted file mode 100644
index 2edffa6..0000000
--- a/nccl/reduce-scatter/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 4096))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh
deleted file mode 100644
index 3d297ff..0000000
--- a/nccl/reduce-scatter/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh
deleted file mode 100644
index 6bbf97a..0000000
--- a/nccl/reduce-scatter/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh
deleted file mode 100644
index 21c0dc4..0000000
--- a/nccl/reduce-scatter/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/rccl/all-gather/allgather.x b/rccl/all-gather/allgather.x
new file mode 100755
index 0000000000000000000000000000000000000000..fc85917cfaeee3d0d9962cb061dab34a2359729d
GIT binary patch
literal 25736
zcmeHP3v`s#oxk5q5@I0vCcFe6^#cP$C6fms2IP^+119bSBP2pq947NfMkh1FJP_RS
zGGIFel-eHGcHPRlZp(4)S?addQVT>B(Czl%qsUfQsTMmSf)o_ku9E%z?|1Lyn<3M(
z-E;Pw?a77j{_p>H|NsAe-|yaUa_{;k#{!e0FzHNe9wTngWQ{|0+-EHRE)f8!W3{XR
z{!~`N^1$bCOpmW)8V^WXVbXC)^L3V)K}jzx;=?A5$4Gh&=NOYz=8!1q4LVH(b-Hu3
z&PvKGI!}7Jcy@{SU7~!F)(JUDje1nZ)01^Or|UHykhJbgI(}48sYk^kz5MI-`1}Tq
z2PEAg$}Op^m&M0p9x7k9^o|KVOQCL8)=R6<W5qhoiZwkTs6)V#$~wFqdX&%qsUv-Q
zwu=00740XfCJag{(<=}7J8G)S1D>*gKNw$Kwz{^atftx;4Oy$WJ!%Fj)51mV+=3at
zNP_d(Jkk+On<g>_oR)^@%w!OHGF>fzq40%Sa3}Dg()DM-S7pK1Wx>Ce1^-SK{Mjsc
zZx;OZEckD;;AgVnW;DQ1^-z!nACm>2lm)NJg3rr>JF?(QvfwMS;P+<1S7yOiXTjHI
z!4p~VZ)U-FWx-#_f|mm?W>>MDE)6Y*`H1_BH3@v5OXG;d9|vw>6Iq48c_h3iIerzZ
z%fSCh;D<7B8np`kk#A1>N2rX+4C(UZ%YV8A%?S$IhqN^0MV})S_V0oR#3Ix3REBh3
z2SzS_mH!IL!-s+=LgSj~Rd_J2Hi9W2tn}Kg*VLZba!q~HuRcDya@jRM`|#1PFsBnU
zmD?GOxgs%Vx62;{(%Bsf3d9LE;_dWDW8R3<7jboao&KON1k}CS<@EW3u7LjmFIbOv
zD6uYoxW(J;4zFPdE%C-0y{r6gFEI-uUQSzFfk4Pj$U+(25Q+z50%?jwLXjo0h(Fj#
zh_1h6P0-yH2?Z(TpxYgQW_!@TGVX2mNZzPNBDO$ap)1zqjc|d6P<MB8&>vgu3f{+}
zu}CZwU@fi9PQrMA1e_6yC75$jN-Y}mgyJ#g3nN1@AM4gL57ZlpFyH+Vf6U7@7RZVr
z^?J7}8e>Am=L^K6UCc+t=aH^R#E)!IcxlYv?PcE8$b{4Dam8Gy*N$jZZwf#`4`nv!
z?hflIhhq$l=j!&LX7rcKOsXVjHC{5*s`R?4VGAA2^$pG{Yn62-D-VTZ<?e`UO?lW`
z?sj|2D{HJ3)(RfPE6Pc)EPu!B^635LK2&fFIV+EbUGDqJyunq`@(x$jn=(-r4S1uK
zGs_<E24|3gvdWp<md6!|cDaIO5ws%8Q#Qj|ZLKJa)p!F{Ggn9M@AP%fytig%#1rX?
zL}#qLH(VXMHyRD=>1NMfnLFSLc9sujQUuH5!TW-t`-9>=+Zl|{PUY~@LSDWVs=x?3
z4%J~LF``(+R}lX|DyBQ^cT$~lfY&@Bow0zkE9y$gd$=hcy*k|O?M_9kjJrJD=$k2^
zJLHL@YPmT|$>&c6DH$otbLC4##Zw`7G#=I&#&$Mo82S1dO-vD--(|o}9MQcO3^=Vx
zq}yk}b0i3Q$bi$jQ#v^h6J}gz$a$FX>!mPg?Fx-^Bj6j;tux@pbnOP*m~N{9H?H@V
z8E{HNx)laoz8fg6%Yb8oO}VfEFG!(`tv28z4fqBFKFWZ9&47y~E|1-Az>5s>j~j5j
z08(y;0jKwcbUO{WTq;xCE(2~{$Gl*`X)P(;J_9~hf}n>C_&5W8*np2W;71Mkl?MEn
z0iR&NlLmaE0Y7cP>HQ|%1p|Jy1c|yd@)@<c++z`+@6w2H{8}E;J&Q}@0g{(n9Ren}
zG$LI7Iv&zJ&86`G$?GA`vQn4M5k5(W(tLXN59JgA4h(EqZ+xV*92*1c88bd|Y{;=d
zc8pItr=-?Lvd_!0LiyF*2Ba|f3=AMW!?P&f-QkUBA)gkGgxua})Em{h<I$Lg*~#6d
z1yj)rTic`70@eZ%SEpB_x2T2}Yo~X%HYMQE%E&KZ3tawy*E3tgTTsKRG!)UML}yR&
zXwwiot$;0RXmF(AQ=$OV(ttVGX%4Q!g2bcwgW-5g3%h8+f|U#9z-leW+q#o40<7*h
zqtvHZYkXSN>kb7yFxXBxgq^512l;n)hU3wB+MI5G(AgdBbW+ZVb*(nCc;QD}F@Go+
zomapX20|UKfJSd_UQk}d0_F(0hd}Mkt)fVJSt(&-p$0AMMQgD+9Je(%TUu<b&KBD;
z=UgU`w#CgY&NlnvrX}{<9F4#m7TdlI5s0_8Id5CAU@jU|x!QaS?TM+q_h3Tk0__L=
z3g``(2%Z4l0Qw@R1q1pp=<hHvO3;bsUB_4>=pN9m)T!X)T%bI#m?^6*%GIMr<ZnkD
z%|SHI7QuEI3D%FW5MP0(6WBb$Y(<ubObvy3YoG*{;#=`-M*L#XRJ^H$3y{q)o);0n
zNel#%--M?Y8F`zGz-E2Xg!&?_zNoamsKQoMyRc}3xo5=IyluG;=R9J5$YeI<71i2`
zD!2f|AZ)P}<u^bK`f!wwqj-LXKpmA+=F@zOLK$fUbOFy9)JrEQYb>%nVuGQE%}qtx
zwj3DVnrknrcqq?aRNFJ6rKm2#6SPr|M1u82`MmDP{*9Pm<hUhsMz=l9`L<V{Y)-s>
zk3I2$z2~>5TiYDla+7#%*tZqBIGH;gWf&N@2d~;s+wF<t>hrm`qN>&B_t`h^i<chA
zy#<_N-;(<*Bml+6+qccM1G8_NTX||=Ao(uL^efw!?-$yELG3vQa+eG3RYF^ZrRU~A
zJ8>}i9A#ijt~(XAZSMOgVeuJh@fYOmJr_;tgKJErbK3rT61*ikqVm<g>p9eG@A=II
zpBl=I^$!ey7EagcxE!5c`|gQ>fyy`3(kX5LjcVz&?*Q;Ms-Y=WB+;aP@`ex2s$NIE
zf-#?6UH4v-x^&-JhkD}ypBnBzMbIgA>B+Np^+wJdQJ0=0<{YB`z>$mU(g8v)lI$;`
zevZ`7HL7D@QOBN#Sl^{q4y1G%)o}8Z&w*5+d6G0wHLBweK>IVBI`#{M&)d|}b86eC
zAW60DW1qT;9u)YKPe4BfJ>P^VpG}SY$^xN_J~i^Tg^D@%CMvH@eTT~W&L@z%NO0k&
zeZ2G+{}bHB0jOMn5IA^WkSSDEVI5SP)V4lot~vp^{*#b9Wh2B!h}sXP^fZb73g&F;
z9lxiPzJTcar%3)2M4#huBf*5)QfS**%6g;PhH8BOGm3y{X%go9seFid1>pN9DJF^X
z9Jj#x2;P0C&Z_IDFSYl)l#jT#-X-rX@_s?y%l28XtD9$D92l_AIuvW5mPEt!`qXtV
zC61gwa{d6lo#KVQ&2PlUZ+wkjZ~X=8?hh;*-=No`PYu3AqP-_9zQowu>;t|1FYGt=
zk9ONq+a0!cTiX)*#*gh7jP}IY#ET^_qU+cbKPuS=+B1-=ZvH;Tm3_-!=U1NFP0PLH
zx3IWtPP`1rBB6~OItXZjgn}G807#S2U4U#opPSUJ<uKW_c|d)5Di54fAHJ3Xv(Bqu
z9ZRVt_HTYiefSCrZWB4$rspU*6-Mk}wjwKg@iZn&1VZ1#bYwD+##v975Tb4#0W%I1
zY!|qe#AnUD$F&ysIWqhLkYqawwPh2x_wDmKmUug<0&PqjZ&v^LRL_e%lEtg-Tjn}v
zld&`Q-V<Z@e8r-%%6-X~A&QtL#Ml!jc#Kz%c?L0##QEfNpv{ScjzrSlo1CO>N@Bn}
z_#DvOd$J@>bHF#!=Ya6<(5KP2OLaPq>Gay8ABj1@34rE+w*jDkhNf6ag3keex#OS8
zI>h)t&P)Es$3Ijrcl^WL|K0dUdH&P{P>l(oKV;uh_+K}pga1y|*dAK;@21ytvJ)?U
z`XI>qW>F8?GWQ6C>|4g{Dbc&&v{d{=9$)x88EGXW<tZbRA(SjjNq$0e0cM=W|HX`B
z6I0GlXq;_FN{+<2<gc;B-n>uU{Hz$*j{~<SP9<M5;#;KrMU1nb@<K*L{L|1&jmAQ<
zd<Yhj`w=afJsW`RE>xJK@>Ls+@5ZDD3wla*9kjRnh))VzzQu>MYS~m$ixkx2#*#2G
zrGi;O%ml%-5|b~OIv;VAECO3gjv{bY4SlO5sRy1YxxfSK?=Gpp)ZSL2`P5=Z38wnu
zoD$084n!yGMR{nO?{GiMXQouS%J;sB)@e+9gsJ7H5Zt=~tRry>wE^V&gzQAyCf)=0
zZE%$X$)`YCW_`A9nIkcf{6{c$QHjZA*bGtGf5;pE#S*IVWP<>I0x(%aCidAkzD#d`
zyKHycmfP;JIhXJEZEsPq;5yYj>*V^KG;_V4w9%xmw~ieZNwyrAIjKmF#0Nj6Zv;)1
zuO{1s`6pm7c_o<3QMyhjb&}EtSQ1{2(h0~>>8eQS2bZbzW1;i~&-Q1@_+{#?6?(5g
zFIfksc_U10U@y=<Ky5i{OT4GQeX3EE<a~H%)Z$INZ8yKOuGY8xa!NFTClYrg-$&Q5
z4g50xnr`zL@MJ$c*-g~V%P}pSQSa7MIHTV9uV>VIdiRb-*Fb04r6ei8eYFt60`-Tm
zws#K=dK2Pzzl|LCj|cwkbi5b1J<*GfMdjOzT4l+H5Q5}<l6(}(x`l_y!dt+nrQ?qf
z{vvQS9j6(f|7pAX(B3Cgxp<neZ$frVdM>^Xye~e$GkyqN)R62+in>#e`aycsDT<np
zC|bSSdp@p6to_)Yh##{j+K(P=`W&(BU=s~akWY1Tnuxt;?NO#~dIQ0}pAf&L>8O1R
zwhgp%=(*4k^Y>h6Q8!Zode(l*;x`I`-Niumyo4H|9Y7dGz%HW@wyD;7nm$+h-92kh
zv-ol;e<jJkCFJXL`MQHmr&IDXde&ZG@d_!=?SCxfExP;|6d@&_x4l1SJ2ou#o~EPu
zJ!?NVt+xnW>~7S}Gf)@Y;!Ql?>K>%e_I?gD`39<vXU;;Pwinny^5+1zZ+~?^->Z<9
zw*3M_%f;rz+q7bIY@703?3CN>+scS8u_eA}PMmYxKoy^F@BPA@Tmr-P%_n18<vVgc
z;z;}fFM-CyC$@pH@7jCzEB0Cc5kFz?c{%^C<#OJ^0yVXN$xQ$3erhe)-;(<qj5*rE
znFbD%{(c?jAu6Tr24+^_6*n-Y8oqj69YVx+QDomSD)N*YiKqJPJIwjX#G@Km-#*S?
zMLcSr_3h&PXyP#*Sl<rL7Z8tu&ib}<-b6g+Q`WbE^M9~{r_V`!Va}f+KA-p%oIgQ4
zO`CnKod1A$tQA>b9p`^dJl3+TuY&V$5I>4|jq|S&k3A;qvvB?;;){r9oZn5nO8jZe
z0|Qf@Bi=&%G0s0jJoU4_!<>JLc(f<V&-w2VPoF~jc5(hu;%U0;+rjy-6HlK$`?hob
zLE`C?XWs_SKR`V7`@S&eBg9W2eg)_6C7wQQ_O)`pgLqoo^wn|xPU7iPW?u#87ZZ;!
z6DU9DZzUdIB2a$L*AxFG;u-k<$r^(AwsXBse`nPxRR}p+=8zMF|K@1qucDTRlkqgw
z8Tp$tcv}X)B!lnF;3FBl{01Y_`$k6m-)Hc&iZbTs=NbIL4F2^Do_<p$XZ&_eJat%d
z7YZ~^sl$@nsS_rD3!sK1M-5DF$4HG+8bIV|mqSh@%d{05=cxmclj+caA-Bt<aY`MY
zoU~5^jNF6~8mBZs$h{{Fe|B*oM0_$&Bf@kafd)M^2|;qQ9P(Sb{JS3ctz5=a2PAi)
zUgMNH961?J1CZRy6&j~B0Li^8_|pS=ewK;+peyqH?G!vlBIjj2>Dt5oa19LCz;F!=
z*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7
za1H!<HIOfUhpQ03SM3s1o>$MbnpdfDXuP0P1+5WufuMH?>JxO8pbrW9n4sSi^m#!K
z2zo@&j|4p@XkL{_U(l(7)(E;l&^rY63A#$qhXj30(C-QQyr2gJJtF8wf;QlwnZ}RY
zX{&I40tXy4{ivX(mCkf|XI53z&hpHf;i_;|RaST0?49BBd1m@tuA174>Wb=_wKd+E
zwKw2Tqx`}9yph=&H|KCP3L|V`L&I#X6o=n$&?+meGprR_RYg@LRBN?TF54V*TW=6z
zOWKz-X|NPueS>zh6^9JbBGd8czai?sOv?KWZsDzUsYkzKnphKj`AcTH%p4xSDHSh&
z-=seXASZv{q(6Wd%IW#}9-?&nvVT1~lrS(oy?amuUH*vZhw}H{V;Ov=85;mmeud&>
z`Q`8a&m+=<=f{XuHkckkP?2|CNEt?zmBWuG<tv{-V$|OXtvkv3BzVe|v*2H0-d?1E
z;^*v~TpWC5IaS2a9~+oh8~l7_BLQ$zg#|s`A0hP|<^!Iu)WIJ`f51c8%B-Fb5oyZL
zH6zNr9G>ZDBaobG4v+W{NUokTgho-kCFd%Ty!#+P&j^;EzY_F{hu{_b6?r^F9-_8#
zpW4Et<Wct-IS&n~l&+xQQ!u35$b*L|IGqPAh*GQ+%sUK^9!lg0=%|rTp%_ZVD+rI=
zPr*tG@}I&L=N{2=vZp9d6eUTfVV;QSge*DbiD!YU^C`^+$}&0iG0sxH9mKxHRf&BZ
zkA+%Hxnn;3-{L7=DNIfQL+-nf9_?q!QmVKSV8(0{5@VHi(^@uGX)~>Zf4gZt1jb2q
z$_E=i6CORv&7x)30Y=NhuLInOj1BfErC71#6)U3^bJ5jBQ;No*-<qt<Tr__afn=pv
z0X&kTE%+k|Q#B%IkGy4M^O*S)P5A2ua|s(ca^6TYx_{18);xx*FiT#0-k?RJ5ttpp
z4GFy>m>8e3nKF_)Xf{t#n#U+GJYo`z=c`DMlDT4>GHoKk1-U%gk;a_O!QV%W8po!A
z6k6H*iF%%=O{CQ8Cj3o3@FJ@+1WpFk+*E!{D6(bHFtNoWQ6-~Ckl<*<<m;6|F=G-B
zDu~0Os;@tYbAc1SQIGl=2apvjl@K3KIhivtSfuA>$eOwXsEMDrQk4HHDqsp63F6dS
zbWPOb4M)X!X@(=@IO=a*qW@{n6(%`zm}e=Unk_fY*KWCW(xgb#Y+30w;Wh&0dYCaY
z`a>hKT$r*9p80WJ-s(maWB1@O@oD)_X22bZ`3KXNGp)IoDb11D(@nTaL19;z<cw>6
zlPRVgVBbLsX0{CeB^fD?M<e3Kl}@)?o(zx1qh9@3c=^rc{-8S$_jnoe27G0M`Nq+G
zkJo4IlC>cRsd+zB%%cyPgE8}Hjr%ckaE^KOtqbpFrX2He@(w;~DxGL9u<{)9Df6<+
z9~>;docYbu6g#hQlwHpL=9>g2ZfU^DZn|^9k9z^A2u|D`;9A4)1#mXshHC;mIID<4
zdzib+6=7V4@p}vKrqvoMEA`i>R@NPh2fcGUy+Lop?*{9NxVz@+SJY@_o&5i8RazMg
z&c%Ux=#}|EYiFV2aGi}8Y`1=%vn=2ZcE-BqRsf0z@An5iWieL=4zGiEMa4A@p?DOh
z^wT5J32~_a78ud4GMpfddEIn~k{gb6ddqN9vka#)Ar*{~tq87!K;b%()mRsJU0o=K
zBFHIjD?rA5VLE<FIiUP-o4C0k7Fjd5vSxZ^HQ9%OG8{1thQI_v(4>r{ECFn7Zh)VG
z52g?cM9XkO-y4`qP2hJ0%Dk(+ZnOwBjA#)kM`Cohk7iq<RavVmv1UmVpsO5kqe7Sl
zW6bX^i+FJf21b&hoyoefvj(Vu^z!9_bPktNpvZK@HGQz-ss}#QT#-)q(6Q7JsfRdY
zth_7K?Jak8#QgzJc_iGe2kP6K9gXFkxQ4)8R%M-0S&lx)#Eld%yGqUhPG>mc&FGYu
z=7(1dl1%HrdTsF%#e-3QXV6Orc=h@l?zETJX@~BWyf-(Yw$s~2uW(GHgO2`WmYhW7
ztSo0IIUCEFNy<s8YY*y6y?4a=L+TwDywsDFmRscNxKHnvb{Pz(pye_c7f=p6tF%8U
zm=w9xR$HBjj0mc*dt}gPtL-vu-7am#QnoO7bZ4}6`z5w)LMo6_q_=jOlOlG3npnk?
zGVBwW!j`1z{85G$0XgjVsdkmZwCflO$8uyyd@^u_jbg3E8bX7Puh6mdm3xYLB8sSj
z8;I&$8b>G}<QCx}e1%IxXt2G%j4;+aO#WRe<&WG7JSNzef5*I-K)8GHkewYa4Iv8a
zZZ!{&PW}CA$_@k+(|9KL{}Nvh`Jw88b__$|k8%50lr?AwVOhz2eRV#R{68RmD4hNt
zS}_gQr`#<(3wZ^LLdyzGSk?(&&6y^q{*G8iHoy)u4c42)?-02BJ6?&?p%-Qv%QC-r
zw4r`XrlQo}^-B3WIj*MmzY<@|^M7eRUxUDCv{&XX=+_dTtZ@@%+-K}J0+;J9xjFj1
zz%5fXF3HdMBXkar#>S&#=s1srcO~*!jN2OJMpf45RN$kfkb%<4H_H%q=)N$`ASZ*}
zz-bI*x}Gffqgn80fDe@)+9?l(_h!Lg&4Ry~1wWnzKc5AsP4-avxh4x<o&}$q1z*JR
zLbl<0jaTU=@Z?rzkzbt!-<Sob4<1A1|L=egbx+*WS>&J1g8z%Kv(Kebwo~)K?kw`X
zS@2(G!H;Ia-_L@7lm+k4f}hWVD;U>9)mt9$q3+8o<oMt`&npAoM1dc2X%gb}OWb}T
zljl+D(ErHI;J6+(`nP1knH4KUZzRmDL2MqZw)$plJUW@xvnCi_)6J~du2{u~%QVIr
z(b+C+Jap+85G=o4q$)z;Kn$S}zk$u_?GpFib$K8uIcAM{S3`=g(ye??X60&wOJK%f
zeMi8oX)7?Eva8#VMR{ikR`irY-C<`4_rrx;wAHcFzQym2(&|laYMgRYgFT-fN&0%i
zxG0WU-MF_7QLyOkh<74wx@iZv&hnf4WK>5-#Jfty(5_B~v6qs;balF?&x+c`4iB`8
z?#<&Noyv_s#*PL%kPh4*804FqCcAUNVp~g-)6u-7%}I9)dP4dxRo@U`H-){32iMm*
zouQ6<u_?seYkHI}D>lC%LQ?LlSlZn_!H~1N+Z9HdVdEAGdBMgDsCDS7cJiDeCED#c
z2RGaLdf5Q;R_B)`Qp?Hv`4&2EtG7Am#+@ZiZO%4Zy`u?5Oy8IbBiy3AZqaBX<L*VC
zklgZnq9G?P*7IPe_ZNRDqw~(nYAU?mBN|c5t(8@Dw_yj3k62|*fUZLP|8sAn(<%C-
z(-(@M;R7_7P=d6}9EaEyr$O}9!0AjML`{ntF_5x#Xs*%t<wdrZ<_6=m`e%n}x~2!O
z5ANhW*7AO9d0tFJb6FWCO6TWB;F0=UO&Ykt`;~Q$L)??=IEm2yWF#J`FVD^G62MNp
zFUV0Elux-n#}`%Zu7i)gYDntKb85@-^my8zkP~K7=}7tMh@&%Yl9%V&b_)GEQihXd
z%*;Q1z&~fum*@79LcfA*52i2kFJS%)7UfN*FV6)|pakLMxj>!M>p+Hyw&0;WNqu?F
z@q##5xRZq7Wcg+Kvi_DLj`ZtE2u_}R+(C^3XFT_akL8!tzY7?(OBEh+a=pGC-}A`@
zWyW+CYmRj12W#wqOzuaEbvZ}6IM1sQx<5l-?q^$t{xULdaI*Xo+K2#&h~`TD?k4YB
zA1%ru<umjD)eL?4o#kas*P#Awbea8U8zPPA%kMJpYPuGUFQd!U|0W`h`lrSBnA4ij
z`V;m4IK-%kvVX|^y1d6dv%VxR>C@0v4f=9_dSNm^I4P%dgS^CdA%f~urZ4y5?0Q{8
z%4O5viwJ5vsW11z^8WfAqWoewrSc=&@ep*W{-nOVUv;PW39b}zMwgkNe?z2EKedmo
z)3sze(nhBKn;H7@JAp;$OFc;`Eu-rPLs*XhGxcTqg(5yPeYy(9n15}B#wE2nUBN<;
z2A8Q%KXVZ#>#xqGap?*hCv;zRn$yxa{1yg%N}?hIm-^q$z)6;zq}K_39Ub%!+eBCj
z3Rc#Kl(c5kuh%6ui2ftxGt9BsS@c_*bPc*g(daT*Mi<p&VkCcTfv)v_Mq~!d=sOQ#
znf15VuIsmDOc+vMQm4?@r3ZbDUMz4MxIgG+)*pRGk+ue<@E#HRbkDufWw2~DfQ%~U
z;>W~``?Q#tX*gbuNA^GY{@MV2+4qd;9B$V+`fg)%87%vH7X1!-F~U8SVLVg+zX7OP
BLl^)6

literal 0
HcmV?d00001

diff --git a/rccl/all-reduce/allreduce.x b/rccl/all-reduce/allreduce.x
new file mode 100755
index 0000000000000000000000000000000000000000..a21c76bfb1f099fdcb0b0fde0a1a598eac47e83d
GIT binary patch
literal 25840
zcmeHP3v^V~x!z|c2{DkI2`@pk93&t<GMNN|0eNKdfJvKRBtfjNVKR?oWHK|(1HpUc
zVZb`&A=kFns<-8Oz1Hh;^|EfQwOkh@qCBjvXsy!PR(#Qk5k#OstC0Ks`|LeAGh}*k
zb*-*-yRzY&fB)b9|9|iOpS}0l`(*ZBQ{TA2q$o@V6Pw3~J9oLpp=y3-EPRWIfK;<8
zR)G6dR>JbY=Wt9<uVxyLNNO|bxTN_y%gmsp7Z&MZlg3jdy_9o|Nh(W7l=KFICZak$
zHd1FLWfq+$y<A)pKPKuY=>Z`psZo#W*gaXd(_O3ah@>ko)A9VT=v3-awMg%v&^svf
zB;6+JEvfX&;*Vk;s$aJJ4hg;L5qf^oFU>;FR;=T;VvR>6ZPj5Wsr2Cu(4%sG*+BYq
zZ4%|#B%Yt7nlLD-%&#oqZ<{%zEZ`{(_=EA4r7Npumd>1EjfSih+#Wp!s?)+nOSlCy
z?jj4$XS<P(c=Tx^qrhoth|XLNLRV&}0x%f9FbnPkK3Kl~Ecl8n`06b9{aNrwvf$5T
z!F#gcuV=wO%z_`!f}8OG2J=Hf7JOtDd_ord$}IT2Eck*fcxx8?mMnNj7Ce##UzG)4
zn+4yL1%D_D{%98b`7C&M7W`V^#q1LHluN@CWlWAgO#<(BX#}V6ZNM#TJgahP1gF9e
zIerPNc4;Y$B0m=RK9@$UA()tge{?se{UcOQ(Iupm#ATWZy7X@V--o<3<e-lo3VToR
zlxgIOJe47z-vFZ!zr=qLjVOJBCo&lq(MxclzpVpPK-jd)HeFhEeB-4x^}qh);GH*J
z`ad5$_|MGg#0chgMq{o>%-QMk2Z6MAhJpfdf{l3F{n3~=;`BvaonEIu=nDaLuXH(m
z{-7)1zuODe;~h+_!yj()cDlo>7-B8nSe<u;-|Z!4LBz{xlPeGixd~Y)lWRlqU`!zO
zkw_@g5{vkQ?S$z1Evtg=jz}m-IS1YD05q2b{ddN_4IarG^+?1X2rTw`;>er0+E8a_
zL(m^v><TVt(O4uF3J4NlP0bBXO5+g{bVf*uV9rG;*J#WWipQ8QjDp2{tW&Q9P;Vr{
ze0N3sF)!CxARR>NHEvfl#)OK`7l=nYxKNEN67hQ@JiavM@ANVsImf#ag>ZU3u9yq{
zZHq?rM*=A5q0$E3onbxaaE#$0x;j1ZkpA+iBxgBi;3Y&)mtIFR*}}$#np$UtwZdA-
z%0l5-nLFZIRTlP^x!vBf@|jkf)yAWEg*oY!=5L!_7QL&?2S>+Hys~K6<z8Ou4X%im
zwYj3+l!?-4z#A>EEWO(soK6Nx%PYApk1G=Ga0N>vc$%nG>2&K1tF1IP(;KL$Tp78m
z-Pc}u$IQxzC(;p#PQUYx@QlzM(P&uDH+%6)-2qpyy=<V6B3c>`E)Rz83X1n|doVsb
zRl@TtdEr*bfe|zr@?kkKqFTh;h`&3P(i!$U$)_CPo+qR|7I1b%T`74FH^q}zhC98T
zsf0V@E>EW`){z1_L!LNX%gs?vK7T4o*+^MlDqkuoo{G7n@vzP?wsL`n5kQ~AiJ^k?
zYYn(@UUI(yH*-n-w$*^kIS{2hY`|$=DuW!438OJchHjU}qlB06n0~8Trt!!y+>QBG
z8*pR34g+q?x7mOj=YuyHaLPl5Wd>ZnJ1DKgfMcLdg|GoHNTKkv0UvI_*BWp!)#a)8
z8}LGd{8j^AWWXOb;CK<F!Zrggrv5zbDFZI1(j4Dzz>V{q=M6Z`HD%~F;G-o7y3c@*
zG2jOb_*ese(12fTzz-SlaRxkTz{eZ#V+Newb26MU;1eWB)TPm#(KDB8FyixF8WE0*
zr4Q#VE{#)?mrEf6#<?^iTz(>t={L=#@d(N5G0w7SE}bKMk`D2^&82ZlxN&|?Gl{{1
zBEbIswQG!*l$L#^e+^^COZFAnH^`3hO6Qc+{7Ciz*>|X%dRU7b2Cn{o<Y%}R#XH-)
z5iR7?!jX{M8;yFST4y{O(=b}OJG5Xbd13RCsI`DKLB!SW)#$CN;YHi-ovlp?c(hWw
z7qA5`f57XRt>N{l;nf<7Xj7uIr+Bn0ka|S{TU1-yn1)Y@0!+&T=3u8GxB?RukLC}C
z<1sDlqKOP<G*kkswG40X2%j8S-El^_PqEJQX;H5`6!gI05-K6=M7236zq36YkIvKP
zboztN&S<-nN=~e6rIE!1AmWPoL&5000=6&^YI6lNdXw{t@+uav#*lju)X~r^s-)MI
zGB#Ffz_MPoCVOM!^|j8XCVR89$$pb_E)z)W;)W(?t7CC}i{tvnI^ea7?O%ro#G6~4
z*DqKw7Y|gKV7`vl&@{IF5(7jS^mEYlptCSAJOR2D^d(RY9sD5ipr3%QM#q?g#`MdH
zjNJ;lNP|2zE(AG7D0eSr%1VneVZ^Zftw^J>iTdA-$fJ}5Ylc~fx8Vu_n@5<v$a0^l
zwlHrMl)zGYGp@}@UksW`H#KnqvKhwp5|vwY2*Rii)hI{^ltLq}hjBdy>?j#VJ~c(-
zYKpX)qG>fnHhWRk!lJe2u3?+<Hs@~1dBA+1$^5~vqAGiljSD~w!WMf`z8zwExsot5
z8v{o*)mxU^e4RoCs{@n|{0{i39dTAyWO={@BU{Y%McU>Z7~PcXD6-v`=P0V`8rD=)
z9pRbTsWc+9nxcH}%VyYl9CmWZjv;JK3%;%8M;a2Z-|k4f@96sQSaWOR=G+dvJ{+41
z-^1I*v3c%eD0lyuMUxpjxx|t9NPRB104`Ub>vnADj!)Z@`#Bng;@Frw2NHl{V;!3-
z^C1D%E06Z~CvSzdK4t4|yM^{O1KM-;<X#4jwDW|vig;uWv=e)i6;zImxhF8%kzo}U
z-#!A1S4oSnQgC#gGpYBk!c%l4jyYaWg11D6mGAGJ%%L7f*L!DtYADy!*WV9Xc&Sdu
zocx&cm%R#F{)RejN(&O|)M=OfoD%BP(3HzbvR?i44IhG4ok*>NF`q+S{fByWY4?dn
z^{PESHQaZUprh*2BPSf{Rh)TCU3!X`Q%L>`N6x8B`w2NmvcH7-DN;XGr;dJE9eo;N
zz2{rmlhUbE!^xw*M&ts`Bcyq>P93`k+JCaEqt7CK+OAGJrM8{~Nvf@%_|z42p~9bj
z3VIUsbUl)Mb~W;A3xv-3)X3Wws^;7YRA0OL4%PLYPa$=V;KC1kdF{{r8Qi&ksGNZi
zINZ*XDY&Z82Gx4CwHKNzK89T15y&026JjSs?E^FI7>T|Nb9VK{&nTy}5PkP3$$yIE
zQyi`%m{5BPZ9PFnuTxv$#&`cj2@su@g!w+I9}->$`0f!(NuoX<S#Udy+w!9))HT;E
zb#%R$kF+;`N4GcW_A1?8a?E;N-B5Y1zuz%yU#ym%BpytUPhI_D;;qwfo!&!lt9YSr
z!yB=&>t3PPU0;E^<9*AzH|RC#Q-d#(XwSzMUt;w2j{cs$=XV?1N7rn&+F#z?ceOfk
zEwvYQ$9tA_uM1VmrKhJCsv{OS`nw(O0+ivN+3HV6EbMs?B@gaLbb%i2IicRf<?V@m
z_R|OKr@MK<{>!&vf9&~n`+32h*}Zj%eX0FM`x1L=i(}m<4s`m4#OcHfCC?&jN8;HM
zD6y{oTy?`Y(2pB7uE{SywS%Ug$tN(`Y)HHe$ReSC1LR2T1~g7WTe#F7K$?V>1G0CW
zHL06am~?D-N8OT#NYjQ>>Xx(UbB(i3sNXn=NJC<8{f2&ZOOiU`W>KWgdXbX#loyyy
zD9bKfb;(+RJd;A~K<Z{aQ$mQk;jl2X9b9q+>acM=*Z$t=7fJ|Ayq&ZG?RtTuEI!?_
zaqiw(C|KfTL(eCpcRpj$SYzV-^6uoHA%?`Kk+}0aDf9!NjfvCAKG255OAUNn`H(8a
z#})AUxU!kbkw}dzBmRhfg8uT-VNS<9rqjz}pnP2U91$8<qLe`6$}1G1aYdbo0skLv
zJo))YsNVVG$+rovrx7Ikcyb6b7dW28A^P_lPf(vV*R=lC0c8dTl)jK-W8v~^(V{O#
zv)W0^j2-lvPyP^-2l_zCdS~N(jJCG`LJb=)-q{K{w8-{U`XxNQa4H#DtHMZG%19f8
zlAS5ZW7PXFg4O*EBbYrgx{-#kpHlzaikun~r;-Jjx^L)KH+(=D0H+xc;m0wxPvIM-
z{NcvL@t^TZJ}%PVfnKT?7LtX1u#h}~WXbGY3uH&3!Wzr>+n-0gF8PGW^(WBY_$=@9
z8^6uFvT9jhQiUAU;<}PBG1COIjF@qPX(omyPb60DBaX5~WK#*vK~JcmZ<i$X$YUjE
zcx273B{sY&T1zyaTHIKI0k}A)gvz)L$;pVQ&$Y(-*mjxm$m$YD@zfL)dDnJ4BuC;8
z@B)xep|a~B7dQv@5ZLnmWG@KbA^Q8kp}a#Q<O63=R7c_<W-N{Fq(ivVftqu7Z`{Po
z;Mki}f}+6vAODOd-1RW}bF9qhF<gRiD7gj9mr*8pQvN-1Pu}n)l}VnIw~_MbFR4uO
zr0gc;w|Ohqo6v>SndC`%Dk(og%IiMI+mJm^tA;wY=}cYXto|0+jBJt*<91vvUSC2@
z3-6%SUmKXeeHm{p6WM+jW|EV^^#3BhOV4Kt<zvH5wsdvF<ro2utGDX89=EGky>?u^
zy=PYqngbfhb|v{bEcAYHw!gnI@yBEibY)qR^m3;9{Sg@vNNpe)5#SMG;?OPccC34e
zUh%isZ?)fMzuoS<Z8zr0PeX3kQ5q{MkkyXE5bqlc{K0hm0NGCTpanI+)-IS~$p;XF
zWEDxi2xZ;E_sGJVz_oPz5yD>po}Z3CO865F_r6`PrAqN*!oCIB5$UCP4tQ_8zrP<z
zJ!rv(Y`>tSoAsn8)056p(tIS*BEr%2i7j!@Cyqq?kR!3=;NE&tWP9tWPlKG)$uT01
zu6qtLb^RNN_WqRkjr9i|8?kz!1x(kO+L*uVOq05SM!v3lPO|t_LSRQRke$c*liQnI
zg(_gNQwZDSwXXUvls<RYJ;zx5HYtBG$>$Vn@bKPbgDzjaxBgg4etOqEXIR`O<+=S!
zh5UG3eiW*ZlF!@Pm$Mb?CP!EO!Thd!zA&w^2wkj&)D6?&3vTghUT$?K@@Kog0GfOU
zuH%KX5UA|})}P!1aO>9nyI*}ZZTSa=rjHGYw`m61xOvKtuySAG*j!4q#hy6ZkT}(N
zC3!sG(R0?EY=K3`h9fbp{2jR-YE1kEFNeCsr}qBQzjJi$Rvfc_AOG0V^-}&Vx5;LT
z336)vl$rmD-Sn7Xe`D@v=*_g!GxZ-J{oOjwV^lpo^jAi#=!jEdxT_PZ5hK2X5_=C(
zU8meZJUO!W0O#9?NBvoEH|K9A9xi9S+d03Ocyt8TyN&Z-B_6$#^={>S4e=PlS?^lT
z&n6y2KkE&1zJmCC;+Jv$D&lFh?rrA$<-}uB$$G0fe+luJ+Ol37=SLDhf_RPd1;o>*
z-ChglO~e-w&p7`Vd<N;DPr1FvFyi!2IZnKV_(Pokn0RVny$3k|KJj>-s6Xd_OZ+I}
zw{!jt;%Ol5-NyM>h^J4py<0i|BJuP|ws$S(cMwkvzBkPI|0aGM@yj^>H1YK5wYQn`
zj}uRGq26lFKSKNj;%%INka+rJ)2ng*o5bS_3F^=Jdx`%F@r?6#6F-soV;D=Xo~$9t
zm|xHzmFYY=h{9=V2?&C)eHr}a)DjHoWf}bZ44yi-QT~<;K9s@Vox$Iq!GAA<|IZBm
z*$n>W4F0tY{x=ys{VkY+@mo9b)PO0B8?JFm4Vc1HI^o~<C`u2O0zGgF-9nx^6b~Xl
zL728o<1jTS3Njz+Koqu{G)}1lP>}Yi!%?se(>SFLL*cM6{HJsM`e!+juJKgH4%F(g
zNrIR4paw`m{!WP+90i$94U)n>p-&Bvf=s6lNnw4t#%UqH<6EHMkM-;23FqnQXqvoy
z-GWEY<b29cLVt)1d0@x`Lmn9Nz>o)qJTT;eArB0BV8{bQ9vJe#kOzi5Fyw(D4-9!=
z$OA(j81lf72ZlT_<bfd%{Czx-FaB<I%%<@S!{YBea=&_JDfWo;w*);b=qW+-Ds($z
z1)VDBOhFe2dZVB|L01TRpP&y3`lO)G3A#tnw*);b=qW+-ri=Uqohs-|K^F*mqo6)P
zR|tBapbrW9q@d3UT8j;38sA!{t-!trY+TUvErFUgt<vSKtguzh^30m<vbieCXS7}G
zo$m8_Dt#{3%qrUq+l<Ppncm8(EAh)Fe{i`sGF#*38XN0`5w@_lcD6PRo8Yh1%5B!^
zR-0B~t0;$Rl{SsbHU!<)D}`9gl9qZ6mf|a~)ULH+Goj&OU4yv(H#q$-d-A-4>kKl+
zDE%GK#OiUEe+e|EV*dh9U!O{se;*tv#N^)x=?4Xa1-(2^B1x1-w6_Nb69%T|cRQ+}
z%ReOAm;8I_whTVgj2(cezd~uU{_^kU&mqx+>nBK6)|wtbRFP+2NEt@X%Hi9Q@|8b9
zV#K3D>t?b(2{+|p{F;_sWbQ7~K(Q4sCl?!CSxyBp^v8P>KY1ZvSw{fERAE8S_gUng
z!+gN=m1^8a&@W-ASh71x_k4gvQ+}=)N#@&dGaal0k~4$D!#)6#tLF@%5tMGpp&!cS
zEr$SI!&rX)ouC)phg-qd=*DAoL((RGr_4-B9yO!k^Y9>*Y0D`3I1DLQ@#q1HUc;jn
zBq>&k<{iL|F3RLB&=JEQM>Q1N%ZLx(P0@0S@;@aj&ei1_n1LM{%40=IlBqo>nnXKK
z2J+Z5z}5MbXDt<(g8C3=soagkzR6XIeHfR8o|tmueB3|8Ra`FengWKx_aQyf&y=O)
zxM5&MH3^B)$`aE(Y_!s9T8;Y+rZo^4Bh{%KY-}ZNbSc-0C%YOjo-FRwfa_4Offl6{
zE0(-sWu#&*nou;QXcXEle$Zkrnm>X-vQn%79!|*?{K&vG1BtVTUpKsA)co-#{5rr~
z!iEo@H{6WopEH#;jN&THlD8ypz@pIz%nsv*gdTo5V;-BbnKF_)U^Y)t8b&EFJZut-
z=c~w%vbkuCa>aOp3vzk3!;K}IgWng77{jgrDYUZr<MlFMF`ja-9{1fE;6+wt5S$FE
zxvBD)P-V-2VPcDi!zCk!k>E(A<m=9$lu?O$6{KOa)Hm<NzQ1w*QUm{t0mzD#a)^(m
zlFS(&EYeFesHbiOYT`So6y@`x0;V94Aa=DyS4BPEa8&G%X4o>0E&kRP{iib*ndHb}
zo~4{LTdtn3UH8>VlOj>G<xZ~&XAvlqVaCkphdX4sFl8A$^J9O!)r};^ZpUR}Gz|Wq
z1#m}V{(=1ENNet4N<(D!H6|Q^ps<Tfa>O<NiYcZXVDvjhGrLLTkXDegcr+qTU}<-|
z<xcQuJnGfAftOud<`239agUcVZ@^bNP;PA1_jrBQ4(ScqNzJ>NVjj8A9E_PqYWyBE
z2j`eaes$rk%#>sPh;DlyG))_CF0k?v^C9zs>mMAbza06^S11ndaMWFn{^qL%CQfI-
zPHsAD!H;tQs0vOT7~opP&jD~YT#sV`JlKnfN_&{Q!xdp%hVgR>@W$0@(cSrLR4eU_
z#e?3t?cSg_;&+2}Mcf^8^&@Mv(sus;`U<TS2Ipb}J@iU_pjETrI2>=|1>33b*DMWq
zgYB`7xi&!Y;9dTpr!?kj!)A5xuBbSkAry~dSAKdT8X*o5z(gb3QHmX+F|V68Npi!H
zc5f+mVwPgpC8UBevK7Hm5U5-`iW=(xud55iPy{8#=>#aaFHGA@sRUFWZWAXJ#3HNa
zme0JVd<NNvfl_QC4Tiu3L(rsxq$~k!Zm7jQ10TpC7KoN&N4_^OmmY!N6)5$t^t$mx
z=)s650_8}I&ic{pEn0<jMmgp#X##X~15Qu~Q)i6%-K7yP4!=N8GCXIpZfvXpsvy05
zc_f{~Arz=GZDCFC>^RzicQsd}-930JHAHG5&KN7}2z7ePTy1fGz*80pcj}RvB@K;r
zW$ie2z+GBlonBssHps*Y6fnC&jsZ?*IO5G{l;@X+I|f;%wO`#^yhib0)ZZTT(gt1K
ze?yJ-!W!-1t&+FqdU!ki+2{_(KspfUC#V!8B1dI8I?2&kj!aTcQeAsMU+O(7<{wh;
zKLjuJB(;c0wV(=HBBS9HbeoLE1(d^1C`*1Im=rnRR%^9Lj0mc*+hx>ftMvkG-5_nn
zQnt`}^kB4g!+EysLMo6_q_=ijkRlF&nwaeeGVT+Y!dlXF{whP0fE@N&>bXi`T6GMD
zV>&V@emQW3jbQf|Ylsa5K10XUSFS1Mi6o*5P9CauX&j+)P*{YE@MSIyF@0u9^y|1V
z-%0(hlgdY787>p-%ijSnCJ^BcTx4gPOGAv}dRWQhBU8WsOxb~eVj9cj_W+5nf&5^8
zpcTVl_(R<OsMPOMrJaYf$p3p5e3O~VGvH6Tu6%~<V^U~YrXe;E_-xKJKJ`0XnZkeZ
zZW{2L#QzEQ8Ss6znuIvr$??%F^ZN&#A7(NYrG6JHm2T#^n)-dL#2@G7Kfj!>Kwu=E
zSLQ0{w*r@IQ2f}Nhxa6&u{ion;N4R-F30!jBX$ax#>S#y=r~Wr?P8R(7^gSNg{t%?
zZ7mupg$%R`xMdJK^}scQoQ!q?r#_Gwy0YL8X2G8ZK3I8Zr92qklLg<O1^-nR{G%*5
z{f>UHd})zA7=CFMyetboHw(Uq<Apd6QsXXN51zuE0&f=2o&VSx_!f?Hk+j=)vf%%k
z1*aWIgVpQVEclDS2Rl!0e-`=Iv)~7Xoo<)L%P+@`ceBVJ&VrxFg3}i5!RnQRemWRF
zJPUqt7JO0`d}<c_YT$#Nt5?DCf%k%ZnDYd_PxwdTwcLIollwoa(OxOs#Bn`tyf4dw
zGb?74-bk2PgIH`>?KKTpsI)VyXH_t|s*_o<ys?T8o@tCTqO%=XsOZu$AegSZNL9qb
zff!;TenOko+ab=!>+nEQa?Bd@u7nhy)m!-*&dSvWrp%1P`U-+s(`IZuXIG~mllb-!
ztmrv~I>XKo&W{VZXlZ1nb&lT~rP-cb1Ucm*2x~+=k@WS9agZFdx^b=@l3>x>7H>!1
zbn*^xo#iL|$)vWnh<Aldp;e%aW9=oQ>FRWzpB3K4iV(De&e!8Hoyvtn#tH~4mNuL-
z803qgdWUntVtZ4)v$3J2)k$X%dP4fDR$oA1^@X*I2glnvouRfnu!zLjY<iL|D;CQj
zLQ<~7SlU@X!H~1F(-lUZVdIhuWx+xXsC95xJ9$Zw6RnP%1B-Kger<sHQ|E^&(vy?t
z_bqI^zQ*22C-Ah?w>n$xHI4PCV){Z}7~vM>v5ZC=8D})|jN~T*Pc-Dj!FwL84FBN|
zZ*<;VK7%T+w}?7;xw*W8&OU6T{t+vm8DJ2pbmIuBP$$6hna*guvw1~}v)SKbbJ97A
z|NjnubUH=Db^1aPw2lDvIMgKVM94vw+G!Adif}s9ds_XXI`p<|t)6SteSML=siD?5
z_Wmt#8rJE;Ym5hZUbZ|(Tkhi$$y`>(iPFZ*VYsCJCX)tk;2dY2;}E~e`JqH;-7_4Q
z)R+6&whN#e?-dG^2bEL4zwxD(hl#jj4IGmCa{pL3Pfu^w2@_^g`AB*Ca7ug4B;TE<
zagA=FU(LCJI%bxi{xERLpwEVBx&{>(!KRZ`{<8c6=6@NZvdR4Aem$Gem;3dkym$>u
zmNg+pWs>@G|KT`tD#BAbIWK?dzokeg{Td-B_ak--eW^%g#7Fq^{O<xrWvakML9PS3
zX@@OBP!>#QF*ivMez2r3#cX1oE6*KoO&8~7H9~h~=*xBT{l&Tz-??m1$n^g@p)VeH
z>hF2-T=xe>J*0`u@_!>kUw*$?KS|f1_H7KA?PoI*ss1v5`Mu{cp-=tG7&7(0g+!zN
z_HvDjZx{L&=~!VTv;4GvrYg$*(Os)?$?nOVq@K6V4LpD8$0v}W;*$Du-MjAs^|wQp
z$|UvW{?h{&sJ{!k<U6S^*VhMy{x(s6(VbHDk$%_*UGksQm*=#uoC*+Onod%8S#Jsa
z0uiHrYJFU-OUQhre5U@dGW6y52aC{`dXiFJ#?S|bur2{+>dX8KMS5ob`A{~NUt6Ye
zNv&E}7)ObQkf~39J|j%}uiB+?=_)%XbpCZ((9$^kjSc#gg)IY@`rpsMNtS}76NSEx
z4%`pfMO+FBR{BFqTC?fb=n`v1`;qb)=Gg2k`pfEd4LWnt7&2Jq$fAF2fv)v#hC&9*
z==%|2nf`mtq3gG13>Z>hQm4>2mXYE%da)p^<@bSGrvK>sjI=c%g?;HlpU%@ah76Xi
z1d!okE`E$YSR!bm<I-Q+C*YFpPrkp_LSME$l9$0EKJn7`AY;g2**CN3KTa=3guNNY
HGxh%$>otm9

literal 0
HcmV?d00001

diff --git a/rccl/reduce-scatter/reduce_scatter.x b/rccl/reduce-scatter/reduce_scatter.x
new file mode 100755
index 0000000000000000000000000000000000000000..d2657f4967ef5b24773d2d3a26a0addf76226bc0
GIT binary patch
literal 25848
zcmeHP3wT_`b)LJc7qX?hvW;W&h%Z<M^U!KnwuJlwtseG@tUy>Y1_G?F_8~23AMC@}
zw0<F*u#rt1LK2dYgz`~J8``*S0)$d5W3UY&i6I2skOY$eSt0xgHntP6)&I<$SzWEQ
zDSY|b?`x~E?>%$=Ip@sGxifQTu6AZ!i)XP#QJ4%Cb~z)ie5S^s27YI3)#pV7q=D75
zQrs_Km8=MS0mqE=2Bz_dqz;RLOIl*E%nC|+F_9j#Xgo#Ib2!JCq_TuWNpB=*A}Z))
z!Afc~c+xARsKjS##4#r5ULhwbGwD$s_s-<fY;TjsBa-%<Yv3mYm3mYy(#r_FjL?(x
zK@vcaRQhG<J4rv)FJFH9g`Q&~SLe@)V(mh&xx&DkD>NRFw9|l@q|%4iK#$7#X#?rg
zwON#Bvv_`zYQmtTGQaBHP*>gD>Rx|UZzz&lRkf<VuBvXXJrT9naC_u-dR9xWTE;C{
zaTi%|KHrUWCXJCuk6Rl9KOY!fxuG7wSoo4WxEJ_X`G)e~EA!xM^Wb;q!M~XYe=HBa
zBM<&^9{lY*_=kD$VmyGc{7{|;pPC1skq58JgE!{EugrsAmk0Oe!F%)IEA!xM^57fu
z;ORX0SM%WC&4WLk2Y)UPz5sXyJBvN4Yj~oJ$?@kJfe+{!!CCxa;5Ig$)$1C;+3-D%
zpT!z<EsIg)34!m?HDXP{g8r?bPb+u_nEw%Kq!<!XY6O?57HE&iDC{NVr6C7n>`>S{
zf~QR57Z#}u`TQCfh16N0GiXFPBzPh-aS=TW7y8=<Fr|deK6mpu^&j+~)7bKh_xIg)
z-8ujEyL<nddA%6HeBMM-k0-rhJrn`b9gao>;sqNIbcYhjK-?RQ>*0Vm6bVLw`c~=Q
zU?`&ZhVBS}^#{fh>j}l$0%2clHAAc;kZcaD4EX}YERF{_ZPR;uqdr2G$mFJIDv}gP
zOFSNpcO>JXNH-yde#h#FuO}XjP|gvbuNRujBB9$-fmXld&3Yu#6b*-4BcbF{J#s6T
zSsL)Ce1Q(1o<z-9A{kFcdj;ubZSAdIlI9Uo^~On$V9q5d=S0#UO(j_{hQcL-ENoN+
zXdoVE!Q11ZWPocdmQEt|MxUNYGNBR-_NEd&EJ&{4iF!O9LTM<zJQ)fHSYQ<j;SKop
zqz)f<B@)JC0Tl65X(PUH%*Z*GWO$Hz*bgrmFQ8g-n0GE-MD&E|l~l);cv>5qyfyY3
zdo8Pu#*)>(xW2kN7O3|50@cnsyTk6_QM|^y^jZsb&96?}ULAz9lPFnrBBuLptqMd|
zCaSyiL?COTD$yHAIBTo!2t?+PfhuP$x8>L4i5@*t6~_}reX8cz=h_`r$+|#qP3@}q
z?cKrd+FR;s<NkP0JTd3CTVivgw<HoVBj5bRtMc{gk?!h|LW*crDspQidV55?i@PJK
z`PmYlUdc~xg&Y`1BO)I<i4oNz-a-5w*_3cB<RzbSfP0>h?qsjGC!uHM{oE8!UKI-m
z!r6q|Qo28^CwsC$IO<QqwcH%#6bxmfl#P_-r3z+~QrVa<k%}1%1^|S+7i*l-97zlr
zoZo7~&GVD{O}Ldy8n*{cxSR`7%A+Qn&rgMp9G3~Bu}FpiUE@*0D|yVg)vwTaq!@Q|
zz6~baoUhx2oAYfq;pTbabtathkYR-hm+uft>oMUNaI+z1!b`I#))yvxf(c)5!o`%A
zr`~PC%S`fHO?bHpzu$!8g^&#ons71Y=V_0ca51Ik_%;)6p6@(q!sQZ!6b4NA6mG$|
z?J?nJn()0Qe5wiGXTm>Y!uOl-X(l{l!l#??!zP^GcQTwX;b%*bsIJkS(KDB8G2%;f
zjR?o((ua7Pt`Xt*XL-!HP17|VAv-g8%(!W~#v_CqG0w8ty1@~Co&oW@L)SPZe5`po
z%`C<WiU5a)*RL~QQd;(z;dP9eFWF~gA0a#DYc!{1=S#9L$Ua2nHNtx2FmeqKBR|u1
zRVv&Sh-=ZH7K=xHfkYyZ(88%iQp3pQ>(L_F<R$IP682Ko1`)kGpwSyv!%Me2FkhS1
z>({F2Udk5hq27RhzJ@oXhSzK~uFXo!pXJvsMCyg5?5d_F&nSFW0^q1TU=DU#BP%g+
z@oS+-ES1z^I!$aa51<lIZnNxlK`jyRMI(L~SVkp+UP4=l@_M^tsl?^l!f+_!4JW$2
zR8nH~Rc011|G1tEMI(vJOWBg%XqVor(OaBXj900Yd7{2CP<Ly)sDe>9%D9x-?bUHU
zWAP@8DU02gGSzG(JEMMWE|2HxCU0AttKHk?y3V_Z38ZsrYn!*zy|ksneYK|<c+*nX
z^$>x0d#Cs6#fulA1)ObN(2j>o<J^lFEP6qYgRTdy#USw@=w{FX&~wn)UnL$i1Nv9!
zB=u-eFV0|$)+SeKkO!4xg>uJIrmV6lXHP6H*$SM-DC&pT!hRJfYb>@A@4yuWb~$0L
za@$>&rm~{dPy$QoRK6`pUkaK{x3qBqvKhnm9MU(64nf#PTn#8l6qJGo*ZsI20CtEB
zBcI0dX^rJtWBKgHa)+zDeo6UyYhUr^qAi8@6nx2gm&H0%SYGcccW?oSLD=RhFL6Q4
zC>Qx)0S1r;)Gb>s>jH%e)(ofw_$}~JH{z_h-1a34jND^wDc80X!06^ece&%PB6oRx
zUvXP`L!4*kqSA=W8p}(#FWX_~A=r`qnaokxIw}OWIuEp_U%uI${;j+3?ZfSzo-Ktv
zc#XKXl)Z(wjC;$X2T<<eGq0M-*wJO~^gHU~g?~Z|P#+&~ZyHF=-c>k6?ZMq&xDXP6
zl2hGVYEJ-z>O}{Khch?A+K{sKrkz6j96T_gy>M4y8934|651-_T??U|-kqtTa`YD-
z!Kg=uRakue04!cCExts--FM2O-nkl2(Vaf*emMi)mMC_<FgTM#JKTM5od~MYLjTb4
zFlgC120ioWdz?S_B~a%p>g-t^NN84PpZjA<XjY@M&L_zh_0TIp1iLzeS_fl6x4QN>
zE$Z@tBOdkQT|qTAbdaEf>hc3e-0H=gc}-n@jF@9c{u4(|smq56IYqKRh59j4Kh~^H
zd0w4z9Abl~TiKP>X;x#IgFz2+f#w0yJlL#G-39FrUFwvRh#z;UvyZ8rM?o@b=lel*
zC0(fSp+lfYL65f}Dd<w;zpz2*R8Wn-Zlh{0I-Bb2Qs1DuzHtarrwA^4dyv=u)E~i}
z8ivXV2!X@xB$<M%${bK_Q9B2rx$-^84IO~oK^Gw|LewEJvk#N#^DyU9ul)n%bP}R(
z9whlgNIu5lW`YTIWzo(fRP<)G6K;I-LrQ??><r8gQT>qcJis>(P)Y{%dB=v^ySUwY
z@QAwZ(&g^HXG@Uw>aXbbD&1b9+jH)DFRPnsPYn;d=j};0(UZi3*%4INKAV2+_-n^^
z(c3Cj7Tokoa_WW`>2)_$sy_8w+lE)@H5pVR&ywhl_iVxRl&jstJBFUzX>K2V^V!;v
zb71Hab;f#XFX~fo**3f^R4JDoI|iXTaj|=N!0jtV8P?2KpP0C0$6F|Q<OM_*8_^v{
z)a$ssE4{~ce6Q>H05910gPUGRKDprqUa-e@Ze8YD?z+~s%+=Z9-tfK~oxU}FJpFX#
z50JGx{ewy<vA*F#b<-Enk6ZiKl{k+*MbpsC!<clork@97lhEG)a;J9!nkJ!pxYRB{
znuKlz<mx+VQ8%kF>E85)dQTA|ZJUm%_nbtZ^UOP<e&Hx0t?At@n}*eUGSm^bh$3w<
zij--gyufTmS+?V9&NK<+;Vj|;(me06N<!34?+P>9z-3mV4*eUs_BW0{T}fE_^^5~(
z-_smrsX6ZcMNiB_!O}-tcf3F4na6Az^Q3?49LW3;Vo3ZZ5})~M7X2MiPx^Rf2(&f*
zTq_?}-lhuiaRt0Fu56)lq_gA7#NVTzpuar#E~jTcV9;}upnP2U2oV}r5|lvW%8L}C
zaYdcc{C{vf`SClb-s$7XeFV4A2$Fw1*$<gdIG&^+`ad_Gpg!v^?fkO?%3KU6Ls55s
z+2=1qi@p%e>KR&YJVmeh%->=1@Fc|s8>#j5FRFu3YyW4S>4Y3wWOp|Ge4bu5m5gjv
zVWc{1qzgisa8~jJ?9m9;{1=R1uJjZS4Pj4E|J;h4Jn3VZQcT@94XB&mrtS@#W<Z30
zfT?{J@0arLdeR^Kh*$C<k^U>_WqV;6S=a*$nIlM+%ronOJXNMJkMjlBlZZEG9u~Pi
z4(<LQ@IK#vAMeVlZDS?P7}bjA$`~=T1+#*fX@Y4dh9*xW)(|9)vPEQb<vv7?sL}f>
zGe+cr$`d@Y?#4<7UKO2{T2QU<RAK<GD5#_|K8oZ_m#9xYN@R=yG}O>wlzu*2KgagT
zc!bUAgLsO6fWr0xJT!OuH}DFOHwihw1x|t83)VTD`413`l50QbN&i0cD==>1&CEKi
zy2!=*c`GnnpSeYVGyrD8WMaU*;W--WKkvHHb(8C6m-nWf9^ZkXiRz37=2qT-?R5`J
zn{GIhKf(UZyjt$v1(m1*YV_WZX#U=Ua@{R**aMrH1z<jn^5=wdKPjIo_;kwULis9E
z-tj4wUxqw+tAdm_eoE#0gz~HKL1qalZ}^B-Fi&Ex&zjY?6V2(9#*67nlrggmw-3~c
zjg{0|@mgEkEZ*;*##I{6Gv&#=jwUtylhjTlgF(QVqqxadrEaRjX!3!2qmkVQF7@J{
zeW2dFV|yc-5*pVwC9?_QgEmNEqWt%;w*3?h7W<I?)B%)mXe#h8kH%jIju#i6w;Rga
z;WU=H2Qf%qMv~7$*|6{pvhXVK8Kd!U68<#s;?elG2!Gh^+q3=6Y$^Vgu&+RN!st>w
z4ty{*%nLq%7H!JbiXsi(U?lzR=%fNlx&leGig5S6??|tC-<?kFcc+)_+uiapY_i=g
z)Tu#^8ssn$ci)<QOx^ejqJvKm-`}#&-H!zbtwH)uG$liQC)(6aH1zeYIm%KO3xTI9
zfIL%NLGIq2S%oTKwNnP$<h8z*kCh=`-<rcLb(56;49QOy@@_-EVRy^nto)q5H78ig
zA?3M!r;wj!$WKNUvhqb+hYGf0;pFaX*;mrH=3~n`o6yB#NZm9CzTg%w;pJAJLH=y}
z$3Qc`gzI?WYy@iCfemM#2e@_X3p-yLwfqA^)5q5I>okM(Y?<{>ShX*6Z>b{M;Yy!u
zO&{}IL=G=;?>K4Abikl{(}ASsd_%5>Jn28-<<Oiy<QksxD|g>c#XaxWsrTG{&y{@s
zCOJG|f}CAH<>r56Cp{(D?=SoVx*o0cEW>+Af2V=-7}d@Q!?j&@be>s$+-bZUo<)2Q
zB@XVVs?NHBc=F@mUd}Hi9@S@q1DwB-czB!*ZsU9-@#qI^@IlVcCm!9M4Q}Op4e=Ou
z+2DH4UraoG-WrT?{(Ry~h+o0^vxujmb+DcDlZeNpk_|R+zLa=OZP}oM^A_UiQ}3Y0
z`9I+^M-LX?=--_GfcSFa8Ry?4o<8Xg9!CEkp7mSeZN%^A{4a^8_BFVd^REz(=ZX4r
z{zc*^6Tgk~&k|3g-r$3re~NheL_4^Z^Z!CTeVQFy&-s5Kp4$6hjPnl>KaKbmoc|{A
z^a*ybo%8n+PjjKc2F`zx__K+3aQ;r>>66W%#`!ylKbLqL=i|hGmUzbbTZo@Q{Na<s
z!wY6=h%&ZX(CyU*FTZ(zL@fb95dODQGk*cK1O%BrCx>s!;oEX}Zw{Zx;n(Kyn{)VY
z=kWiS!#|nBzmUWKJcs{v4o`m@reOZ&PCPYW3fcsXQ)<8zXzD|O8ZZTVuoUQlQ`jTq
zsYCG~@)Lw<D>M#MgQ6hwp$<f0n?>W4IsgS}pE?``N3q5!br=fo3Bw<r8a6)5iFA#p
zGPb|Th@B^RSr2M}6y)!gsKHT?>C_-8>=XLb04d0H>W~!fb84KH@jGLW3I6b~QJz?l
zk&dRx%Qql+^i0lY{Ur3q$+!o`JuvQpaSx1pVB7=a9vJt)xCh2PFz$hI4~%<Y+ymnt
z827-q2gW@x?tyU+jC)|*1LGbT_rQOT2TH`>shIeC&U!)Re)Zg3UlZx?3VKY?q8g2h
zP8IY5LF)uvEa<g@1_fOy=v{(-P0&XKeO%C8g1#o`yMi7Qv}lgVU(gE#trK*ypw|i-
z6m+GacM1A6K_3zHaY1(p+Jwzy8sAo@t;9YEY+TTc?SPs#yH*d>);Q|t`RC2i9eRy(
zZr5djIl-X6HmK`$^^UoYxwZ9mf!g|u@Ux>(<kmoZzQ)aYJk7!gThi1tUz?3h@E2)L
zhkcISq18BQoKUUTW^>urh|hkJ5bIdh(W1dpYSl&BWp->NG(D_K5jXzkCjOE))9}v%
zlZ-h^fA_Pn7To1u{>-V2E#&DNv+45hfs=%o{Cgn%fMBd(l;;s73Hzem+&h*qFeAU4
zQ3XT(5g{-CKDsT3&o$!$AnLF1G+BQ^A4j4e*AJ1Ztham#QAM78A^(=n$XNw^8&Zk#
zAtWY#OK9Cd*3ZLD`3!zN%g(UwE!RM?1+SnGn_O8z4Keh`dJ8{!p+wn00Kx^rf|2hJ
zkb40O0xwY-aGyv&e4%2=?kL^!J3y9_LMxK2H{oX4*9@d!E{BVM2c*!*8A1~&-BxfG
zNYSkjpsSdbl-ve-#$C9TevWQDMmHpF=6A}>q7+dxns7NDq%wO2MIVA8<zgP)OVLYt
z)P^L*PSK*hxY0$KyaqaP!b7Nr;&>kM2|Fq3q$vN>vEseQseu{Uk)b?Lo*|j0a*@!@
zlYu<&7;yCp%Cm`zOhNq`XQ|vCVqfK|#NLm~Mo&z+_6pqJ##P}GCTD@6@Eu4`3Nd9l
zIgXAln0%9vn4&DRtYK4>PRm-{ud%Fyz?o8=%E6}A;zpNpnRv2m0prQy-T=561siEm
zN`+!8s!%2=*7CE<XO&MzyTuP&tmRisB#^9BD1awWvJF2fu*^l`{0R#tv`)Tax&^-m
zuvW4O6E2@%Me{GXfVEEMD$G{2tZ2ld*$B)Qb3;N8zm%~~&DzWwDI76dq$sVE6&Nl)
z55`MW<VV?@ai((Nbb?C@dA1YGC0mH!6HGjlT?kTUXID%&%6#E;%DrLQ*BgPC+m$hJ
zGN=}2%VR;6Z6k(>t(X9pOe!Y9Nk}O%oIxp*(|0OJ!{(?j-idvF)4tpY|C|YsRVYq~
zPo<J9oE|ARN;9UXt_5o0JEs)oqw-RwAdn#TizQYk{DD|P?2TsFE{-k!_73BxFlSig
zNio)W%2BKBk}I?YSDtrXJYlun7O>!C0%a!5SQ-6Lhb)(6ErVwv?2Whkki^){xGao@
z!T+)VzIZY;lD`~jtvi^~8lQiu1xFw#><o(>ajidRilqP;{VdVSt`j+oDoAxI5f>-0
zbo+dA7kDC-2pC(yt1qh#MSQ&}e}J(-Z?I~l+}Nh?4+QN!(i^gqT6Z$VI%$tJlC)0J
z_&sTjEVNF#a><R%Qeb_DZoBWb%${y7weu45A@dX0KQdB(Ir3XCRNUO*sJk5ft(OQ)
zoX&t<+;rAL2xkCL6}&hnKwr(z0Pwb6jUxg4*o%lt`<bstk25aA_!$Lw<7!P#r|}xq
zs=~=sB(SJE5DCOXKCpV+*R#ktvPP@w=Krs*(W+o@5jM|5uPO*yKM#(>@iqalVPk)0
zRc|2Do$OiU0F;W{9*X#@l6n_5s)N@P;&_H=DuEsOqZ82xaex3O8i}4N><CQ;e6&fD
z8;*Aes&Ixv6?R=hDv~5yaU2DK%5|ft$sX{Ax=@V9QBs^tfPx2Ow4IbnK;_{!aY{il
zzIu_f?o#JmvJV4Q*fJW4f{8?-Nd?JT0@&W#gnJG?l0&jLQH9<3f!;;*2ts;qRbW-X
zhbKZ0Mm!NH$CGr{kLK#oYV32Jn7fP;pko_wdP0mkV>0BciU)A`1$vU{Ig@pBW9_91
z(#w}eMsqlT0#&BXp`$xHj&<N&O^<i`#!jV%NG-&hWYs;<aG+Z6N`-p;)$v%^h%_#1
z^)y#^<H!MDRgHa)vl?xXiPI-wcBLExyxv$mkkcqnFAsMNvK-ZZ4R7%pr6P$?cO*a?
zat;5DH`-6uXvc1qyfwGL+oPY2;cyJ3BY}Q$N<kuWRF<QY9F673B;_PEv`6%%-uJ}(
zL+brd@KR4wn}{?Bs<34;8p}dA$!JPI1?-5j?7M==lGAN<Hi*QypbEQLM$NW5KY^`l
zq^)Gu7CMg+%(kvM&6Z0@^=1|6tvxD85w}1s%<)|r4+>0S9iw#qEJKrk0``aObCtrh
z>KF^hbYx8YeBcV3$nLMu5E}`6hK{MPTvJ>wl87oed8k3xI6~#1a1}1XSLhmI#>|rF
z^|&zK$^MR$%12=ZE(`3--~BEn5aAYFWamL$LyY1^SjFR$vcLC~b^;JkEK`~M9w6~`
zkRQtrU(SPni`$=^{XMF*^X)wHkLJO@YNhgw_*1Sc|C#J#QfOPDAvO|<sDhU1+27sD
zl-pp3Sw{RO@r?rC*Q7~^)7>1O!g9ZVTo3yeOL_Kpuu_T653^X*?C)VE{ykp))64lY
z2u#BB%3TG$DR8+4#gDsrcvs?1jq`GGbWGrTFEDVvFCW-PxHL8u4a2~}@^B8yS%K3V
z<w8~Z^Ag~bq>za&0&W|_4xO*2ndD?N4xIWxZn!%S{`EZg_koX99@<+s7QQPF{?k18
zzC8GQdGL?(;1yWyja8oW^5Apx;4OLZ<s2`=d5{`+=|=Dr?hyD2@!a{3tAT%=<6LCa
z?K^q!NAlp`&x8Ma9{k6^$2w2$l|1sl%!9ur><s7{FTWf&XxH>uem;~3|2Pj$d$Y%~
zQ<evxk_Vrj2cMM(ug-(d1wPifdJ8x{@?MY+bBVzB2>(dj!|j(bx!<D!?UjPgaU*WN
zhw|Xeju~Yj9%J?h78`a~V=ERa-OTP^9Z9SXGdq?ycJaY;6yuE>Y!4PHhIA4Lrt3PX
zidd{SiCC1M&Snqvi1YD!{E(C!vnK<qAjN0(cD{zQbG4BvGvlzaf?)PhGd7;H9u8p=
z-yMY&Bd2IM=8fX)xTsD`BRj2gLV*O$_T(bSD;GgnBN~aMZ)A)^<e1%uGwqNBi-E3G
zH}a;Fc7Pi!KjBX%b#=u9D`g6;0%aU)FBu)JPUrdA;a#i<L3`+IJsvZtTsY*cfUsie
z!a0Kxz8Gq8dlxTtwY7LXtsR|SI)l(3HCDC80s^ZqtX=##+Rp2ZcHM$SB+g|sk_=g~
zSOyW2avjD_KSd`J^@hWG3>nAF>n;=st2LnZvHk7k#YJ|sMDmWT(2Ys93FeQUpR7oa
zPafd6#B+6{%R?vbbhLDOJ6(;Q7Swa}YF-%O7UkKDW*a#NHS&z)cLIMR>cz=>ek>3F
zU!36Ry}>z`9AGq!W_Y^YSwjaPc2P%3I_r8FL~4CFdny_R=&bW5QsMTM9p3g(hr>&U
zDgN&}0n+Ogjn^BD#?eB0sq3L8qmG3fW6eDZqHhvj@8~Yqa#b_BUA~sjHJY!#%GK7|
zWFCb7f;bKDbm29}gFH7|o~JGMZ;50sE8|3IKU*;_slVBxfg3r`+2A<DZ*ra}5nBID
zz$NwNKDKQF7{L350_8#Fl<#qTvE^X~?pO;)rM}!Z7ArE++YN#bnMROulF|oM+G8g9
zfg+6y4ha1Q&W+SDxBT=6f@3CqR;(EsRA2;$L9+SF@(Y;%g^bE3^OyVd97132)06Vz
zbu3xdh8UGe>dSqH8aWk#)<YCz{bl~rf6I|Z`i&%nAon5e5&BY*^zkwNH2>?ss7y7u
zD9H7|UfNlUkdOs4Sj<sIhY(oOmtyjqae4lD=V)<WRx@;aj=o$k-%mSr5%`W}Bg*fx
z{u0`N2<0yxclI22dA|F-q8?H{xBOqo(U;$G?mN%Wp!RJJx$S2Q63zL`?>>(Sed=H4
zkgNX{B%1XHoEjG&5c)RhSYaf${PzQ=D$4dD_jSs1<8%Ebc}X9Ju4>Yk>)(Ad0U}5_
zgB#%`z6}XfCYk^KCXI{k|AhM6Ax+Ot>dSSuJTL!2QGd~$vgMI}*aKbipVXJ<wBCII
zK!n)_$=+qTCGZnO%=+1Na)Tiu^O5qo`ajRnm){|5LSO1hN_m;X5E#Px1emKY^Dh(W
zx%rns*<5~Yg~lbd219|)fiZ_%efl#RVbXsMy2i!pU7XN4*rS3rio@U9pifyia&W2t
zogAEGDM&g)=o{$BeZNb@rJ!J?Kcu8RpMIkuv0k(vDW7AG&CjF1qQ%glvlh)EhsC)v
zBXu#$9bRm3Z{{fEu&h()=lbtwZbQE_XTXs9lG67k88_$epcf0mdVU|d<@zt2M}M2p
zzdXly4$D>n$Z;{x>@a<}M9@UXrN6kJjZ3ya`TklDecAR%UIv@^)O&AEhB++zVjlg6
O=*5VzH^+Fc{(k~WGn2>w

literal 0
HcmV?d00001

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 1853aed..99fc950 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -14,6 +14,7 @@
   #include <cuda_bf16.h>
   #define bfloat16 nv_bfloat16
 #elif USE_ROCM
+  #define __HIP_PLATFORM_AMD__
   #include <hip/hip_bfloat16.h>
   #include <hip/hip_runtime.h>
   #include <hip/hip_runtime_api.h>

From 79e7570c80271c3938c313c6aca1da20cb96b99f Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Sun, 14 Apr 2024 13:38:57 -0700
Subject: [PATCH 35/52] fix Makefiles

---
 mpi/Makefile  | 13 +------------
 nccl/Makefile |  5 -----
 rccl/Makefile |  5 -----
 3 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/mpi/Makefile b/mpi/Makefile
index 3efbe3f..12ed3bf 100644
--- a/mpi/Makefile
+++ b/mpi/Makefile
@@ -6,25 +6,14 @@
 CC = cc
 
 # perlmutter flags
-<<<<<<< HEAD
 INC = -I/global/common/software/nersc9/nccl/2.19.4/include
 CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
 LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 # frontier flags
 # INC = -I${ROCM_PATH}/include
-# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
 # LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
-=======
-# INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
-# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
-# frontier flags
-INC = -I${ROCM_PATH}/include
-CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
-LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl 
->>>>>>> origin/frontier
 
 all: allgather.x allreduce.x reduce_scatter.x
 
diff --git a/nccl/Makefile b/nccl/Makefile
index 5652112..d4423b4 100644
--- a/nccl/Makefile
+++ b/nccl/Makefile
@@ -10,11 +10,6 @@ INC = -I/global/common/software/nersc9/nccl/2.19.4/include
 CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
 LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
-# frontier flags
-# INC = -I${ROCM_PATH}/include
-# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
-# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
-
 all: allgather.x allreduce.x reduce_scatter.x
 
 allgather.x: ../allgather.cu
diff --git a/rccl/Makefile b/rccl/Makefile
index 590dee7..aa0a7b9 100644
--- a/rccl/Makefile
+++ b/rccl/Makefile
@@ -5,11 +5,6 @@
 
 CC = cc
 
-# perlmutter flags
-# INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
 # frontier flags
 INC = -I${ROCM_PATH}/include
 CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL

From 0cd86f0b6518f895bfb0d2aca42905d900a038c0 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Sun, 14 Apr 2024 17:28:06 -0400
Subject: [PATCH 36/52] add benchmark code for all-reduce and reduce-scatter

---
 mpi/Makefile                               | 12 ++++++------
 mpi/all-reduce/frontier/128_gcd_run.sh     | 21 +++++++++++++++++++++
 mpi/all-reduce/frontier/16_gcd_run.sh      | 21 +++++++++++++++++++++
 mpi/all-reduce/frontier/32_gcd_run.sh      | 21 +++++++++++++++++++++
 mpi/all-reduce/frontier/64_gcd_run.sh      | 21 +++++++++++++++++++++
 mpi/reduce-scatter/frontier/128_gcd_run.sh | 21 +++++++++++++++++++++
 mpi/reduce-scatter/frontier/16_gcd_run.sh  | 21 +++++++++++++++++++++
 mpi/reduce-scatter/frontier/32_gcd_run.sh  | 21 +++++++++++++++++++++
 mpi/reduce-scatter/frontier/64_gcd_run.sh  | 21 +++++++++++++++++++++
 9 files changed, 174 insertions(+), 6 deletions(-)
 create mode 100644 mpi/all-reduce/frontier/128_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/16_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/32_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/64_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/128_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/16_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/32_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/64_gcd_run.sh

diff --git a/mpi/Makefile b/mpi/Makefile
index 12ed3bf..28861d4 100644
--- a/mpi/Makefile
+++ b/mpi/Makefile
@@ -6,14 +6,14 @@
 CC = cc
 
 # perlmutter flags
-INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
-LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+# INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
 
 # frontier flags
-# INC = -I${ROCM_PATH}/include
-# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
-# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+INC = -I${ROCM_PATH}/include
+CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
+LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
 
 all: allgather.x allreduce.x reduce_scatter.x
 
diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh
new file mode 100644
index 0000000..5c6baf5
--- /dev/null
+++ b/mpi/all-reduce/frontier/128_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 16
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh
new file mode 100644
index 0000000..e1ad604
--- /dev/null
+++ b/mpi/all-reduce/frontier/16_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 2
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh
new file mode 100644
index 0000000..be7bdd9
--- /dev/null
+++ b/mpi/all-reduce/frontier/32_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 4
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh
new file mode 100644
index 0000000..a8e13d2
--- /dev/null
+++ b/mpi/all-reduce/frontier/64_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 8
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh
new file mode 100644
index 0000000..b6505f8
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/128_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 16
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh
new file mode 100644
index 0000000..eb6b2ba
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/16_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 2
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh
new file mode 100644
index 0000000..4ed3437
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/32_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 4
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh
new file mode 100644
index 0000000..a5a9957
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/64_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 8
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x

From 032011775eaf727fa38d07fcfebf944484e76c8b Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 15 Apr 2024 03:23:10 -0400
Subject: [PATCH 37/52] add results of MPI on Frontier so far

---
 mpi/all-reduce/frontier/benchmarks/16_gcd.txt     | 12 ++++++++++++
 mpi/all-reduce/frontier/benchmarks/32_gcd.txt     | 14 ++++++++++++++
 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt | 13 +++++++++++++
 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt | 15 +++++++++++++++
 4 files changed, 54 insertions(+)
 create mode 100644 mpi/all-reduce/frontier/benchmarks/16_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/32_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt

diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
new file mode 100644
index 0000000..609afbd
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
@@ -0,0 +1,12 @@
+srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10
+ 0: Local data size: 1024
+ 0: Global data size: 1024
+ 0: Number of GPUs: 16
+ 0: Message size range: 33554432 - 1073741824
+ 0: Number of iterations: 10
+ 0: 33554432 0.133082 seconds
+ 0: 67108864 0.267616 seconds
+ 0: 134217728 0.634895 seconds
+ 0: 268435456 1.928400 seconds
+ 0: 536870912 3.973167 seconds
+ 0: 1073741824 7.913018 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
new file mode 100644
index 0000000..b92c437
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10
+ 0: Local data size: 1024
+ 0: Global data size: 1024
+ 0: Number of GPUs: 32
+ 0: Message size range: 8388608 - 1073741824
+ 0: Number of iterations: 10
+ 0: 8388608 0.043066 seconds
+ 0: 16777216 0.084259 seconds
+ 0: 33554432 0.167705 seconds
+ 0: 67108864 0.336696 seconds
+ 0: 134217728 0.773389 seconds
+ 0: 268435456 2.284815 seconds
+ 0: 536870912 4.693147 seconds
+ 0: 1073741824 9.356859 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
new file mode 100644
index 0000000..fa9c67a
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 10
+ 0: Local data size: 2048
+ 0: Global data size: 2048
+ 0: Number of GPUs: 16
+ 0: Message size range: 33554432 - 2147483648
+ 0: Number of iterations: 10
+ 0: 33554432 5.091016 seconds
+ 0: 67108864 5.092117 seconds
+ 0: 134217728 5.082377 seconds
+ 0: 268435456 5.103443 seconds
+ 0: 536870912 5.102289 seconds
+ 0: 1073741824 5.116191 seconds
+ 0: 2147483648 5.115768 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
new file mode 100644
index 0000000..23a0ace
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
@@ -0,0 +1,15 @@
+srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10
+ 0: Local data size: 2048
+ 0: Global data size: 2048
+ 0: Number of GPUs: 32
+ 0: Message size range: 8388608 - 2147483648
+ 0: Number of iterations: 10
+ 0: 8388608 5.006776 seconds
+ 0: 16777216 4.981770 seconds
+ 0: 33554432 5.014587 seconds
+ 0: 67108864 4.994224 seconds
+ 0: 134217728 4.977063 seconds
+ 0: 268435456 4.980235 seconds
+ 0: 536870912 5.007770 seconds
+ 0: 1073741824 5.013561 seconds
+ 0: 2147483648 5.015718 seconds

From 7752cedddba582d6f4ddbbd68f764d7e4d035995 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Mon, 15 Apr 2024 12:39:44 -0400
Subject: [PATCH 38/52] add 64 gcd data for MPI

---
 mpi/all-gather/frontier/benchmarks/64_gcd.txt   | 14 ++++++++++++++
 mpi/all-reduce/frontier/benchmarks/64_gcd.txt   | 13 +++++++++++++
 .../frontier/benchmarks/64_gcd.txt              | 17 +++++++++++++++++
 3 files changed, 44 insertions(+)
 create mode 100644 mpi/all-gather/frontier/benchmarks/64_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/64_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt

diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt
new file mode 100644
index 0000000..3eed822
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/64_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10
+ 0: Local data size: 32
+ 0: Global data size: 2048
+ 0: Number of GPUs: 64
+ 0: Message size range: 262144 - 33554432
+ 0: Number of iterations: 10
+ 0: 262144 0.001685 seconds
+ 0: 524288 0.003350 seconds
+ 0: 1048576 0.003938 seconds
+ 0: 2097152 0.006864 seconds
+ 0: 4194304 0.013037 seconds
+ 0: 8388608 0.025167 seconds
+ 0: 16777216 0.049414 seconds
+ 0: 33554432 0.211224 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
new file mode 100644
index 0000000..122c83e
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10
+ 0: Local data size: 1024
+ 0: Global data size: 1024
+ 0: Number of GPUs: 64
+ 0: Message size range: 16777216 - 1073741824
+ 0: Number of iterations: 10
+ 0: 16777216 0.101777 seconds
+ 0: 33554432 0.203258 seconds
+ 0: 67108864 0.406569 seconds
+ 0: 134217728 0.913391 seconds
+ 0: 268435456 2.633732 seconds
+ 0: 536870912 5.375804 seconds
+ 0: 1073741824 10.708706 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
new file mode 100644
index 0000000..560c383
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
@@ -0,0 +1,17 @@
+srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10
+ 0: Local data size: 2048
+ 0: Global data size: 2048
+ 0: Number of GPUs: 64
+ 0: Message size range: 16777216 - 2147483648
+ 0: Number of iterations: 10
+ 0: 16777216 5.006610 seconds
+ 0: 33554432 4.998351 seconds
+ 0: 67108864 5.003749 seconds
+ 0: 134217728 5.066133 seconds
+ 0: 268435456 4.980950 seconds
+ 0: 536870912 4.982830 seconds
+ 0: 1073741824 5.023178 seconds
+ 0: 2147483648 4.988750 seconds
+ 0: 
+ 0: MPICH Slingshot Network Summary: 4 network timeouts
+ 0: 

From 4d5a82721db14ffe7d5bac7be4b5750a3e1779b4 Mon Sep 17 00:00:00 2001
From: Aditya Tomar <aditya26042005@gmail.com>
Date: Tue, 16 Apr 2024 02:39:12 -0400
Subject: [PATCH 39/52] add 128 gcd numbers for MPI on Frontier

---
 mpi/all-gather/frontier/benchmarks/128_gcd.txt     | 13 +++++++++++++
 mpi/all-reduce/frontier/benchmarks/128_gcd.txt     | 12 ++++++++++++
 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt | 13 +++++++++++++
 3 files changed, 38 insertions(+)
 create mode 100644 mpi/all-gather/frontier/benchmarks/128_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/128_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt

diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt
new file mode 100644
index 0000000..824b380
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/128_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10
+  0: Local data size: 16
+  0: Global data size: 2048
+  0: Number of GPUs: 128
+  0: Message size range: 262144 - 16777216
+  0: Number of iterations: 10
+  0: 262144 0.003748 seconds
+  0: 524288 0.005048 seconds
+  0: 1048576 0.008068 seconds
+  0: 2097152 0.014084 seconds
+  0: 4194304 0.026981 seconds
+  0: 8388608 0.051879 seconds
+  0: 16777216 0.255600 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
new file mode 100644
index 0000000..56c18aa
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
@@ -0,0 +1,12 @@
+srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10
+  0: Local data size: 1024
+  0: Global data size: 1024
+  0: Number of GPUs: 128
+  0: Message size range: 33554432 - 1073741824
+  0: Number of iterations: 10
+  0: 33554432 0.240206 seconds
+  0: 67108864 0.476990 seconds
+  0: 134217728 1.041500 seconds
+  0: 268435456 2.951969 seconds
+  0: 536870912 5.990606 seconds
+  0: 1073741824 12.004613 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
new file mode 100644
index 0000000..af5e98a
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10
+  0: Local data size: 2048
+  0: Global data size: 2048
+  0: Number of GPUs: 128
+  0: Message size range: 33554432 - 2147483648
+  0: Number of iterations: 10
+  0: 33554432 5.046207 seconds
+  0: 67108864 5.031027 seconds
+  0: 134217728 5.063647 seconds
+  0: 268435456 5.054240 seconds
+  0: 536870912 5.047598 seconds
+  0: 1073741824 5.051536 seconds
+  0: 2147483648 5.057082 seconds

From dffbac0f00eb291a78a6e5086e87ea8eb233049c Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Thu, 11 Jul 2024 17:13:17 -0700
Subject: [PATCH 40/52] use latest nccl

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a1fdcdb..526fb95 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@ Before compiling do these:
 
 ### Perlmutter
 ```sh
-module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl/2.19.4
+module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl
 export CRAY_ACCEL_TARGET=nvidia80
 export MPICH_GPU_SUPPORT_ENABLED=1
 ```

From 7ca3d66301fb92049dd797e3406ae27c0b8169cc Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Thu, 11 Jul 2024 17:17:30 -0700
Subject: [PATCH 41/52] update .gitignore to ignore .x and .out files

---
 LICENSE                                       |  20 --
 README.md                                     |  15 -
 allgather.cu                                  | 248 ----------------
 allreduce.cu                                  | 262 -----------------
 mpi/Makefile                                  |  30 --
 mpi/all-gather/allgather.x                    | Bin 25696 -> 0 bytes
 mpi/all-gather/frontier/128_gcd_run.sh        |  21 --
 mpi/all-gather/frontier/16_gcd_run.sh         |  21 --
 mpi/all-gather/frontier/32_gcd_run.sh         |  21 --
 mpi/all-gather/frontier/64_gcd_run.sh         |  21 --
 mpi/all-gather/frontier/8_gcd_run.sh          |  21 --
 .../frontier/benchmarks/128_gcd.txt           |  13 -
 mpi/all-gather/frontier/benchmarks/16_gcd.txt |  13 -
 mpi/all-gather/frontier/benchmarks/32_gcd.txt |  15 -
 mpi/all-gather/frontier/benchmarks/64_gcd.txt |  14 -
 mpi/all-gather/frontier/benchmarks/8_gcd.txt  |  14 -
 mpi/all-gather/perlmutter/128_gpu_run.sh      |  37 ---
 mpi/all-gather/perlmutter/16_gpu_run.sh       |  37 ---
 mpi/all-gather/perlmutter/32_gpu_run.sh       |  37 ---
 mpi/all-gather/perlmutter/64_gpu_run.sh       |  37 ---
 mpi/all-gather/perlmutter/8_gpu_run.sh        |  37 ---
 .../perlmutter/benchmarks/128_gpu.txt         |  12 -
 .../perlmutter/benchmarks/16_gpu.txt          |  12 -
 .../perlmutter/benchmarks/32_gpu.txt          |  14 -
 .../perlmutter/benchmarks/64_gpu.txt          |  13 -
 .../perlmutter/benchmarks/8_gpu.txt           |  13 -
 mpi/all-reduce/allreduce.x                    | Bin 25832 -> 0 bytes
 mpi/all-reduce/frontier/128_gcd_run.sh        |  21 --
 mpi/all-reduce/frontier/16_gcd_run.sh         |  21 --
 mpi/all-reduce/frontier/32_gcd_run.sh         |  21 --
 mpi/all-reduce/frontier/64_gcd_run.sh         |  21 --
 mpi/all-reduce/frontier/8_gcd_run.sh          |  21 --
 .../frontier/benchmarks/128_gcd.txt           |  12 -
 mpi/all-reduce/frontier/benchmarks/16_gcd.txt |  12 -
 mpi/all-reduce/frontier/benchmarks/32_gcd.txt |  14 -
 mpi/all-reduce/frontier/benchmarks/64_gcd.txt |  13 -
 mpi/all-reduce/frontier/benchmarks/8_gcd.txt  |  13 -
 mpi/all-reduce/perlmutter/128_gpu_run.sh      |  37 ---
 mpi/all-reduce/perlmutter/16_gpu_run.sh       |  37 ---
 mpi/all-reduce/perlmutter/32_gpu_run.sh       |  37 ---
 mpi/all-reduce/perlmutter/64_gpu_run.sh       |  37 ---
 mpi/all-reduce/perlmutter/8_gpu_run.sh        |  37 ---
 .../perlmutter/benchmarks/128_gpu.txt         |  11 -
 .../perlmutter/benchmarks/16_gpu.txt          |  11 -
 .../perlmutter/benchmarks/32_gpu.txt          |  13 -
 .../perlmutter/benchmarks/64_gpu.txt          |  12 -
 .../perlmutter/benchmarks/8_gpu.txt           |  12 -
 mpi/reduce-scatter/frontier/128_gcd_run.sh    |  21 --
 mpi/reduce-scatter/frontier/16_gcd_run.sh     |  21 --
 mpi/reduce-scatter/frontier/32_gcd_run.sh     |  21 --
 mpi/reduce-scatter/frontier/64_gcd_run.sh     |  21 --
 mpi/reduce-scatter/frontier/8_gcd_run.sh      |  21 --
 .../frontier/benchmarks/128_gcd.txt           |  13 -
 .../frontier/benchmarks/16_gcd.txt            |  13 -
 .../frontier/benchmarks/32_gcd.txt            |  15 -
 .../frontier/benchmarks/64_gcd.txt            |  17 --
 .../frontier/benchmarks/8_gcd.txt             |  14 -
 mpi/reduce-scatter/perlmutter/128_gpu_run.sh  |  37 ---
 mpi/reduce-scatter/perlmutter/16_gpu_run.sh   |  37 ---
 mpi/reduce-scatter/perlmutter/32_gpu_run.sh   |  37 ---
 mpi/reduce-scatter/perlmutter/64_gpu_run.sh   |  37 ---
 mpi/reduce-scatter/perlmutter/8_gpu_run.sh    |  37 ---
 .../perlmutter/benchmarks/128_gpu.txt         |  12 -
 .../perlmutter/benchmarks/16_gpu.txt          |  12 -
 .../perlmutter/benchmarks/32_gpu.txt          |  14 -
 .../perlmutter/benchmarks/64_gpu.txt          |  13 -
 .../perlmutter/benchmarks/8_gpu.txt           |  13 -
 mpi/reduce-scatter/reduce_scatter.x           | Bin 25888 -> 0 bytes
 nccl/Makefile                                 |  25 --
 nccl/all-gather/128_gpu_run.sh                |  37 ---
 nccl/all-gather/16_gpu_run.sh                 |  37 ---
 nccl/all-gather/32_gpu_run.sh                 |  37 ---
 nccl/all-gather/64_gpu_run.sh                 |  37 ---
 nccl/all-gather/8_gpu_run.sh                  |  37 ---
 nccl/all-gather/benchmarks/128_gpu.txt        |  13 -
 nccl/all-gather/benchmarks/16_gpu.txt         |  13 -
 nccl/all-gather/benchmarks/32_gpu.txt         |  14 -
 nccl/all-gather/benchmarks/64_gpu.txt         |  13 -
 nccl/all-gather/benchmarks/8_gpu.txt          |  13 -
 nccl/all-reduce/128_gpu_run.sh                |  37 ---
 nccl/all-reduce/16_gpu_run.sh                 |  37 ---
 nccl/all-reduce/32_gpu_run.sh                 |  37 ---
 nccl/all-reduce/64_gpu_run.sh                 |  37 ---
 nccl/all-reduce/8_gpu_run.sh                  |  37 ---
 nccl/all-reduce/benchmarks/128_gpu.txt        |  12 -
 nccl/all-reduce/benchmarks/16_gpu.txt         |  12 -
 nccl/all-reduce/benchmarks/32_gpu.txt         |  14 -
 nccl/all-reduce/benchmarks/64_gpu.txt         |  13 -
 nccl/all-reduce/benchmarks/8_gpu.txt          |  13 -
 nccl/reduce-scatter/128_gpu_run.sh            |  37 ---
 nccl/reduce-scatter/16_gpu_run.sh             |  37 ---
 nccl/reduce-scatter/32_gpu_run.sh             |  37 ---
 nccl/reduce-scatter/64_gpu_run.sh             |  37 ---
 nccl/reduce-scatter/8_gpu_run.sh              |  37 ---
 nccl/reduce-scatter/benchmarks/128_gpu.txt    |  12 -
 nccl/reduce-scatter/benchmarks/16_gpu.txt     |  12 -
 nccl/reduce-scatter/benchmarks/32_gpu.txt     |  14 -
 nccl/reduce-scatter/benchmarks/64_gpu.txt     |  13 -
 nccl/reduce-scatter/benchmarks/8_gpu.txt      |  13 -
 rccl/Makefile                                 |  25 --
 rccl/all-gather/allgather.x                   | Bin 25736 -> 0 bytes
 rccl/all-reduce/allreduce.x                   | Bin 25840 -> 0 bytes
 rccl/reduce-scatter/reduce_scatter.x          | Bin 25848 -> 0 bytes
 reduce_scatter.cu                             | 269 ------------------
 104 files changed, 2905 deletions(-)
 delete mode 100644 LICENSE
 delete mode 100644 README.md
 delete mode 100644 allgather.cu
 delete mode 100644 allreduce.cu
 delete mode 100644 mpi/Makefile
 delete mode 100755 mpi/all-gather/allgather.x
 delete mode 100644 mpi/all-gather/frontier/128_gcd_run.sh
 delete mode 100644 mpi/all-gather/frontier/16_gcd_run.sh
 delete mode 100644 mpi/all-gather/frontier/32_gcd_run.sh
 delete mode 100644 mpi/all-gather/frontier/64_gcd_run.sh
 delete mode 100644 mpi/all-gather/frontier/8_gcd_run.sh
 delete mode 100644 mpi/all-gather/frontier/benchmarks/128_gcd.txt
 delete mode 100644 mpi/all-gather/frontier/benchmarks/16_gcd.txt
 delete mode 100644 mpi/all-gather/frontier/benchmarks/32_gcd.txt
 delete mode 100644 mpi/all-gather/frontier/benchmarks/64_gcd.txt
 delete mode 100644 mpi/all-gather/frontier/benchmarks/8_gcd.txt
 delete mode 100644 mpi/all-gather/perlmutter/128_gpu_run.sh
 delete mode 100644 mpi/all-gather/perlmutter/16_gpu_run.sh
 delete mode 100644 mpi/all-gather/perlmutter/32_gpu_run.sh
 delete mode 100644 mpi/all-gather/perlmutter/64_gpu_run.sh
 delete mode 100644 mpi/all-gather/perlmutter/8_gpu_run.sh
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
 delete mode 100755 mpi/all-reduce/allreduce.x
 delete mode 100644 mpi/all-reduce/frontier/128_gcd_run.sh
 delete mode 100644 mpi/all-reduce/frontier/16_gcd_run.sh
 delete mode 100644 mpi/all-reduce/frontier/32_gcd_run.sh
 delete mode 100644 mpi/all-reduce/frontier/64_gcd_run.sh
 delete mode 100644 mpi/all-reduce/frontier/8_gcd_run.sh
 delete mode 100644 mpi/all-reduce/frontier/benchmarks/128_gcd.txt
 delete mode 100644 mpi/all-reduce/frontier/benchmarks/16_gcd.txt
 delete mode 100644 mpi/all-reduce/frontier/benchmarks/32_gcd.txt
 delete mode 100644 mpi/all-reduce/frontier/benchmarks/64_gcd.txt
 delete mode 100644 mpi/all-reduce/frontier/benchmarks/8_gcd.txt
 delete mode 100644 mpi/all-reduce/perlmutter/128_gpu_run.sh
 delete mode 100644 mpi/all-reduce/perlmutter/16_gpu_run.sh
 delete mode 100644 mpi/all-reduce/perlmutter/32_gpu_run.sh
 delete mode 100644 mpi/all-reduce/perlmutter/64_gpu_run.sh
 delete mode 100644 mpi/all-reduce/perlmutter/8_gpu_run.sh
 delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
 delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
 delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
 delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
 delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
 delete mode 100644 mpi/reduce-scatter/frontier/128_gcd_run.sh
 delete mode 100644 mpi/reduce-scatter/frontier/16_gcd_run.sh
 delete mode 100644 mpi/reduce-scatter/frontier/32_gcd_run.sh
 delete mode 100644 mpi/reduce-scatter/frontier/64_gcd_run.sh
 delete mode 100644 mpi/reduce-scatter/frontier/8_gcd_run.sh
 delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
 delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
 delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
 delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
 delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/128_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/perlmutter/16_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/perlmutter/32_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/perlmutter/64_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/perlmutter/8_gpu_run.sh
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
 delete mode 100755 mpi/reduce-scatter/reduce_scatter.x
 delete mode 100644 nccl/Makefile
 delete mode 100644 nccl/all-gather/128_gpu_run.sh
 delete mode 100644 nccl/all-gather/16_gpu_run.sh
 delete mode 100644 nccl/all-gather/32_gpu_run.sh
 delete mode 100644 nccl/all-gather/64_gpu_run.sh
 delete mode 100644 nccl/all-gather/8_gpu_run.sh
 delete mode 100644 nccl/all-gather/benchmarks/128_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/16_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/32_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/64_gpu.txt
 delete mode 100644 nccl/all-gather/benchmarks/8_gpu.txt
 delete mode 100644 nccl/all-reduce/128_gpu_run.sh
 delete mode 100644 nccl/all-reduce/16_gpu_run.sh
 delete mode 100644 nccl/all-reduce/32_gpu_run.sh
 delete mode 100644 nccl/all-reduce/64_gpu_run.sh
 delete mode 100644 nccl/all-reduce/8_gpu_run.sh
 delete mode 100644 nccl/all-reduce/benchmarks/128_gpu.txt
 delete mode 100644 nccl/all-reduce/benchmarks/16_gpu.txt
 delete mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt
 delete mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt
 delete mode 100644 nccl/all-reduce/benchmarks/8_gpu.txt
 delete mode 100644 nccl/reduce-scatter/128_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/16_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/32_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/64_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/8_gpu_run.sh
 delete mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt
 delete mode 100644 rccl/Makefile
 delete mode 100755 rccl/all-gather/allgather.x
 delete mode 100755 rccl/all-reduce/allreduce.x
 delete mode 100755 rccl/reduce-scatter/reduce_scatter.x
 delete mode 100644 reduce_scatter.cu

diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 9943369..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,20 +0,0 @@
-Copyright (c) 2024, Parallel Software and Systems Group, University of
-Maryland.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the "Software"),
-to deal in the Software without restriction, including without limitation
-the rights to use, copy, modify, merge, publish, distribute, sublicense,
-and/or sell copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
deleted file mode 100644
index 526fb95..0000000
--- a/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-Before compiling do these:
-
-### Perlmutter
-```sh
-module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl
-export CRAY_ACCEL_TARGET=nvidia80
-export MPICH_GPU_SUPPORT_ENABLED=1
-```
-### Frontier
-```sh
-module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05
-export MPICH_GPU_SUPPORT_ENABLED=1
-export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
-```
-
diff --git a/allgather.cu b/allgather.cu
deleted file mode 100644
index 8c357bb..0000000
--- a/allgather.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/* \file allgather.cu
- * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
- * See the top-level LICENSE file for details.
- * 
- * SPDX-License-Identifier: MIT
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <stdint.h>
-
-#ifdef USE_CUDA
-  #include <cuda_bf16.h>
-  #define bfloat16 nv_bfloat16
-#elif USE_ROCM
-  #define __HIP_PLATFORM_AMD__
-  #include <hip/hip_bfloat16.h>
-  #include <hip/hip_runtime.h>
-  #include <hip/hip_runtime_api.h>
-  #define bfloat16 hip_bfloat16
-#endif
-
-#ifdef USE_NCCL
-  #include "nccl.h"
-#elif USE_RCCL
-  #include <rccl/rccl.h> 
-#endif
-
-#define NUM_WARMUP_ITERATIONS		5
-
-#define MPI_CHECK(cmd) do {                         \
-  int64_t e = cmd;                                      \
-  if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%ld'\n",        \
-        __FILE__,__LINE__, e);                      \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define CUDA_CHECK(cmd) do {                        \
-  cudaError_t e = cmd;                              \
-  if(e != cudaSuccess) {                            \
-    printf("CUDA error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, cudaGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define HIP_CHECK(cmd) do {                        \
-  hipError_t e = cmd;                              \
-  if(e != hipSuccess) {                            \
-    printf("HIP error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, hipGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-// NCCL_CHECK is used to validate RCCL functions as well
-#define NCCL_CHECK(cmd) do {                        \
-  ncclResult_t e = cmd;                             \
-  if (e != ncclSuccess) {                           \
-    printf("NCCL error %s:%d %s\n",                 \
-        __FILE__, __LINE__, ncclGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-void initializeData(bfloat16 *data, int64_t size) {
-    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
-        #ifdef USE_CUDA
-        data[i] = __float2bfloat16((float)i);
-        #elif USE_ROCM
-        // ROCm doesn't have a float2bfloat16 method
-        data[i] = (bfloat16) ((float) i);
-        #endif
-    }
-}
-
-int main(int argc, char *argv[]) {
-    if (argc != 5) {
-        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
-        return EXIT_FAILURE;
-    }
-
-    int num_gpus = atoi(argv[1]);
-    int64_t min_msg_size = atoi(argv[2]);
-    int64_t max_msg_size = atoi(argv[3]);
-    int iterations = atoi(argv[4]);
-
-    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
-        fprintf(stderr, "Invalid input parameters.\n");
-        return EXIT_FAILURE;
-    }
-
-    int my_rank, num_pes;
-    int num_gpus_per_node;
-    int msg_count;
-
-    MPI_Init(&argc, &argv);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-
-    if (num_pes != num_gpus) {
-        fprintf(stderr, "Number of processes must match number of GPUs.\n");
-        MPI_Finalize();
-        return EXIT_FAILURE;
-    }
-
-    // Initialize GPU context
-    #if USE_CUDA
-    cudaGetDeviceCount(&num_gpus_per_node);
-    cudaSetDevice((my_rank % num_gpus_per_node));
-    #elif USE_ROCM
-    hipGetDeviceCount(&num_gpus_per_node);
-    hipSetDevice((my_rank % num_gpus_per_node));
-    #endif
-
-    int64_t local_data_size = max_msg_size; // Size of local data
-    int64_t global_data_size = local_data_size * num_gpus; // Size of global data
-
-    if (my_rank == 0) {
-        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
-        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
-    }
-
-    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
-    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
-
-    // Initialize local data
-    initializeData(local_data, local_data_size);
-
-    // Allocate memory on GPU
-    bfloat16 *d_local_data, *d_global_data;
-    #ifdef USE_CUDA
-    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
-    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-    // Copy local data to GPU
-    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
-
-    #elif USE_ROCM
-    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
-    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
-    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
-    #endif
-
-    #ifdef USE_MPI
-    // create 2-byte datatype (send raw, un-interpreted bytes)
-    MPI_Datatype mpi_type_bfloat16;
-    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
-    MPI_Type_commit(&mpi_type_bfloat16);
-
-    #elif defined(USE_NCCL) || defined(USE_RCCL)
-    ncclUniqueId nccl_comm_id;
-    ncclComm_t nccl_comm;
-
-    if (my_rank == 0) {
-        /* Generates an Id to be used in ncclCommInitRank. */
-        ncclGetUniqueId(&nccl_comm_id);
-    }
-
-    /* distribute nccl_comm_id to all ranks */
-    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
-                        0, MPI_COMM_WORLD));
-
-    /* Create a new NCCL/RCCL communicator */
-    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-    #endif
-
-    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
-    double total_time, start_time;
-    MPI_Request request;
-    MPI_Status status;
-
-    // Print benchmark results
-    if (my_rank == 0) {
-        printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
-        printf("Number of iterations: %d\n", iterations);
-    }
-    fflush(NULL);
-
-    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(bfloat16);
-	// warmup iterations
-	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
-            #ifdef USE_MPI
-	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
-		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
-                
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL) || defined(USE_RCCL)
-            NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
-            #endif
-        
-            #ifdef USE_CUDA
-            cudaDeviceSynchronize();
-            #elif USE_ROCM
-            hipDeviceSynchronize();
-            #endif
-        }
-
-	if(msg_size >= 8388608)
-	    iterations = 20;
-
-        MPI_Barrier(MPI_COMM_WORLD);
-        start_time = MPI_Wtime();
-	for (int i = 0; i < iterations; ++i) {
-            #ifdef USE_MPI
-	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
-		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
-                
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL) || defined(USE_RCCL)
-            NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
-            #endif
-        
-            #ifdef USE_CUDA
-            cudaDeviceSynchronize();
-            #elif USE_ROCM
-            hipDeviceSynchronize();
-            #endif
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-        total_time = MPI_Wtime() - start_time;
-	if (my_rank == 0)
-	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
-    }
-
-    // Cleanup
-    free(local_data);
-    free(global_data);
-    #ifdef USE_CUDA
-    CUDA_CHECK(cudaFree(d_local_data));
-    CUDA_CHECK(cudaFree(d_global_data));
-    #elif USE_ROCM
-    HIP_CHECK(hipFree(d_local_data));
-    HIP_CHECK(hipFree(d_global_data));
-    #endif
-
-    #ifdef defined(USE_NCCL) || defined(USE_RCCL)
-    ncclCommDestroy(nccl_comm);
-    #endif
-
-    MPI_Finalize();
-    return EXIT_SUCCESS;
-}
-
diff --git a/allreduce.cu b/allreduce.cu
deleted file mode 100644
index 111b254..0000000
--- a/allreduce.cu
+++ /dev/null
@@ -1,262 +0,0 @@
-/* \file allreduce.cu
- * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
- * See the top-level LICENSE file for details.
- * 
- * SPDX-License-Identifier: MIT
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <stdint.h>
-
-#ifdef USE_CUDA
-  #include <cuda_bf16.h>
-  #define bfloat16 nv_bfloat16
-#elif USE_ROCM
-  #define __HIP_PLATFORM_AMD__
-  #include <hip/hip_bfloat16.h>
-  #include <hip/hip_runtime.h>
-  #include <hip/hip_runtime_api.h>
-  #define bfloat16 hip_bfloat16
-#endif
-
-#ifdef USE_NCCL
-  #include "nccl.h"
-#elif USE_RCCL
-  #include <rccl/rccl.h> 
-#endif
-
-#define NUM_WARMUP_ITERATIONS		5
-
-#define MPI_CHECK(cmd) do {                         \
-  int64_t e = cmd;                                      \
-  if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%ld'\n",        \
-        __FILE__,__LINE__, e);                      \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define CUDA_CHECK(cmd) do {                        \
-  cudaError_t e = cmd;                              \
-  if(e != cudaSuccess) {                            \
-    printf("CUDA error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, cudaGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define HIP_CHECK(cmd) do {                        \
-  hipError_t e = cmd;                              \
-  if(e != hipSuccess) {                            \
-    printf("HIP error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, hipGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-// NCCL_CHECK is used to validate RCCL functions as well
-#define NCCL_CHECK(cmd) do {                        \
-  ncclResult_t e = cmd;                             \
-  if (e != ncclSuccess) {                           \
-    printf("NCCL error %s:%d %s\n",                 \
-        __FILE__, __LINE__, ncclGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-void initializeData(bfloat16 *data, int64_t size) {
-    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
-        #ifdef USE_CUDA
-        data[i] = __float2bfloat16((float)i);
-        #elif USE_ROCM
-        // ROCm doesn't have a float2bfloat16 method
-        data[i] = (bfloat16) ((float) i);
-        #endif
-    }
-}
-
-void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-    bfloat16* in = (bfloat16*) invec;
-    bfloat16* inout = (bfloat16*) inoutvec;
-    for (int i = 0; i < *len; i++) {
-        #ifdef USE_CUDA
-        inout[i] = __hadd(in[i], inout[i]);
-        #elif USE_ROCM
-        inout[i] = in[i] + inout[i];
-        #endif
-    }
-}
-
-int main(int argc, char *argv[]) {
-    if (argc != 5) {
-        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
-        return EXIT_FAILURE;
-    }
-
-    int num_gpus = atoi(argv[1]);
-    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
-    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
-    int iterations = atoi(argv[4]);
-
-    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
-        fprintf(stderr, "Invalid input parameters.\n");
-        return EXIT_FAILURE;
-    }
-
-    int my_rank, num_pes;
-    int num_gpus_per_node;
-    int msg_count;
-
-    MPI_Init(&argc, &argv);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-
-    if (num_pes != num_gpus) {
-        fprintf(stderr, "Number of processes must match number of GPUs.\n");
-        MPI_Finalize();
-        return EXIT_FAILURE;
-    }
-
-    // Initialize GPU context
-    #if USE_CUDA
-    cudaGetDeviceCount(&num_gpus_per_node);
-    cudaSetDevice((my_rank % num_gpus_per_node));
-    #elif USE_ROCM
-    hipGetDeviceCount(&num_gpus_per_node);
-    hipSetDevice((my_rank % num_gpus_per_node));
-    #endif
-
-    int64_t local_data_size = max_msg_size; // Size of local data
-    int64_t global_data_size = local_data_size; // Size of global data 
-    
-    if (my_rank == 0) {
-        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
-        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
-    }
-
-    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
-    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
-
-    // Initialize local data
-    initializeData(local_data, local_data_size);
-
-    bfloat16 *d_local_data, *d_global_data;
-    #ifdef USE_CUDA
-    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
-    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-    // Copy local data to GPU
-    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
-
-    #elif USE_ROCM
-    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
-    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
-    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
-    #endif
-
-    #ifdef USE_MPI
-    // create 2-byte datatype (send raw, un-interpreted bytes)
-    MPI_Datatype mpi_type_bfloat16;
-    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
-    MPI_Type_commit(&mpi_type_bfloat16);
-
-    // define custom reduce operation for nv_bfloat16 types
-    MPI_Op CUSTOM_SUM;
-    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
-
-    #elif defined(USE_NCCL) || defined(USE_RCCL)
-    ncclUniqueId nccl_comm_id;
-    ncclComm_t nccl_comm;
-
-    if (my_rank == 0) {
-        /* Generates an Id to be used in ncclCommInitRank. */
-        ncclGetUniqueId(&nccl_comm_id);
-    }
-
-    /* distribute nccl_comm_id to all ranks */
-    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
-                        0, MPI_COMM_WORLD));
-
-    /* Create a new NCCL/RCCL communicator */
-    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-    #endif
-
-    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
-    double total_time, start_time;
-    MPI_Request request;
-    MPI_Status status;
-
-    // Print benchmark results
-    if (my_rank == 0) {
-        printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
-        printf("Number of iterations: %d\n", iterations);
-    }
-    fflush(NULL);
-
-    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(bfloat16);
-	// warmup iterations
-	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
-            #ifdef USE_MPI
-            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
-                CUSTOM_SUM, MPI_COMM_WORLD, &request));
-
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL) || defined(USE_RCCL)
-            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
-            #endif
-            
-            #ifdef USE_CUDA
-            cudaDeviceSynchronize();
-            #elif USE_ROCM
-            hipDeviceSynchronize();
-            #endif
-        }
-
-	if(msg_size >= 8388608)
-	    iterations = 20;
-
-        MPI_Barrier(MPI_COMM_WORLD);
-        start_time = MPI_Wtime();
-	for (int i = 0; i < iterations; ++i) {
-            #ifdef USE_MPI
-            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
-                CUSTOM_SUM, MPI_COMM_WORLD, &request));
-
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL) || defined(USE_RCCL)
-            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
-            #endif
-            
-            #ifdef USE_CUDA
-            cudaDeviceSynchronize();
-            #elif USE_ROCM
-            hipDeviceSynchronize();
-            #endif
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-        total_time = MPI_Wtime() - start_time;
-	if (my_rank == 0)
-	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
-    }
-
-    // Cleanup
-    free(local_data);
-    free(global_data);
-    #ifdef USE_CUDA
-    CUDA_CHECK(cudaFree(d_local_data));
-    CUDA_CHECK(cudaFree(d_global_data));
-    #elif USE_ROCM
-    HIP_CHECK(hipFree(d_local_data));
-    HIP_CHECK(hipFree(d_global_data));
-    #endif
-
-    #ifdef defined(USE_NCCL) || defined(USE_RCCL)
-    ncclCommDestroy(nccl_comm);
-    #endif
-
-    MPI_Finalize();
-    return EXIT_SUCCESS;
-}
diff --git a/mpi/Makefile b/mpi/Makefile
deleted file mode 100644
index 28861d4..0000000
--- a/mpi/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
-# See the top-level LICENSE file for details.
-# 
-# SPDX-License-Identifier: MIT
-
-CC = cc
-
-# perlmutter flags
-# INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
-# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
-# frontier flags
-INC = -I${ROCM_PATH}/include
-CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
-LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
-
-all: allgather.x allreduce.x reduce_scatter.x
-
-allgather.x: ../allgather.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
-
-allreduce.x: ../allreduce.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
-
-reduce_scatter.x: ../reduce_scatter.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
-
-clean: 
-	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/mpi/all-gather/allgather.x b/mpi/all-gather/allgather.x
deleted file mode 100755
index 03793882f6c87b1c4fbedcb5062d4db7921d7a3f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 25696
zcmeHP3v^UPny%ZOmjQB{r{H6}V1THk^MJ&FfaxR+9jBua67Yd`I{io*(+~TB!Bs#b
z&a?q#*gcNx%+9dR?yP6lnPGQl9md@OA_|N<55UK8bY^76S2xH*MPPi8{r<XDotrj2
zI=koW*)u1V-1_VP>i_?G+^V`&w{O*YyK{+2QJ8clwwMw3gBco!>bcL@ky}Inq@LBW
zV)(OIDJuY<$1y#=o@qQFX{AZWB`wlfW(Fm_h=`AvG#(@AC7feSQkg@dq&MU=5!C5p
z6LeNmX3=@l%g41%#BUSjlXSPL%Smd~qcV2P(BnHAG#-$2|Ce;U`!bzMJt`LI^$NXS
zp(p7!QEo|Py)1hx?xphOO7Do&8>goy>!nrbJtfNflttqLk~(#9CaJ8$8=*(}{Bs@Y
z)3r(DXOp;pl4`=Bq%ysVo<L{qyow%gc~2me=q>N9t1YjcXN`re)!ZIspUSlK`gU%?
z3|}O{`CK09h+8{MWFk0i7@{*XK<LVLbpS@fm*&7-z(-0qkON<n17DW||7s5W@f`S5
zIq-oT`2HOD!5sMOIq=gt@K18!#kgT3)!X<S_?L3v)j9Bn9C&jMyfp{jo&)#hz=JvP
zwK?#<9Qd{z_>(#C<EZEH>>~E0TSJI3Ii8pyPxx-PMsNoIDsT&%#>(6p!5Q}m$1h^_
zZY_gR;2Q$p@79PlI1~D-f<CHXOgH}NxQrvJ{~2);^yp8<DQr8^(O_L42Nd=*!Bc|K
z*##J-&1?V|xx__*3uttBQSgLOaJrxufujH12e6p1vP(BzQuluUB@OnM-#vWiO_%(~
z8(Z&TE*C~5k1G~;N8_%bI}ifW6%2<2;sP7>bp>K^U)1G~x`RGfAmk4N_4K-3{y@mx
z6S&(4*6SNdtUC~C@dZ7RwG5%<zIc;wO~B(LW=Yh?X^Xq3C+s0)sf=z6Cqi+7*rU;K
zba^})2z3#n>n~p$@^nYTAxb$GkH*72tfjTt)f@`Md7v>I47#H3&>cfuOqcbB6LIE`
zz+BwVf_l=iIPxB4{=1@qxQ}Zrk;NjbZEGVwmnR&G2f7mBM2rgvEEq(B!kFLRlZbUQ
zKNX8-2FAPG@ory~vn$;qZw>BfG$6rvAn0ShUPQTkUU%G$dhCqF^r{CG@>20ao?t}J
zYb4Hav)n;1YC?Z#)KI0k=HcN%caa_=wQQ-gxuMZjZLPM}u!?XbUg3$l*H%P)6&{bT
zqN>(fX|3czJbGO8C<=5gsEFNF;YWqWk+X_e#O=AG+!tCCtLSvcd>Iqvu^wNns;2yI
zUuZ5FD6gvFw!H3WtlJ$bkK(SOJmqt(^Q@KS@mgO`bxm*dt}cI9&Fb2ks5jajjm^Dt
zb!1+6bu1Rq)6Jc|a!-#t)KxK*Nf9hhgzgB1?+S_MXICh(Ad|z-7V?W*p$d$8JUvv0
zRm6y55noCC-I<tRB;cYt<p8gFLb~ETuI`vSBk$#=cyw<h=nG~d?o7D7L9~nv5Da@0
zs9J80Qt}5fK}tr-@?7~dQHe~*6H7#NhOw!34Lx0-dl`lWrWtVKe4@;N(_BNkN&}uJ
zLC`t_PV-6W<hV;1jWyEAahGu8yrQl{<06Idjp^1KaAUd-18z*W)qor4aW@%oN<+F1
z11_Ht6xVIQF{ox-#DEuPP|Rx$_!tAe!GMo7;9oW1V(Q9cw;1pegZyI#TujY*>^1`~
zrn($|(twXQ*x7Etjq{Bi2At-I((N|jlOzbb-+)gx;0Fx&6a#+PfM009j~MW&20UfJ
zry1~52ArN}(w#Hl7fX<+TO*&*J(p`I;)~oG5sqKVL%L^iYdk>mVkyJ<scwx^!sXJ2
z0L`rt;db;8XEPqn1V(a-00##*tT$d#TK0j#^^6%W*#~5sCp*S#IHzRhH?nQZJ~7k>
zHXwzeYj6<h8LsOS!A@UP3;VT5H0<%kV!oIbOvK_EMj=nP7Rp2~ZEcTPi&+aq++9A6
zp064nxm~^m+RPrWR!)8~TjCD%_`C}=JP|cKWW!NyW^BPsuQnU8vy0hKYGe+YP-zaW
z!JNXY1wxTTT#LAA&Vk7XC1$l&WS**4Pl8cGGp)6LE#|`m+8Zlo?UWhViD?UwRaaLe
z5nHS+3<g53V64kUnIzWTYh>}%kGkW5a45F8m@VxIce;BtdKmLE@zNACXV^0W>S%5i
zrP7N<2^$MFWLYm-i_Pi0q0!aSVrzA^*luzyVghMf*4*N1b1btjciiA?0^YdHb~8jE
z-rDB6Vabw3)O?hS&DS_U>E4>qnZ2MDpuM0$(8oZZ1Kk1IhEDo(&>d*(-DofqF2h_9
zbSvlqP}#SYyO%Mg*P>iJwy<ama2ivn-#m&zq#Tqr6k3R{#Pvg9iwU!pSROPrjw@IT
zC9o9Vit87MUj~|qH??p9vKhgZhrB%?8a(Co0bCo95ic2m&4!Yx4JBGbNm)ZlrLCl{
zsbss-SGcKQbN<75kC-1cnJ4C#)Y(cZxd6l<Y_XLT*&w#AWQSre*{w7qy!Ouwn>HF0
zkP*E+8tTG<g4Kg&%45Dpp#n7lBK&^TZ5Jstl~^7zAwv(F?IqggJY;ZFzN4h_!2(A~
zU0-2KNqv-O+(so68E+^l;`e~=$uqDc`#G62x-G+;e@oTz=H$WK9mzj9`rbU%+UDGx
z-;2kIWAnK8@U(GkUNi>-$>8M0c+{S5cO>6ZpU%Gpw@iI{w`1e(MA_c_FN0GY{rPhs
z0VqDjvAO0ZU{FmT9~?~ez}hio%WZpv_B}(|3-{&+h4z{0y0!{S&n`qbxi8f~`RLDo
zBNMfG(bvi10%`H*<Q#qHP3rw?(Fz>NQ;vfv@RnF%)z8wGb7;WP_xp2xHJsmcY;X{C
z+$Tplo&1tcFMSHM>X2GCb1np$)Ur$enIf9h@JthF*wvGV{BTzFGHP9n`5o%I-`Ulb
zyFYNMSMK$zkz*$aI-#yS{((cik~6QUE6)&f2GJjJ<h;6akdX5v`%9>wA@ws&>ZIq@
zNoOIJ{;ZX~8J#9Ik~-mcA{A&JC(RR0>Xg0E{?MjQ`V`@_Hnr@G+IAWwrMA86SJ%*m
z0-rnydK&bs9Z`Op8hzOUq4Rz<`fCdnbJ53X6w{`@MrD2NB&5z0JnlJy?dthIf;&G5
zm2(gR2k%odg{m57K~>q+wlw6{9EIGm<B&UHBg96CdJIh2DH8ohn6s%X-lLR0h3M-i
zNd6?E&v3YjU_xyfwCw}RdXw6QYJB}eihyWY3g(Yd`4I6uz}JscObX?B%L4Chcz2xm
zKwUp)rK9iJBE-G)8}eQv@8{$_=a_#`-B@#eaL_S-f4q_IByP-rUtRZX@|Cl%oZU-L
zv&1<6#zXNb_q{-m!(+wjGk>t$cZeRNel_$gi4GjK_>+@va10I{+p)*kKH6<7Z7Xc;
zwzlPtzIS^a$q$nEl+JS`<E5>AgZb*l_2^EH{=X`!I`a(8{!+)`Hz)T2vPkGv4m}TO
zs)Y7&XfGhmnfzFy{{qyJY%10IJ~pYFXx3}rcwBw>EE>Lj;~Dkg6BL;Lq574#5a_$7
zRAcJHuTrnrOj+8DETz_w353S(LS~v$>jhh>vm3z9UsYPEZcIU~C3(7e;7zT?bDj*$
zNtqDa-&8vGZ_k2D{yOz6rn$-AHmg58iL}PD#5_m;qDL1n=1hL%7&tm<=WL6{s&=QY
zp)~pz-GdlM@+gn7=rPj~<4g{wuHf|dj)C-a^?@Tu8KC3f$>xD~OB2+yAEGiO|0sL*
z>UYpm(PC%lbn-`UbN<pppuA__j0o!42PuMjb{R=h&sHz{%j?fqQ7J#$pHIKV%ls$%
zb3IhQV1IrY=Kk;cvmgDr^>h34v9P0m+`>A>c5PKPwv%S}&){juQYlPUcTgz(SMbsJ
zE`*Syf8x$xK@Lsw$xQsMJbv8St6{`KMk+E!z5}7u4>FQfBsu?A|H}cy!3PlP#C6oE
zEY9Tl)Z>_9Z``eJe2j()fKwgG6RC{?{01LCAnE9jm)4<c{e;Susuj_HgXk8|htADC
zrA52YP1w$H3UfM>M^kyoxifh(H5OFwpH;iop@hlbp#lm4?y3jpOrC*FARofuuI1>L
zl#T}AO#oGcspB9m^Utqqa`fL*T9-NjmO_@)OIT1)uCEjsoSJ%8AeBH;2~ywf=zFgB
zR@*AuZMNHOuG{uFH+xH0Igr<mV-w}`wM9A8oP51`{_ocRT}$%A=7Yz%`Tn-jB1gZo
zbZS%bggu$EB@f!#x>3BQ)IF4gzUM5Ib_E|R-l6)U^a!G#$x^+@$>*4<F_=lAN}Wk=
z#wyID0i~)w-^>XT5xM#MAv5oYs(6CT9L3b^b8_=f1~az|nb{`HP%BM+=kv{U7|cxM
zW<D0>S|KdauLG%@VabUnl%x5+^LY3%C*ATLG@)-&TTa=NXZ5E+HFQ#o;Q7^8no8?Q
zL&I}s-HpTMN1s!&Ur?@j1tnIAyu1Jib4hpVAR3Bo@K=fd)YE$wc#7nCbEQl<)Q!LW
zba3!}brsToU%k><s=cq?KCtVjdduCeq!vRw{S(MIJo|UKr~`i=-ZL+vEXNiM$6h9E
z)^O~ngiRQZQD&(9BXhe@TP$@Q(!lg|A_0rgL-ZsbCe_=4e~cDqz#k!e3Gg?D<68)~
z0{^e!_;JFg0ROk)_zB>S<Usn)!NEbAN@I3WnY{O1M>285k!(M_&;AJn*gpF?ot)Om
zDI$)(dk-`9fkOzUe@uM8{jj4Sb9kD@_nm8u2l~#ns2eFqefOSbi7SP`Gvk5m4ALNg
z{CH4a+@^8+?5C+3`s|-5$2@)ao??mHq&&Ts*|}cG&(-DY_t{Tn<mdL?dyXY4r98L)
zbs=xo<tL&D8To=O$MUvd_2B5UA1>;<_Y>24jC(xuSZb&n=b|pS#jAL})tyM6?fL|0
z>Uq>2&zyxoZ5Ob?)OLVdw)|`lUpNSF*m4LAP2ZZ6zor?SbMwrHvBGS3Y%V9d+?M>b
z8RO#>RQE-Wfltk;<*@D8cs#CEy(Y&m44)W8o02DOgOh&a=-Z<>=KnTv6ivS9*4yOB
zhY4e5J&~RM2Ycv-!G3@KL3BY{Qke!1kp3PW=OHSn?gnfA70;-_nLmQBURI9~@iaEF
z^bsoc%<mCTRhmA)`6q}+O|bNC&Oc5(DxamdbABuFXyYusjq?u?k8aP>TR49|@fa6b
zdIRV0CLRwEmX2^fN<6K=(;b{&O+38<O}BEslXy(=Sh}9`w-8USJkyn&Uq(E=>P%~#
zzm9mU`B>V*`3B-kh-aK%K)g!)DNO1IXI2w$A^r&GuOyzDTKWLzXAqD3iSl#)BI4<l
zWqLd3ClF7gUwRwoi;1UKl<6&;HxW;-Ces@@|B)3uHT`si^Y0TsmG}<MA0?h%L8e<d
z{|Dk}a+0p+{Hw&%E5~#t=MNE2uRzio=U*TmD|(b4{A)8b1R0xlxlX-SovH-Fy-s%;
zP7wb4pOG)7I}In}FU;a+XYo~8d}9{BGK=rb;v-r7eOdh1viNUi@!!wlf1Je+WbyR-
zCOP9fW#XxUk~>hWaY_x8+><(C@*O?hIC6AD$@PxWIHe9l?tlo0WSQ2Xahw_qIhhW1
z5OUj18mID|zO+vrin~IM6Y3D;eklxpcz!TU{DA_E2-AJM_iE5X(+Px=<&f{V<##ml
z9kz_821l-2=u-nDC*$RB3r&?8$Em}STP65YgL;1IMSjo}dHyyD9zBoqvYvG9QGc`s
zMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60N
zV6+BCYhbhn{xWKyNc=9iy+Y#!suaJY$^GitrF>AtKPu>V1$|o3y@I|X=-Yyx5wxI6
zPiKmtvjnXbbcvuV1oaELM$iWZeN@oz3i`C5dj)+((6<FWBWOXjNMF!dg4PPUM9>w2
zHew^1#<%2YYp@vuyCpPzE1{;9)wq2%)s=Pgz4Pa~E8W#q^E$8g&Gq}eHGa3dwytts
z<-D4@T3=1w75Hu`5W2$`U7&GuPG^%a!j?8RF3`%b>HP|=s?s{wTB%i6R#!o_PAlWG
z%^{EV3L&<<eYstOr9|%)+SOKULNwf*ISA{&Y3jd}%JU7bF~}H$^t-Bw+2PAyR*kWY
zE#&bJWa8!Tt@J?xIr)1leSk5N)ARFPMCtZr``S8^Ffcv6+ff8veuZc+^7rSCEI!+e
z4S*=WLUFSE@;ij55$VPC?}$}4m>xk;k!M{<8Ag?r$G7nnDIY>&>^FqgEo6N<Jmtdq
z@Gmg8mS~{Z!k3qiEwwDKnwT%)GO;%JMaq2yz|9gC^mKoO)bp4hc#%>Me=L1NMA^!&
zo;MI_D#|w_%6uC<)8Qr{dGk12NZ)AX>nTHMEX7;$E&?gI0|ImvvZA6pK`(d^Uh$X7
z<00}8wTb(5FHA}SHKQ@~MVeC9LBS_rNV$>+=_>+d4i8!orC2FgZ~z`%l*lWfW5>`3
zgG%M|2#?uA!72*!pXL?UVsYnWOHsaALSHT_jXTAV(*;>_$~T_^u3k%NHd2<!sgH7&
z^6ez{C9X>BW4J7I$CMS<!haLj`1gd#nPA9$3(^w;Oj$`4R|sa}IFXx4O1tS^Hc4qS
zt%HA~X*~ocOLfWzn^FUhF6C-*XV(G7orPZyxCt2>YEjB~#ZoX{nV^_UE-sl_G7;_8
zWM$@(YsV5uR>muU$56BdA8nZCA#%Z(YsNHBymp!iUt5?<*_bhl$C%Ol^JcN;iCl$Q
z3fc>XEE<i#Y#}!!^h#i2O2%f!NdA!70!3+_sK9XHbQmvEksc*;!DMCjG=husd9q`S
zIa`SDHO5Y6vq8pL*|pR3JkOp+sn<{adIRtht1<#k2G#sbeoQE`WymnG<Hw*%CKQt3
z1jH2Sl|eBRllLo#!)C3A?#I5usb6bA{Y(bP#w%42pF%lVI4xA7=Vrv3S^?C=cXBDp
z$0fx~fg?d|rHieNd3}+X*jLT4)f`6vSeNUc8eL$LBZqmua@uUU>RRoZ>!weS#>|#G
zeI}e&pj-|!W=0=6k>zn2%ivi6JMpa^L@{<dE)%0+@N+ZZiN*s%>C2JUJiwIZ=z=*W
z9Fw513rupvHNVIdQy#F#VcX1Z5-AMJNJSzR6(_EAc|39lc`On0>D$LEuC54#JUt1o
zk1=15zkDd)I26F^^IN-RZOBe)-oq60g#G4F+&n?!e%u^dXr6H0(pAiqXMT&keOpas
z)6B(Io?|{_e&O<mhRQETe)DX_!7ChPm!rS=DuId98?XbM&SD7Q`~WI~3ug?t*YfiN
zT+KJ&H~}v<K%&rI=IM4v8JA)FL<Br>wZ^I{{V}SQ2jhv5Z&8;o<ckJ8VBJwq_agm>
z8m+vG|KCupmBZj7Y`lkFxgWG{J}M5!+4#T)_5GmbJ-$#^yn9h4phW1dK*(DjcXwiw
zJ9u|Y9N&P`6tJ6rcqAGj4j;foBi3Dx9jI}ihjwOi!_h8ZIktb6W2+~mLUFPc#ZeI`
zTo<w$?*^}{3&n60ImM|4$hbd3+gB+Elpk&rCmh71YZq13&Z(M5_F<qLTU0|~FrhFs
zDI*z609%_I;b-APDa3nX<=DyZ>sds%AmHvP_x1WbxFd99#2tZhG)`yxXtw2AwRK(<
z<}Sko=*S10ybz(z7!P>Lqdpvtfu3Zz&t%=$SbL~|^zh|@;T(>;K#^(R?C{QxBO!QK
zb4R;8Bgaxhq!!|evx@F;&{yH^Oaywo712mg4>YtlJDV!Ha9n|>yxKaqsse40iIXc}
zc8weZT&_sem(?ggn;%{=NOD;F)oY8FC=rSUx<WqM6|C3aXrujNjdtW#$y>7>wLSda
z=oOBEbjZ;syW}JyM`bxW$<bJjOj1r#U3*Ah>fI{lA5zaJc&R5TO}EI^bDy3q?J^k2
zK)1<YLO^-!1Eu}jg2|B2+G?v8kx@YvcDoE3ZMA&?TQ^Eu@r*5W9^Dyj-S`<>HX+rM
zQKYB#FegPE0yVM9Z_BV>U<zA4Oy^HBGzrLK?`7_*6sA?jNI0e=BjPiFD{L$~XVDNE
za(sr4sjpm9EEZ8j6`aRZ@76d%`5<>aF2Xz98bU+&`(}hO-(m7QS;`-|4qPVKm)}({
zBM@#iF0!-Dtsz8V-SzVDgv|G}89NYAOjDR#|4V#5<VUIpS}}}-Kg#V-%zW=F?L3x4
z{yRBv`aWAR4b`VyEj&f`F)6fkXb25CKASU5%Y4@?V`w`EGY!?7#Fq$Mepf8<B*!PQ
z-UdxVoVKBUOr{bh`@M{$ZB8bWnpyu!JP11@<?{syOu+rhUIo1>aJdF87lwW>a5?Xi
zi=+1h&Sq&`lAj%b&>37Bn}UX+<2(}Hg~;c4oYp88s<J+30iPg+473ioWdu8R;F>{B
z2I;&r>I2!XF9*Ig2mU?aBjtxy$|K<eIdJ-(ekA!9bKq~~z|ZEuX^~C$F><-ai~In1
za<rFkq;zX@;Iz6ONxmfqek<^i&Ufp~A-_5Yey6ar+pSTyGjDx*bI7mHfz!E<BjtZv
z4xIKfjU@l$9Qe*0_?{g2p&a;YIq-LK;P2(Y&j26kyu432J~W>5N`-d;#sLe);r(t+
zLY$TWr}iWF!PKLjk-MDZdf4b+l>=v1%mjVW2(yN;NU+)(nz6v>Vpi|kP;70GS+Oj!
ziWil`7*|whyRpE~rQ<*_&32Qj2t|702!;8HY*t^lIQOpG3rWc_YuwihDLxyw^0k+h
zs|`(c8He?mKeG;-Y4McZ!2l-TU13<!Qwj$ot}xD#3%hAaW2N;;z!#%gnq0iN<l+Tu
zJ3W&0^@MS79J6|G@*SdJ(bt*iLfUj94{)93C-=#y&d#WBjf|nyn+#*kB!k1%>6|_*
zY8NXx&~7?;kB4+BC;V9}7pycoaeiQkFJkNt*OFzn7Q4&Yyu8gtXA646`f604_+!<C
zHHR0+*|}Wd&ed4_;Vd>iN|zOjTo55C*H3KN!9by~D;RV~kY>cVR6<^`zyfL=xvE_}
zr$~ubHLjsWwLUL4!2GWB!xHJv$@BV_I&WyOIqA5b<@PpLo2|iVM-hiFz=aWRQ69Hw
zw2^i8B2P#z?Y*(E3kU6av7-Bbekh~sma2J_b^U%ep^jRss_CS{PHNrps@k5v^m7|s
zF3|>E{%{mGx`#SX_F;}AtagV%^ftid8s2H_*EgZ#<Z8fNqv_`BZ7t1>#$oh-9H!x!
zE)4VB$@8fr0+9RCL^PL`VWPB8uMn5i-(=Fj4V{~;a~$HHoS#XA)*WMTNqxCLkDjOG
zw&3|dj?$of%6T{5lyY|&e5^skQop`H<9K}mCz<o0g_%q`Ql4I1(%w1A%l&hkgnqql
zCR4`j{L_p1GX{ORKk$IiuN3++eVKm&^IxVYPNvTaH9d+73@7&=>YQE&GEB4u7v)Ln
z%l(uG#0JJENeC`GeOZ4i5l8wBBm^h-SN76PfumnW$dNu?f`6v|-N5L+RO2Ei=i#k*
zPfzY<nK7NkoME^NfHk&1d7igC=i514oafaD-Ib*;*Qw`3Lfgo=!DZL~eTX9wal10V
zr)?9HxhMFqJ3|Jt^Z%7BeYuWm(sUha-$s|+el{c0n7(}9vP#pnsDBw<w*Fru(x@L1
z?^z<6uEjU|8w_OUpVqH*t7Q9->+p^lx>9z1NnX-_fS78~m+Q{%FQ~sAx|AoGzT8h6
z`GWeppiB2o>dW=5JePi(Xn&$RW!kT-hyBo{`jh(deATJqr?oP~8J#S*#D0N*Q9rXj
zt=A=FI#NDc|HUkQx&F5ZeW@oYrDb%-z!26az-)b)zHG+X=@&uSn18K9<Ce5~-5|vq
zUA8{`d_|b7zk0XE#mj7*(D~HEoHmTZZ(Pu)Br3CTssF7koMg#KdYRDI(INkUO@yVO
zU}b$sNoy|s23=x<Xg^Xu%N$#fL%-FoYtWgCMwi7hM-KfXOLQ%Het@)*txxZ72+OX&
zT@GENEo;D#`jWbYzO*l?)}n_tNI=l+`lI(F(w0$lo6ujJ6`94dUI1BD%*`C37lv?$
w#lTG6@nT%E{mJLo2I$MSNAl9`XxBsZKE~*>SoTm3{p;w#2)8xMc((q30~vNPh5!Hn

diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh
deleted file mode 100644
index 4e8c955..0000000
--- a/mpi/all-gather/frontier/128_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 15:00
-#SBATCH -N 16
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 16))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh
deleted file mode 100644
index bb2429f..0000000
--- a/mpi/all-gather/frontier/16_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 128))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh
deleted file mode 100644
index e630b97..0000000
--- a/mpi/all-gather/frontier/32_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 15:00
-#SBATCH -N 4
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 64))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh
deleted file mode 100644
index e7c707f..0000000
--- a/mpi/all-gather/frontier/64_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 15:00
-#SBATCH -N 8
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh
deleted file mode 100644
index 563f933..0000000
--- a/mpi/all-gather/frontier/8_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 10:00
-#SBATCH -N 1
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt
deleted file mode 100644
index 824b380..0000000
--- a/mpi/all-gather/frontier/benchmarks/128_gcd.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10
-  0: Local data size: 16
-  0: Global data size: 2048
-  0: Number of GPUs: 128
-  0: Message size range: 262144 - 16777216
-  0: Number of iterations: 10
-  0: 262144 0.003748 seconds
-  0: 524288 0.005048 seconds
-  0: 1048576 0.008068 seconds
-  0: 2097152 0.014084 seconds
-  0: 4194304 0.026981 seconds
-  0: 8388608 0.051879 seconds
-  0: 16777216 0.255600 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt
deleted file mode 100644
index 35a9e26..0000000
--- a/mpi/all-gather/frontier/benchmarks/16_gcd.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10
- 0: Local data size: 128
- 0: Global data size: 2048
- 0: Number of GPUs: 16
- 0: Message size range: 2097152 - 134217728
- 0: Number of iterations: 10
- 0: 2097152 0.002249 seconds
- 0: 4194304 0.003148 seconds
- 0: 8388608 0.006062 seconds
- 0: 16777216 0.011871 seconds
- 0: 33554432 0.023485 seconds
- 0: 67108864 0.046822 seconds
- 0: 134217728 0.139763 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt
deleted file mode 100644
index f758360..0000000
--- a/mpi/all-gather/frontier/benchmarks/32_gcd.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10
- 0: Local data size: 64
- 0: Global data size: 2048
- 0: Number of GPUs: 32
- 0: Message size range: 262144 - 67108864
- 0: Number of iterations: 10
- 0: 262144 0.000783 seconds
- 0: 524288 0.001513 seconds
- 0: 1048576 0.002953 seconds
- 0: 2097152 0.003404 seconds
- 0: 4194304 0.006485 seconds
- 0: 8388608 0.012489 seconds
- 0: 16777216 0.024484 seconds
- 0: 33554432 0.048460 seconds
- 0: 67108864 0.185884 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt
deleted file mode 100644
index 3eed822..0000000
--- a/mpi/all-gather/frontier/benchmarks/64_gcd.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10
- 0: Local data size: 32
- 0: Global data size: 2048
- 0: Number of GPUs: 64
- 0: Message size range: 262144 - 33554432
- 0: Number of iterations: 10
- 0: 262144 0.001685 seconds
- 0: 524288 0.003350 seconds
- 0: 1048576 0.003938 seconds
- 0: 2097152 0.006864 seconds
- 0: 4194304 0.013037 seconds
- 0: 8388608 0.025167 seconds
- 0: 16777216 0.049414 seconds
- 0: 33554432 0.211224 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt
deleted file mode 100644
index 7856a16..0000000
--- a/mpi/all-gather/frontier/benchmarks/8_gcd.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10
-0: Local data size: 256
-0: Global data size: 2048
-0: Number of GPUs: 8
-0: Message size range: 2097152 - 268435456
-0: Number of iterations: 10
-0: 2097152 0.000505 seconds
-0: 4194304 0.000856 seconds
-0: 8388608 0.001645 seconds
-0: 16777216 0.003223 seconds
-0: 33554432 0.006379 seconds
-0: 67108864 0.012691 seconds
-0: 134217728 0.025316 seconds
-0: 268435456 0.053944 seconds
diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh
deleted file mode 100644
index 710a399..0000000
--- a/mpi/all-gather/perlmutter/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 16))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh
deleted file mode 100644
index d4d984e..0000000
--- a/mpi/all-gather/perlmutter/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 128))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh
deleted file mode 100644
index d2f1b0d..0000000
--- a/mpi/all-gather/perlmutter/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 64))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh
deleted file mode 100644
index 515d667..0000000
--- a/mpi/all-gather/perlmutter/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh
deleted file mode 100644
index 210ea3d..0000000
--- a/mpi/all-gather/perlmutter/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
deleted file mode 100644
index 3787302..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 16
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 262144 - 16777216
-Number of iterations: 10
-262144 0.003218 seconds
-524288 0.005101 seconds
-1048576 0.008701 seconds
-2097152 0.015526 seconds
-4194304 0.030239 seconds
-8388608 0.060280 seconds
-16777216 0.189415 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
deleted file mode 100644
index b69654b..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 128
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 2097152 - 134217728
-Number of iterations: 10
-2097152 0.002391 seconds
-4194304 0.003558 seconds
-8388608 0.007162 seconds
-16777216 0.014929 seconds
-33554432 0.030427 seconds
-67108864 0.062092 seconds
-134217728 0.151508 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
deleted file mode 100644
index 0e15475..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 64
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 262144 - 67108864
-Number of iterations: 10
-262144 0.000730 seconds
-524288 0.001367 seconds
-1048576 0.002650 seconds
-2097152 0.003740 seconds
-4194304 0.007503 seconds
-8388608 0.014208 seconds
-16777216 0.029923 seconds
-33554432 0.061970 seconds
-67108864 0.168545 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
deleted file mode 100644
index ed700b9..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 32
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 262144 - 33554432
-Number of iterations: 10
-262144 0.001561 seconds
-524288 0.002915 seconds
-1048576 0.004163 seconds
-2097152 0.007885 seconds
-4194304 0.014989 seconds
-8388608 0.029413 seconds
-16777216 0.063034 seconds
-33554432 0.183096 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
deleted file mode 100644
index de3a837..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 256
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 2097152 - 268435456
-Number of iterations: 10
-2097152 0.000838 seconds
-4194304 0.001719 seconds
-8388608 0.003172 seconds
-16777216 0.006797 seconds
-33554432 0.013860 seconds
-67108864 0.027938 seconds
-134217728 0.055353 seconds
-268435456 0.104310 seconds
diff --git a/mpi/all-reduce/allreduce.x b/mpi/all-reduce/allreduce.x
deleted file mode 100755
index 283e31cfd4ec10983f8a8159c5c70bb949af53dd..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 25832
zcmeHP3v^t?d7itg7qX?he&E=~#Fro&@z~W$wv23S(CT5Y(iJF{j3EK9ulFG>-WU5Y
zHUVQJOtJ`rLz`DiTbHIOCnrtaoRm_UgXI{E$w`PIVBEZL5+J(>KfuPO33&Vcb7yw1
z*4jc(&gtoCHTIo<{%`*OpPB#8ojY?^Gxv0RR$3H=$zWmEGUASZO5;#Fj~N^KoJfGQ
zvo=<Xcp+<GRp2W)X5_asjVB~^S`1v$8iQq4P|}Nw{J2HqIg-xf9AlEo8WJVFV$ej=
zphu<~tfb6l@T6CX>!`>-D%vNhQ^-kb)}uDsE;a1fIyIh<wDFS${-mH%kD5h#jY6+c
z=t;U;v|Cc?m(_2i{M5d3<sB7zC#M?aO2708y?N6NeBLyTCnOy(U?!>b;b)*n_5AM!
z(x>Y|QJ)9J{gYG^1|^l{HHAYPmMm@x`|aUSG`-orxowGk$zn$`=4j#e=r&NBR$ae_
zTd*P)MR2}cL^|TujuV*yP8)|9%%?!;Dh+J_Cc;;h!M(sIDmPRH-&6+QS_c1O8T`-7
z;NLET?<<2JEQ22^ga3OO{C8#W56a*bxM36dZF(7eP8ob*8GKn8{L^Ldo66wcGI+2I
zzOf8`XBm9B4E}H#{Oe`#C(7VYm%-n|eVNAQvd47|DaPdZGX?U5kLnu11^lmo+t?h|
zrfUQj!ryUxE^F7d0!E2{68J$~Bi0lwc)k@p(+b`J=6@EBQJ#@&q)K^`GX53w51=d!
zB^YCe!d??R6`HuLN@XbLRbUj-b3+%<h>{aLkxOwAor??4+jcOugf(9L;JmhXN9J{O
zzw-8x+i#lpZ@+!yF6Q-O1oL^5DLs+$M)Xh=$Y3NE6^IvXA}|<ArUD6XFrh~R-cU3c
z1M1tXdxN2<9uD0Z0P7D-BsLU^_XZ-q_!fp#e<0Nr*c9>wh*_BkaN4Vf!!aKrt7LX(
zEFDb=q&ty_CHhl|P;`(GL%)AZ)HjrfMXBUuDv^qX1qrj>z8-H+G?e0r&R8VkP3X~$
zMJ{RR_+#l53&vqK6=V^k3dt1OkYK?(5}{OpYpj&bBdY^j;sLKO7EOf))3J1t3kWO{
z5e425_xch69qPh#Fc?lJhggsr%d3W>5&?f2J|K3jE)+ZTL?RSO7+@+C39!IsWO)OA
zJ*C6D8<I)mh5(BCskKpGByQ9^o?^I#dc+U^7_XjIa+P;6UO{wk>1DKpt@8AAbb4DH
zEsj>!6pN>td<lI^Q#{b*^97okmpGgbCr{!9=A{=}Xv4Cm<Q+{xxH*NIH6`P^Z=*dB
z-IQ$FpeF+b6ZT{{kZf+X-x-K5A_Mm3R&L9$Cz3;Y)Ske-M0@Ou9E%-JdumA_+|s%^
zamQeAu=TbjtqFf(D3M%r`)%>XvD=c#xKVET>e+o^Jv!J_tfWZV)6tF5*d0;v79NbI
zmlbMwej`7=6>?z0=L?e$n~4$4BHl^-orRo8JcOICZ{Yy<JRyUruy-h_7v%li6wlrq
zj|3uxjN8+?Kcc6G3P2?0Ps6p`9F-Id6_Qkpl;yPw7P8WXlrNc%8w?(OcCcF`!sl>e
zsNnn|6K<ZD95LZmE@?zZO?ZU~&zf+Wm&zc=W5Q?*l3`TWc#`l2o-(4g^%_r9BQ}@Y
zZo<vwx=pyb+&&X-o)6w+!l?`y)|+to?x4IO6OMtl5aK4hwtzCW*@RCq;oD5OnCkM}
zFPiW=ll(3dUT?y`X2S6zD1_Z6oZcHUJZ{3pRGMe)G2!NU%abOY=9)5$n(&zt1U+cN
zFErtYO!zDle#C@-!h|0+;j>M6)`ZV7;U`Tvz2{_j--KTzL87`wF{67f*I>lg=o%4@
zU(8cRWYaaCAbGhIB4D<z5#jRldCG`1UE>LoH&UEsjk>`Reu)9`*r{ur5^kQK(_CSq
zpa?LZ-*%7rlG5^7$=}16`I66yd^X6A`5MnDh53<u2IR9t^)$jZlu*3#d6Z|mu1`lc
z1QJ>-sKpa8Um%$bB(+F7nbI&``G&M;A$wKdnxvza^+H4+3~2OT*6^wx3@p<Yg#DVG
z;##&+4}}B%Wg1?V8eYz^gtj2LY=K|947r!pvSMju4w_KuiEhH2#IJ>-@pMXy>ojM<
z<b?`zIGPIYUxzQvsGtRoB|$A2z>D3VtYvGcGO&}>mZPfP!FW1(t+qT8ih3i-K`&L3
zSbejZ#cUy=r$Vu4^4eOqDjeIOhc$X_^EUC;)G|-ZHv#JI=@YFonneYh8&$MyG_BX=
z@!Zhq?d^5-d3#+qc~>xj46N?y^$xgKclWz*@N@z1T<y9UA`tHz@ZPX;<qEoY%0<>|
z`ao$MdkKR<7&H&M4Ri$tf!&~g209AbfQR{I;z6^Zui)WmL&s^E&ls&Oz6-h=RE`D8
zovWF$*`{1HwYp{(a2kW?Ia>$&c2L$)Z6n@^>keSo66UJ6-EZlvtJ(r3u#`{r`!e!Z
zgBJ2Fy<C87#&P`!`S*#APuP99+^9$ll!6D>*KmCUn2QXfoR0e09rapAePc(x(^cQL
zs(w@%u70p;XXQf`4_oiISZ7q$x4G(_TmWJawz=wSTo5zrMLzg4D%cLbP%rB>3RSEN
zPz~^x;G;pLSy#R7VGE2rWbLlkc2>aXgO%=j=lxag`nKWf-um_gFU&=?5ruWs*Kl9<
z!A>m(9QoXmIdj-GE(CWqAM44yxXzvVjeGd@lYIl8os|di8gcKeb8)ibQ49?E3%`RQ
z=JXnO<_-0U%DHg8`oySv$7s6oK;;y0ihHE;QAhwv&2sN-oej*rbHydc^ZD%eVP;I(
zb?bhiy}YQs{6M8$Xa|M13QPAchj!+<>?2fxk;?u;*3K0_IR=XlNQ*zB;2u6_QSaSi
zA)S-%7qj4P$?E3kbC+^xpL_V%?+4XbWoj&+2d%rxpchsc^x|K<mCrZ7q&6<_0q9a2
zFa9}zV3!(O&_WX3>WP<v2o7~V^%BN{ZguOgy4AI#r#$Kv2ZCyR>^MQk)wRb?xz#H;
zb68z_hL|(R{yj&|scZ9uoFmy6pnitb&vdCXpH*j`g;?%<D+dZXU1~ggJm^6w&^$((
z$Gg;72cZ3)OP%>4(q~<2;~90}G)Pt*csr<Wq6-Z^aRT%-=-F;$1zl?56&r-k1=Yl>
zHfrXIE2+IM_2<;qpPzu#IfCm>=XmSS{Rg;nd8oV(A#jL3BvWuzT{~2})qxx|H@yY9
zv15=s?jpoRh&l$Q@g#}90&_0)#&@Ws4<Y*6agslQ>@ythBA8HD0UbC+Rqs*<;KtY9
zqYQ{PW?_De+J}s10ls#Oa<XX88#Y95BHDQTlzPuqYu&>?s6pP#zo6)4ihe}VGw!7?
zsykZG<@4^P2UDGNCvju;1=X!T$Q(X<`0N3CJEiM_J6=l7+Wtd&y^YnXPyNQW{Uv%W
z2G!^fNOa#@wqR!F4etEDu_yPN`^WGywl&l|I(DTxe;f4|^{HRmw!bJ;sg!;Da!{SR
z(w!f5`)W~#yO*i|GIiCyU!&&H=aF1#B=?<CZ{qT<%t6=LL$0%<ykh?t-0^(s$?ebc
zihX<kt~IW;t{YuzTm${??Qee(1-dh*GIup}xihJT^~3o}bw?OauzTe5HO*(9qKRhq
zhlqPJ&jPYZ=s6BO18BB{zQ>{G0BI8X5+HY`tHJ3S{?MX6cmfYt?~d2ihu-9gy!y~D
zDY5he^$RZ}F??5plc^6qkEDAiRcI$Fl#P%Lq~`5KRl2fK!L}Q03~blZa6`Mg<FL?M
zo_!0in~|=Dd0#(!SHtqmtJ!aXhw*tVy~sVX;`CCC*O@=K_l?bb`l~h#75rKAXm$zN
z9a-@zGN0a7KsBJAOg`I4xtaYv!+8uHuc07M=6wtu;EkapLuF?QL&wtJ;ZeXN@y?r^
zUiiF0Fa8@)K6ErffQF9UltDwsSxWGsW7q$=@!}6}pl#=m7tavfO(R74@xlhTe9ZCU
z?_lnKZoCL$yy*YP@nS6I9;tiuYR2|j(50THRl`&CHqT};xqXsSxe>(ZcXvR@Ju>6z
z>mY~T_;?|IBhRl}OGdV-Fw#^o^0yGmey1SmBFR5`fcg_YK+*H3c{1m+4`VjIV^rPo
z01XKM8wtEqfIIm(0ZDh}SDwu8|CJh)wTsLzQfB6s2Df|XEe*5x;sIe#*D1_{$1QsS
zD(%j^mYqg)=Rm{Ez3mIpG_GF<%y?K^_8v!ryE4b&kZPcN_ki_e-b7JA-a|Qi_tTR)
zl4@`QcpX4<K6?xVL(0}w?vcA1+}Yz`DWzq<hXn~WrCQXwG5eT6>VRYeq(182{tOL?
zpL5;fy4AJL<-K*kXJ@z}>_!wHt5fG=;?=9X4*U1)_GEtResPT3rYba!cp7GRVc41J
z$z0Wt`I+lqVW%s*6=tc_>3q;Q;m#b-eqKN`0A<&q4j*M>nNWBE4)SEUjjILp6F}Ly
zAH&8O(bflwHcknMx?uJ&rhgw*kM9bFl|>u>D4<nj<B^YHBPtXuMH?FhWF;Hd!-gm0
zZ@8&v`v<g6@QnByoKMo6zf0{s+MRjR8248}C%YO^P(9q$&_^21?j5gh?F{bvs8+m<
z5-=0JMR@X<!RpzU(F<Jp7t(u;65arwJ&A~SMyke5>JElUz`N=#DDhqO3QvRcU3J~Q
zy>@g$bjv+TR)=`*7awAx;5)eYC-k&ljJ&6QgW8WR8;`wC*uwGHj|iJS9(w^8^?z92
z3;(ce1S!mQXQ8%x{{hyGBEC+l>wup_r!wL6BiGnU;PlEh;eSE61NaN$arm5#%>w?!
zc>F`)c){gv&*$?rFLw{W?abW$wmXwP>dve=@?1A5vgf+rH^^y&oFw8NzWWGM?|TW!
z+`kY%(tX4|f&~Gs35MVAOofKu?^So~Le}uzr&;<6A@I~RAW#1iO{BT~{b(=lTHSNq
zr^yY&-5)4pzTvx1vh=M|{u3mBLdf4}$hSY&eX=0GX!!2;S=uS(x&7*ChW&+x{0uaq
zAYZj>tYR0IBJSbtBQ?W!e_*-CCUmiGQFknYFSx}kdA-%AQ9j%I0nls~-s6?C5vc72
zme2ka;I3WI@8_$KVl-}f1%{?)J(*W&hUVG1;5%3yuW|3R6YY0pKJ3Yy@mx;suW|4D
z(3<UsdH0TEDXsbEa=n4!(UUpTl{w+c&-{gZc)#La`pfiN?%`)@K6k4e)-VArtWQeI
zKeZnw$^J;?J9zGBiDt<kBK`da&QsJ@Bjj5@g?CbZK?7oSemhdc4^d|BC^dY+T;j>A
zxkH?vPCT5!a-*EDB_94~xjme>5RY!oa=SVIdk1(t>nyj6^Y0Rm0g&akasDmh@q%Ev
zIOl&ud=2sIIsY@_=@Ue*kMl1Pj~N}ywR8T5#M7tKoRjlEAf7&%<}}VfMLgD!ENA2V
zcZshjo^k$L#H+-g#Q2b3@EGy*nILzR^M6h}^|st0&Obst?kC#M`3H!nPoKFxoWGZN
z8uD_xIe#bd^r<tqi}MNM=@Vye8|QB$o_c>S&iM_*(<jZ`dd`2A_&LP)aeg)NG#SaY
zbN<uB)2GXvlk*+K&m&&r{4(P4r2*~dd<*fPB%X2p3gYJze-dNgMVD$wGIq0|?>G#e
z97N$g>In#fF#7G$%xlyW5M;i+gkMp@uPWhhF5x33{N@s#Miz5<UoGMPzJ&j73BSLD
z|8WWbY6(xjtx_<5yC$AGFooGuG)}1lQ+V7U{2Ll2<+p0O;S@%NJUvi6i1Gws+Io${
z)S)QIa_9k~u*aftDt|kW_UVD6;H=g-r3Z$>>%#DR=khTQh<uGR80|WZ)FpzK?Vt`w
zLH^wjbvOz#Uw%`cE%d1aQjq!dKvH<wsc}jVB!#yHe=={>Ctjs-99@&wZ&dJjGC5!H
z6PKEdCOt6efk_WcdSKE6lOCA#z@!HzJuvBkNe@hVVA2DV9+>pNqz5KFFzJCw4@`Ps
z(gTwonDoG;2PQo*u?K3z?`-Yj_oz`p<$m>2ty#?)hZYLDM9`Ij-Y95L&`pBgFX&eU
z{idK#2zo%!!-Bpk=ovw)T8wgM3A#|wC4#OL^hQC0f^HJ@enGz?=r;v@LeK+(9v1XX
zLC*+Uwa6&H6I;qOzPV1@gv}Dz&7c{Z0yV9%RS&ebINO%`moCzsdQ0=-4Oa&i1%v+9
zpsp`zb1rr+Zf#o<Xl=V3e=HP=ZVV)rY22L0(<O|sRh^y7v_@=#zg%l}Iu<#cT8p!#
z8LDksBbV)o`W%-FvHmsv-5M;VH(#z@?Z9?IerJs6DiGs0L*ti7dEUV_CK+>*ez&x+
zZp8AJOLHz`%X$8Ng?#yYCjCJG1^Ig>{Q<;8!KlwSk!9GI{q2#7gn=35twR$G`8P$s
zl)vwuEa6MdxB!UuE0ibOFMsE!zZCN0`e)=S+bj<wsmQZ0qzofxRq*XdHOhOCnEDq&
z>$7D25=6=;mLk5uI$E!RVk=%nCAP}4iWXw%j}0vR<b@h#I{^p_g$1MB@1gVx76e|S
zv?HEMf5t=ADs|6qk!h)^v?9xTD<aF0E+7?)Ib8i)AeBbR5SmK)wu-qRRU08dS2e4t
zxgGR^`w`Xt2}L|b5wafSG2IJ`QbpZp%C)$WO5=J;J_bX|6+C%}l2`Gh4OxnVl2wNg
z(M5$E2Aw+PF*HMQK8y5}{giB`B>(AK@xCDLoa`yeqxD&m>AYTK3__NI^60mLtJhJP
zom6EC>Q^{R_4W|^GFK(`Yq)H5$CMkdL;O0fY1@U#1z;%9ZuaRRrmQ8$RfC!FmqKEu
zvc_^Zo2d*~wj%zF<sJxJDAlPRY*s5Gx|FNMo!tr;cNTFw;4W0G*rSwbimhszGF`FO
zUsS)Keg^uj#lfug*G(mmtV~k?PoZoZ{wTt-7@5naTr;I-#&vTn_-h7h1Di7C+9_6a
z|B8jIX9ibcwyHH%MT=%5Fk8(H3B7ulm{qV@Fj84GTcs#HGZYxEz68c=RFp@>TyUXs
z*&KpvD|xX~%r#q%zlWH5A-fEu&cUvmW7PSwIaGT4?5}nJuXiXD;ABv(EY!z>CfkaJ
ziJdkDE}33Ug42;xV>p9yW@PSFkcUlF58R7=f3v^T0smYGkWEvXAwG+0vV2ao-l)xl
zp1KjJh3~LZlt0whG6jJIvDGcPCFu{ulVXQ7!=`a;@ptqaf68-#MUEWSrOIim?aJ%4
zYd(F+C5fcfc6$JS4yq`Z!i<&C9~zP6x`Ji!EQB5Q4j;0x!3&p#(J=Ut74Riep<?-R
zq_ys2N>5_hRTdnfps))pa>TX%lqr@9VDwj$R(6vpVO&L;(#eE4fo0I=lY7CF>14pz
z2;OvcQz+^Sr~LuO0^y*&SZ{3h_XmQGA?XeIkXrXM#X9|<HJY+c*La+=MweTse|ps|
z%u-=}gQDjiu{6%H);f5N`H=as+aE2qUyl6N%M>?vINB~pf9sV36Q?s^FE^dF5W-0S
z)C4b13(&XllK{LuH{f6ZKQ<_$(SGI|(i4o!Fn-bj-nd$4bF=Xp)$EZ}IvQ9p7>EWE
zAs<*h;Tu|E99g5;2l@XSS~NQhuD}+0=-GpyZA;-e9B&f<8!`54+QWh9U}|WE6Hq#O
zM=0vIr}Pcjt`1&LisKn@N&@!fkIzIW#4!SxXe5X1*ddw<_-N-OH=Gy@*s(p+j;)xG
zil)d`0!Klhaf7I8Y6!fcE)-)4)D-6vpyI(eZ7-!7P<^;foK=uYY+2E~<f`VyWFH3X
z*g_hOfr-YTNfjwr0@&BniMRwWmXHc3?bwqa2(O@95Yof;z~+DtcZ6<?xFb+br0A?4
z&DF29I2Jc!?lMk*4sXC23UPWEQz4%{5y0UWc#=%_nXH>TYnU2HFJGP*&*2ygG@16b
zj(^y3xC4LG^u(ZV;#}&8)I+=})-)801e){>=}_3;l!!-+M8}#QPgm0*4j%B?TO5m;
zo6rZDID-OaH_0);>y0M@C7ts8`f$gf$Z`GG@D^`TI+_d(Mgz2K*YMwDr~P=HcH&;i
zdviCuJ^tPp4#z-R4D=_n6eJ=?WjQ*@(O8a5QchAsyQnYq#>D(X>b)m;sV6B-w<xsp
znBFaGWHMfWZk5TjfGXH2WzAm;ra;cOHP9|H6M`yiolKf-4SWn+pOLmw1zUJ{j9|9)
zne%M9gjBeoNN??NL5jEqYGKa5mg%6t6xKgZ=f7lV5>Ub3Dcn~nOskHGa7;%g#6JaG
zVN+Rrnub&{@EJO$zH&`*t;iy(;N+opUE>JVgTnQ=2w$&jNEPq*%}8UuQ}{bnsvm{*
zxGb<Q|899Tfe5$ZB0IZv4Jk?+VKYxpXR|vs4ixM_K(WkX@_T^9?}5}rexMb@MEF;L
zD{MyL?`fr-ua%Mi+cG$9-BK(?f6CRux5+*xg|_t?QpLb$bCx-UzZ;f0LD*rIqTeLG
zMc~p868}EOXR^}o9|Q1@#Zq7RJ6<XOS&plP^{>Qt^ZK7(&mTfyI__8LD(GhdmupbF
zF!XDI%Xybv9K9p(JqtB1$@lLgbq1HlW}#ykIL}1%3Dk2MPH&V8Rq4-#z^6+g6Kw--
zo4`&taLptqlM&$b43vi9GWa89@NWU1s6Mn(o(SJp27kT`{?jt}8)fjbWpG+#(|z1?
zsm7apA9xB)0;jEk6!;%+gL*j5p>a`v8T@l)a9Uwb<cCBVd>inI&U^bp8F~5;GLd~+
zc29IZ++UWF|3(@7pUU9Rmcd^sgVQIEiR$@Q8T_3x_?a?zC7$1j?AMpU=a#`6%HWp+
zpXgjYC&!EL1=(4a3;dw2Nr+Pyw_nHPewB9gR|>R~Y@+(FD}yr!W|V<MoH?RcY&cvU
zJy@s=GKYUlG`S_h99Z5s#0SrDj5lGhLs+O7(kURAuIr>KQt@yKsTe<_%@G(9=i?3e
zAt^cLNCh@SiqGmDd=2N|YQ-rt<FK)UV2*J!HeRwG31JdH7=sm~q*x^Gjp1aum`+P0
z2d#5Lfh5iL<RZu`7eQDf8kwYT6pW+fn8SyY@sI_Jfeq<Fluak^05@2E!k^6AuptrH
zBy(sLDAQPb$>ex-I?vAm?_xy=Iz%V*@svU3!l7gZgcZvMoHH2Zi=l3}cjanVZ@1Ue
z(?8&)GYI`LV^wP`Ah7zv+Qp9p?!4aEhTE`+bTlK$oy=v$Vi`n8%5@kUcK}c{=8Z)3
zILeHhmt3d|7HU8p6T8~WYl;GCb>uBB&W-uC3FZ~zhbq#YlPCDC^4!qj^3bt7{oMoJ
z0au5o8%-R)kQYX{MR_cv*+$73jYi8wtNqEC7f0{;u`>K$et4tzv(1aC>c;)-f{*%|
zTj=b<4b;0+%}c@zBCS3gAQg)MY+m9`rXziu`n`Rjey3NQ`Y6c%m%|^uUeRs6!B_%)
zBTUa*>4}gNEVRc#^dZ9Q9sitlU*Cmitz4hy8eKPE@9OR8G{)jTCZ^$>F1)6AkmqE}
z^RwlCGm*_@Wtu2$Ag#tF^&hlo;ELxr8ytstB<F<^q4mxbTvA`|yW1mxQM^wmP#IKD
zIp4<@S{~*j#@aU~_2oXdc$JagXAmaL6v~nE^ud(&#7Ta%O5+-%Lcg7J#Wt4KpMDTH
zW722UnxR1zMsOOWP`<3cfcY<CR5w|^+*jxn`f^{PlozjG$+BLgs7_K}?vtENPDOa!
zAm^1Y{kInRq~9Ur<i5#Kp)VDwj`;X~p8s`VRHqhP6y*GW6u%EpxLH=rVEK7uBFRYz
zG3iS&d9JuTZ+yTkjC860?<moi>*M$|Ly_;WHrXik|8}7-Zg=5#yrW`r^Dh5Ysc1lc
z+mOf?O7!J-o0m01g8H{Pl-7SIGO7KteEFT{ZOzc4=gS;Q_5T!^W_{MI8RC~1#3mhU
zu;=-o*3ERQ<ntl-p~`dOOZ_EzNxuPI)ub=iyR$C^h#=(*uE<M#4>HK7vV5&m<D%Nf
z)ZYtf@}1O|>uPyE{%+BJ@i-OgBk$uu=#u}WzC54xp!i9z5qahy>n*V#BVpDrtc%+X
z30aPmFV+8PiN5^qU=#XMPf{w&9LB&9HY~tWeObQj#--)cd3omgYwI;`NozL@&ZbOL
zDAlK*%?OkJYu7a{-R9zi&b=NNv~e7MTZ29o;Vi+W{$G{gBuhck`9j}7i}6vHNJ~M%
zN`FX6M>+itLt>lgKT^KL99vdKe|@*1L1!?ULkY{=W%N(3G_>Ttcxj_lpS~LrR_ec>
zxDAbgk^w{NOX?N+(!Qk5UL&<l0)m$MkG{)DTV~NcLjT&5%o3Ju22kQ+ZssI?s6^1j
wgG-O&MYv@Dlkcx>(3gFW<YlmlPrLLT$Q(*oMkoJFr2iPb7!jT=$t>0XPcR{BMF0Q*

diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh
deleted file mode 100644
index 5c6baf5..0000000
--- a/mpi/all-reduce/frontier/128_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 16
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh
deleted file mode 100644
index e1ad604..0000000
--- a/mpi/all-reduce/frontier/16_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 2
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh
deleted file mode 100644
index be7bdd9..0000000
--- a/mpi/all-reduce/frontier/32_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 4
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh
deleted file mode 100644
index a8e13d2..0000000
--- a/mpi/all-reduce/frontier/64_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 8
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh
deleted file mode 100644
index 81ffbc4..0000000
--- a/mpi/all-reduce/frontier/8_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 1
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
deleted file mode 100644
index 56c18aa..0000000
--- a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10
-  0: Local data size: 1024
-  0: Global data size: 1024
-  0: Number of GPUs: 128
-  0: Message size range: 33554432 - 1073741824
-  0: Number of iterations: 10
-  0: 33554432 0.240206 seconds
-  0: 67108864 0.476990 seconds
-  0: 134217728 1.041500 seconds
-  0: 268435456 2.951969 seconds
-  0: 536870912 5.990606 seconds
-  0: 1073741824 12.004613 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
deleted file mode 100644
index 609afbd..0000000
--- a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10
- 0: Local data size: 1024
- 0: Global data size: 1024
- 0: Number of GPUs: 16
- 0: Message size range: 33554432 - 1073741824
- 0: Number of iterations: 10
- 0: 33554432 0.133082 seconds
- 0: 67108864 0.267616 seconds
- 0: 134217728 0.634895 seconds
- 0: 268435456 1.928400 seconds
- 0: 536870912 3.973167 seconds
- 0: 1073741824 7.913018 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
deleted file mode 100644
index b92c437..0000000
--- a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10
- 0: Local data size: 1024
- 0: Global data size: 1024
- 0: Number of GPUs: 32
- 0: Message size range: 8388608 - 1073741824
- 0: Number of iterations: 10
- 0: 8388608 0.043066 seconds
- 0: 16777216 0.084259 seconds
- 0: 33554432 0.167705 seconds
- 0: 67108864 0.336696 seconds
- 0: 134217728 0.773389 seconds
- 0: 268435456 2.284815 seconds
- 0: 536870912 4.693147 seconds
- 0: 1073741824 9.356859 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
deleted file mode 100644
index 122c83e..0000000
--- a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10
- 0: Local data size: 1024
- 0: Global data size: 1024
- 0: Number of GPUs: 64
- 0: Message size range: 16777216 - 1073741824
- 0: Number of iterations: 10
- 0: 16777216 0.101777 seconds
- 0: 33554432 0.203258 seconds
- 0: 67108864 0.406569 seconds
- 0: 134217728 0.913391 seconds
- 0: 268435456 2.633732 seconds
- 0: 536870912 5.375804 seconds
- 0: 1073741824 10.708706 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
deleted file mode 100644
index a9b69c1..0000000
--- a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10
-0: Local data size: 1024
-0: Global data size: 1024
-0: Number of GPUs: 8
-0: Message size range: 16777216 - 1073741824
-0: Number of iterations: 10
-0: 16777216 0.049728 seconds
-0: 33554432 0.099497 seconds
-0: 67108864 0.202129 seconds
-0: 134217728 0.500335 seconds
-0: 268435456 1.560791 seconds
-0: 536870912 3.265382 seconds
-0: 1073741824 6.500534 seconds
diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh
deleted file mode 100644
index 33729eb..0000000
--- a/mpi/all-reduce/perlmutter/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 20:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh
deleted file mode 100644
index dc30279..0000000
--- a/mpi/all-reduce/perlmutter/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 15:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh
deleted file mode 100644
index be73564..0000000
--- a/mpi/all-reduce/perlmutter/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 20:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh
deleted file mode 100644
index cf714da..0000000
--- a/mpi/all-reduce/perlmutter/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 20:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh
deleted file mode 100644
index 49ff135..0000000
--- a/mpi/all-reduce/perlmutter/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 15:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 1024))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
deleted file mode 100644
index 4e3e17d..0000000
--- a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Local data size: 1024
-Global data size: 1024
-Number of GPUs: 128
-Message size range: 33554432 - 1073741824
-Number of iterations: 10
-33554432 0.264543 seconds
-67108864 0.527909 seconds
-134217728 1.092095 seconds
-268435456 3.194094 seconds
-536870912 6.415718 seconds
-1073741824 12.819154 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
deleted file mode 100644
index b377ec2..0000000
--- a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-Local data size: 1024
-Global data size: 1024
-Number of GPUs: 16
-Message size range: 33554432 - 1073741824
-Number of iterations: 10
-33554432 0.142677 seconds
-67108864 0.324897 seconds
-134217728 0.673650 seconds
-268435456 2.140369 seconds
-536870912 4.318430 seconds
-1073741824 8.632880 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
deleted file mode 100644
index cda53bf..0000000
--- a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 1024
-Global data size: 1024
-Number of GPUs: 32
-Message size range: 8388608 - 1073741824
-Number of iterations: 10
-8388608 0.049975 seconds
-16777216 0.092395 seconds
-33554432 0.181888 seconds
-67108864 0.368241 seconds
-134217728 0.774021 seconds
-268435456 2.362729 seconds
-536870912 4.760279 seconds
-1073741824 9.524390 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
deleted file mode 100644
index 341fc93..0000000
--- a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 1024
-Global data size: 1024
-Number of GPUs: 64
-Message size range: 16777216 - 1073741824
-Number of iterations: 10
-16777216 0.111867 seconds
-33554432 0.230462 seconds
-67108864 0.465838 seconds
-134217728 0.970915 seconds
-268435456 2.875694 seconds
-536870912 5.771569 seconds
-1073741824 11.522959 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
deleted file mode 100644
index 05fd1e8..0000000
--- a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 1024
-Global data size: 1024
-Number of GPUs: 8
-Message size range: 16777216 - 1073741824
-Number of iterations: 10
-16777216 0.058292 seconds
-33554432 0.107128 seconds
-67108864 0.211506 seconds
-134217728 0.491929 seconds
-268435456 1.508757 seconds
-536870912 3.052047 seconds
-1073741824 6.103450 seconds
diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh
deleted file mode 100644
index b6505f8..0000000
--- a/mpi/reduce-scatter/frontier/128_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 16
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh
deleted file mode 100644
index eb6b2ba..0000000
--- a/mpi/reduce-scatter/frontier/16_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 2
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh
deleted file mode 100644
index 4ed3437..0000000
--- a/mpi/reduce-scatter/frontier/32_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 4
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh
deleted file mode 100644
index a5a9957..0000000
--- a/mpi/reduce-scatter/frontier/64_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 8
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh
deleted file mode 100644
index 9d4191c..0000000
--- a/mpi/reduce-scatter/frontier/8_gcd_run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-#SBATCH -p batch
-#SBATCH -A CSC569
-#SBATCH -t 20:00
-#SBATCH -N 1
-#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
-#SBATCH -C nvme
-
-## calculating the number of nodes and GPUs
-export NNODES=$SLURM_JOB_NUM_NODES
-export GPUS_PER_NODE=8 ## change as per your machine
-export GPUS=$(( NNODES * GPUS_PER_NODE )) 
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
deleted file mode 100644
index af5e98a..0000000
--- a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10
-  0: Local data size: 2048
-  0: Global data size: 2048
-  0: Number of GPUs: 128
-  0: Message size range: 33554432 - 2147483648
-  0: Number of iterations: 10
-  0: 33554432 5.046207 seconds
-  0: 67108864 5.031027 seconds
-  0: 134217728 5.063647 seconds
-  0: 268435456 5.054240 seconds
-  0: 536870912 5.047598 seconds
-  0: 1073741824 5.051536 seconds
-  0: 2147483648 5.057082 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
deleted file mode 100644
index fa9c67a..0000000
--- a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 10
- 0: Local data size: 2048
- 0: Global data size: 2048
- 0: Number of GPUs: 16
- 0: Message size range: 33554432 - 2147483648
- 0: Number of iterations: 10
- 0: 33554432 5.091016 seconds
- 0: 67108864 5.092117 seconds
- 0: 134217728 5.082377 seconds
- 0: 268435456 5.103443 seconds
- 0: 536870912 5.102289 seconds
- 0: 1073741824 5.116191 seconds
- 0: 2147483648 5.115768 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
deleted file mode 100644
index 23a0ace..0000000
--- a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10
- 0: Local data size: 2048
- 0: Global data size: 2048
- 0: Number of GPUs: 32
- 0: Message size range: 8388608 - 2147483648
- 0: Number of iterations: 10
- 0: 8388608 5.006776 seconds
- 0: 16777216 4.981770 seconds
- 0: 33554432 5.014587 seconds
- 0: 67108864 4.994224 seconds
- 0: 134217728 4.977063 seconds
- 0: 268435456 4.980235 seconds
- 0: 536870912 5.007770 seconds
- 0: 1073741824 5.013561 seconds
- 0: 2147483648 5.015718 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
deleted file mode 100644
index 560c383..0000000
--- a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10
- 0: Local data size: 2048
- 0: Global data size: 2048
- 0: Number of GPUs: 64
- 0: Message size range: 16777216 - 2147483648
- 0: Number of iterations: 10
- 0: 16777216 5.006610 seconds
- 0: 33554432 4.998351 seconds
- 0: 67108864 5.003749 seconds
- 0: 134217728 5.066133 seconds
- 0: 268435456 4.980950 seconds
- 0: 536870912 4.982830 seconds
- 0: 1073741824 5.023178 seconds
- 0: 2147483648 4.988750 seconds
- 0: 
- 0: MPICH Slingshot Network Summary: 4 network timeouts
- 0: 
diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
deleted file mode 100644
index 493d5ee..0000000
--- a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10
-0: Local data size: 2048
-0: Global data size: 2048
-0: Number of GPUs: 8
-0: Message size range: 16777216 - 2147483648
-0: Number of iterations: 10
-0: 16777216 5.130130 seconds
-0: 33554432 5.120491 seconds
-0: 67108864 5.115654 seconds
-0: 134217728 5.128319 seconds
-0: 268435456 5.111989 seconds
-0: 536870912 5.115996 seconds
-0: 1073741824 5.127237 seconds
-0: 2147483648 5.116940 seconds
diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
deleted file mode 100644
index 469aeaf..0000000
--- a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
deleted file mode 100644
index e66b9f4..0000000
--- a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
deleted file mode 100644
index 07d6020..0000000
--- a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 30:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
deleted file mode 100644
index e51945a..0000000
--- a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
deleted file mode 100644
index 1b51537..0000000
--- a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 30:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
deleted file mode 100644
index d696072..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 3.352414 seconds
-67108864 3.323000 seconds
-134217728 3.331817 seconds
-268435456 3.327162 seconds
-536870912 3.345694 seconds
-1073741824 3.326455 seconds
-2147483648 3.321790 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
deleted file mode 100644
index b71477d..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 3.368300 seconds
-67108864 3.361940 seconds
-134217728 3.367816 seconds
-268435456 3.360722 seconds
-536870912 3.363088 seconds
-1073741824 3.392373 seconds
-2147483648 3.375325 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
deleted file mode 100644
index 38e09b1..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 8388608 - 2147483648
-Number of iterations: 10
-8388608 3.368554 seconds
-16777216 3.367485 seconds
-33554432 3.376475 seconds
-67108864 3.381592 seconds
-134217728 3.384111 seconds
-268435456 3.375780 seconds
-536870912 3.371542 seconds
-1073741824 3.379895 seconds
-2147483648 3.381470 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
deleted file mode 100644
index d982100..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 2.220629 seconds
-33554432 2.201147 seconds
-67108864 2.196879 seconds
-134217728 2.199449 seconds
-268435456 2.194973 seconds
-536870912 2.196809 seconds
-1073741824 2.196212 seconds
-2147483648 2.201029 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
deleted file mode 100644
index d2bdd9a..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 3.558431 seconds
-33554432 3.553477 seconds
-67108864 3.562137 seconds
-134217728 3.556267 seconds
-268435456 3.551567 seconds
-536870912 3.599067 seconds
-1073741824 3.608635 seconds
-2147483648 3.624090 seconds
diff --git a/mpi/reduce-scatter/reduce_scatter.x b/mpi/reduce-scatter/reduce_scatter.x
deleted file mode 100755
index d50ad5ac990357f4067a380d5a59a5e6b24a3805..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 25888
zcmeHP3v^V~x!z|c2{Ay<@DRaAIcmVDWbz=vfPk4~0uwut2tn{U4D(1v=EXb^yvoBE
z>omkt+huL_TCUexdT-Y%z3Wyjz2zZ_=xQxmAGBJv*2l!~QV^&jbH9I|y(cF_rh4yP
z>#nYQv*GN2|KI-qfA9UDefHV=O!i*a?3`y(6efd-&1J;Ry;$Q=1CJToe4|K!G_ZPB
zf_N$`V@2Q#IHu<}FpVc9tupDjq{TYR%%G$f75Pz<#&aY+pL2{!Dr-oT^ag?^k~*y#
zr?ZkWi_Vi?AufqmU99m8NuxqeQllQVar4DooNaE>^|JH=9WVcyPNg0-i}aondQS;G
zNjH-Kf~3+f3*S$8seSp%dq?Ok8^zW6y`of`(2I`I@#q+hCnRmxVJ4~c;kD4Cdj3}f
z>C?4Q)Mul(f0Am#pro?A%3xsmjM~bew;~t_Cs$XjuAfmcqt+UaSZlaFx(#%%=3ldz
zTQDOQMQ}b}L^@-K$fWL|4S`<-jIP{J4`3*KejeNfe5i5*dGJ+v@U?mHd-LEw$b<hh
z556l8zBdoPFAx5QJotxs@Kbs40^G2n{5CcZJ}D1AH4i>34}NtXd}$thMIJns2Vav1
zUz-QNKM(#;9{k67@So?wU(AEQmIwb7_iGG0mu+!tNHHeIrIC;)e5YF@IEz04+`=ZY
z2De6VHvF37=bnK-FYvu?Ei0=hOnB}UJnIVH4aR>aj!~YTtEXhFYw39;IUr5rh{E1N
zc~p`<t|;t;;Hk{mB}FPjS%-m9NS+%wi$)v8FyK&D87`vd;==Q|0Za*D<ri!`zy6b+
z^BbFAJ+Ob}((`}ym;3KvE*C~Pk1L*V#}clPI}ira5sHKb;sP7<bp+xGU(Dr?xkElz
zAncC-^{jTg{DH7L7`WXB*6SNetTPa8^@TjqH4LdmzJ%SkD&X-EGcV@jwACF9Mm&Vf
zm)T8`WH=#^=2$EeTa<_e!X1R@`is_tJ)N;gm`aW(Vu?slkRWSqYjL%N0|}mJiiAS0
zm^-{;fQ#!o-bgaR{85-q_*qD=LOg*s#F+oKSRmo!8uMiH$ZGqVsL$nzgcE^|WF#5q
z0s;$#M1c#UE>FzohC2KcbFXp5<Guu3lkj;GOqAyL2b1wm=BM`a+Qxj|q{ruqd)x^)
znPW@b!eFC277Iu)5eWI1Z#A-9KCe6BMj^}Nas5UC3VX>5VNWQkw<wxmxV7$(7k<-U
zS#{(>S1n#$bkFG}HiOM~wlp@mYOFQZI#wBpCMv1kl~G@%$K$K4o?)%BR`DcWd@gzs
z2A0pNjNex2hx-$#S!F!x_N=Jzg;&KZm%HP>tci+v&=;?+tGL}4o=yfTs_VEduR9j+
zbcZWqxW{Nu#dK?}wW=a9!xyZnTOGTt!{1SN>x{aXH`W=8PhWX!v^H{UJRa4{&0oC=
zPtYCis2r%INLD1nE5ec6!s6}S5l+s^*6_?meswG4z?jDqBp+52Bbr5g74f%cb3)Mo
z?+YB@o+qRu5p;FN-C21rH^sA8M?=0)He+Sd?G3pTomn6h@h0I~ZjMUw2eL`JbyAks
z%Ad_jW>cPcGO9Be64=st8WBGC6T=GUml<%w@WMI`IL&orh#GJ?ccQ%22At-zGRX0o
zFdCa=kmEJsQzQslzf9xY2*k#68w|LyT!#TSmfL2)jq}K*2As-}VVMD!?-t7IG~gJ7
zvmt80OR^~ZV8BNj@bv~<Op$r+y#{==L4K0~FE!u~8gRTMvSG6Ur+11BTMW3Eg7d7c
z2HZH`dCGu`B@36|X~4%z<Dh#D_}K=0p8=m>!1o*Qa}4-927ICcPaE(_2K<Ntr}v)>
zCk^;{5+v%@C}woe<=Tz-Vz)+w<6_Cgd5c@)l;q_Sihzl3jR=>Y%u{-#xiy|3c|FBh
zR_@k0!Y|Yz9#^?FP6;>8>+22p#au#<XpS*dPz0FCtY2rmq_lj7GV2&KUh)}|&kWfy
zUV}L$JFk+@hJ2=|{(4xC5(chJ2IU#9Ym%YmzL*yAYtdN5<HP$hu7#5Egobg<)2W5C
z+4I{L$E_u-6(a5qpGNO-4X^eN-z;rP(5qEYT*BtL13{m6mWCIrhL?FHrcH^@n&Q<i
zLGC3bY@jqU2TiE7gjZn><JAJ;XfmNi-89F+dzcC%!&9s?{94?H7r!@N!WL8E(2Hxc
zQ8iabG#Q_(%?<^^u28(gMU^7fz1qlPHW70t0+DchZV8(oj4XEtHF|yXHt^b)FlWRw
z1nOvM6RpsjK?RpEtF?0Ql)>sr8d^3`oZh}xo71_V$<^9wYjd^Qmb&IJfwV7dX?3+b
z7B(+(EO6R^H!ZYX4-tsBwYwI~n>PnHP&v<hC9T0}jC&CSMHKXN(7QorVt{xIbQ9<^
zpc)?Z{ltSF03C$~WHvg_Et46$3G^+{$EY(Q$gx7XeIZj;Ta@!gjVRuPJQ}0uIlK;K
zRFGif2n+F5xFW#j5@sv4++%7QU9<*DU@5;1S1<Axf@bqgtz3X?MsYnu^%f5RVbq2O
zR3rjQ!HMfZTn_`ggAAjb#?pz6rCMWYd1Gmnt+alA=}x6<#KxlD!utxoWxmH`zP_-u
z-d0-01t11ti><WS1~I)}X_(oI3N}!?WxdT;DpWB$pkm;kVSwpCn%PS&-!j3-edgv;
zt+xP1Hx@cdtL`asl-75RXf17s@xp9W8&OzeX)*U@8|>7;-vwmH5H<}8{!P^fTT(CG
z;z+&g=z8x+Tf4Kj@FZR@j^5ELIhpe?28PVphcNUUTkJ@^uRc-u0=j|v#7;-|&Sd$X
z!e4<?96g0^LIO}?f}^)?FEB^%oJ$X7GU)>_)30p0dAHC$FrYnqPvM`0_TxfZg{7L=
z&`v#<ewr%KQ}~^1R_~nG55nR$Y4JA{99^eP>RoG0q;tgaQX0G^KBD^hzKc1u%hC1c
zlYTW)nCQ=BKu1s0>DdK3z2MD{GMVa^)$%DG0Cu(ff;Ry8?P_F74M{YshhO$1Sk=kY
zI~emj)U|Ilt4nqsb*h)`@vG7PLj)aCmmECmP%q=mYwD8Y#2iQVXB;`LF3Avbnq*&u
z`f*Y}Zdb=YtByYbvA#2{?8)lb)oA*V--%M7d5|;@+0_Yqp#7;$9sdQ=Cv0l@akc#z
zNLp<_;8$1Cg$5r!40;UoL^HDdHZ}ID1wyC&YV36jHFM78)LxtV2DSB#!;m^n@aSWG
zy!EI53hs0UDkmWX4$&863a%R60M%x-y$_nJK7w5TLC77l5n>}m?FUnSghXG3Ih%Uj
z$5hf65Pka)$sb1caSq!FCe)Ti+mBM!?P@#R`1Yri0nzd_%=c6Kknt?Qw+~WI8tr-C
zg6IQ8D-Io1*G*gE=z6*sd9S=l(JK`FhN5R2Ghb4>>rQ7fj+uKCO>`%5V|Mw~wNIyB
zJMr3yJ@j@;j`nxIoS3lT1$w>pm#Ev`wQP8qUW<M;{4|N~`pDu>jbGr%?CO7Nx3Pb8
z&0=c<)jRtyS0}Hh{-SRCvt`3eLX}F{wW|-Rqvkm>I~|@9)Zxxq>XW19@A@-p9)2Fl
zd3ti!QFSSox25*lPVBRt*vTvQGk^E<iKjL^&nx!R-J2HMme{VdEw;5Ua%?zoFA8i)
zok-nL=5VAEWy`uUg=%*QPjE}my5j2N+h_us{tR(T>RCV*37r7sNbLqRQ9|!=sXc%+
z3GD)8>-xf^Zv2#<t?oC}`wk${+I?KT?=MI=XC76*@kb<DQqMJaXVm*%B(YwqOfM>v
z{wFF0sq=TB8us*~g00ore+Fxx87!++yWbFc_Vm{P_SnnL|Ne<P%IvAv({JIm({)GL
z`7Alz(KBbuOq8BF*0Srs`0cM-H0DgbTfH-VE7|Rt(}T?IJF{pzP-p5y`X<np)H5x7
z*m$36&4&%}`mk{ym7U5C8*@IyV}Qru(+@a3`?os1;76c**q8<Z8a5uI3>r2*rvx81
z)X5mo|IdvVpTCdxojG1SLvS;V5c$Uo3taOR$BQ4q-2dKqf%dGM*8VpKj9LsB{Simc
z=&xVF*p7whSKDbhv5j8i>2okSc#2Ye`w*k^O@&ZP&pF%IK@MH=b<BSC{H;8HbP*ZZ
zr@}~O*2v2cO8+S<c@IfqgtGq+Mkrfqypx8gC+R8FoT=mKUt>Pry;JSpjX{CHassyq
za1Vh!T+&<C=1hIy#2`}dOnvf8YFqkRA@(H1oW0(%i97JDu<fH2=EReiu0hqEsSne$
zKpnm9W#e~jnu-Q;)ftrMNtD%I-8=giP-5v<V9b$v7K6vyi|nb79eB#?@%sM$iNB`o
zWlkC<oQ~9CN9u)i33T~0@9aTsl2tqA!m0MuA^0@|BRd#aN9rw96UaeACXOM2(_r5M
zTb)UN2!hdR?R;uoTRIJvw=?|+mNn##Q}om$uROg`ASN_2y@1qrIyO8*1M7{pn`}4R
zZn3#;-tFuSmIcWL(f&y^J21=-9LgJxD{m^RU-w;Jf&+Cf?{St*w4;IJovCSMj?_oC
zU&5R{y%N@`a9XiBQ-|U2bXY+21vhO&Rld~7G@<b;_*odaR6zd$D18bOz%MoOp=k7~
z0VD4T=r%I)<X18BsL-e$F!DnI)sT_cS21$4(4gglXwD4+I!N87f{bkV9Irz56fGI-
zYU|17)M@<{(g4Zy*Ae;E*X(6W@eW$s_@!;{heb?fuM?#m(AgW)^n}<lFDAF^w%-Gu
zK8%QWMYVf8#)D7Pn^4RrHubXKeWKp7Yez`$iCdMlAA)@!e}OrMXYY>5fVZ2`C%5&X
z#{IJfWBr6p9gO{+u(5-&SAn5NkAmeLaHsB1Op`~Vo;wa^i}*gN-U9q6x{v{<-`o1<
z0jHO%0e^^aEAZWeaWl&3p8)({2jld+7v7J3D>Io4P3aw72dYwc9&n_R?>JJ6_dnNs
z3I^G8%_ntoOeaT(IJ)lK&(ynLMzZfo;(MC+J9@B|pyfo@$)-f0>tw6iy$M-ecOGNO
z%Y?wTF+jH0z=5<Z=s|mNzeYdTe2m=C)qG0n_jKKPge7m5^5>AeUC7_4%QrmNd?YJB
zz3a}CELkPxx&51j{N=j*I5Z(EU$m*eU=vm`j;`kY#a(xvGOe=+T`YOj?&<Iaw|F_P
zx4IqWvmK{^rVqk<ymA%-wH?4R>HPpVZF+t;tzQN$$H35Zt|j$4&FGxHQ?_Fzz1Y!P
zL3EKV^+ikSxbsr-d9h>H7v}UL*mQIsOlZ|_$n}Xc^%;hAd+M+)GyY9S*KWlz^H0f-
z99_>8-*~eez%XIWu4i(~Kf0T46zumDDp<DDlFpRbNBX;UoTsRPddSp0YQ+k1%43Mt
z$qh&m-$|K$?@()}{D63JW#2x|-%mU|!1{J_{+q<Z>8x)n=kFpOy`1%J=KSr%<Dq7K
zn>ZgM9zz`KThIAhiN`yF^+h?qoOryDS>H0w-#|QluIOvy{6gX}3uJu_oWGiQ`UKlo
z#ra0!>C<bU#`#&qW6jC>ES#?)zLa>z`OAo>Ppo}MFeYTCTtvKu_;)yeF7edG`u1^t
zEb+LXXg}vmh#yD%R?eG<r?IVXGv`060FPUN_H+Id;^|Xq-+In}L_Br*z9{G4C7wQw
z_ATT5ABmqtd>iLqCZ6UleGQy{f%x-?uj2gE#M38~K8^F+h`)e%3+Mlhczjtv`#JwF
z#7`#v2nIadf8sUb-vPhkVoEah6G88&)Oq<$dNlO}1nM9Z==V+vlCP(pVc?r{_{BNA
zD~Dg1!*}QKn{xO^a`>O+@H=w&7jpPFa`^Xi_`^9o{hmvKZm=;>ho;arQsb05G=(iX
z;osjVNjI7TbpQ${g}iVpn<&Z?glWq(E=L`Vf-Hv~C<<Fm8mII?P>}ZN0i@72LgSPk
zAPT31;ZILz_-8Xs<ZC<^_L}t6g@Tvupbkkv{v8r^KngNneyiUi^r=Hqkook0QfRN%
zI33Mn#)5)BlF{pPZ;_sluFC88rr^<2IWPSr)Q8Ei2ZlW`?15nq40~YM1H&E|_Q0?Q
zhCMLsfng5}dtlfD!yXv+z_15~JuvKnVGj&@VAuo09vJq(--!o`#qU}>t2ADt2Jt(P
z+^?Qn%?CvO<AOdR=pI2|6Z8W?j|*B<quZY#=u|;x2s%&D>jd=+x=PS{1bslz#|3>t
z&^>~_Cg=x(9v8G|x+q`Jse;ZBbe^Es3F;Sgm7w<s`hcKK*dnI!&2ri*Y;wS^1Wn)M
zr)lMNZeLwZRsBrw%<1kbcTIKe@+*AP{eEwq-|e1JUsYRGTUS5BS66>2{#GXtUg3+)
z(zrRN(=Lp#`Ato;v~p~Qzf`NPvQD>FX*E?f)ljY1%DHSy*kip^h%H*Ys9A%h<myYc
zE3DWah?_PIm;M`_{>z^{@8C+{Lk0Ss(8QV%%U=oyOU>r_cW3kE?}uZBnEd^a{s3X9
zpx5VdWa;*0f4hGuVPJZBx1b5SykGPq`Fm<KhtD-*10dS3P@ZhR{9XMCWO{M^0=de1
z)3=aR<k=TehLN)h__m^Ah5mMT)I&n+2C{x3BITT!h|e-xR1Fkc>Iw?66_yp$5JNxq
zoA}8K#mWW(5T*(Xdb$6O(hHa$c(KxecohBZ3{@-FJ%2%_skqRLEc4BXO#AIX3Tin#
zg8quQP%jxmqbT1}a4tyE3JB0Of)y991U>5>L?vIRh^Ht*)<z!Fy)Y?7)O|+I#f?<T
zmr?Rj7*a0d$$gZZ#*-FgDOO4r?L$Nt74jPBsF9DN8A{c&NRQl2$!bdSpPCidHPsrJ
zfgKjg!=-7GX?jr%K^;6B$iqJcu3klDHc^!+s1I<K>g^=<6|PF`gSaeo$CT@?Li`@C
zF^VW_3K$BHKzeL|DND$4BfyNiKuC;N7Mt#5<CS*PTEy3y)<NKGsZRA^6Y3DrrCcHI
z>{`IMvxpl2+flKB9;J*?EJb6Kv5L9$ywWM9<Ir#M2P)>$t40w>R>mlRM^d&0e}rJF
zMdqxLSB`8Mchw{l{(8V%#zu~uJJO8qUoe%mjN>ZIQna{ez@pIz%#PrOgkC93Ovu{I
z8YvtwTcjv0;}jSkaUqNst0<3(IqPiYl1T)Y6!Kz68f!Kie}6FQY<3CAXe+yFl3wRa
zCQ<1P6TjC8yws`;fs;YCFk2rJnrs;`OzfDEaLL#aBsdm1#kw;nXI$zo1$o%)^v%1l
zPjBM48{wa`0n!bI_ynrS>`CELy*5L7>N=n%z7tAOJ})g{3IYjY6Igsr-0O?R#ZG62
z&EMF<Z(XGSDa~0XIdYh1D#y&0%dgU|y!yfmV{x-(rO$+u29%3o#?0sseaQ0YtYz>l
zfF1N!53;a%3zvxx%U@anPb?7_C|{1W=3Pu_iOrg3!a)TJJIf?TT=Va7a6thu`cp+S
zTZ;IMic}`!F>wM*hsPuLbjOo%pT42H@`}np*b_{8eT?~n{)&NmV>7?k=eKrBZ^(z#
zyqhWJv3t$ogn6vS<Agap+dTH_`8P3Bf%$!kp1a>vKFM5S<u&F*=2vcic%c1q<TqcU
zIJm>nb~*Z+FBh0NeF1y2=`4i+cJxydT-f!G!vknXzpG^dHu!t7!4Qr1GEb*F#<&dQ
zCmG;Pt2I?u>#t9(B9usmeRDc|VP7oZ0qc%=I_Ky|)@T(S{QrU)tpWz;U@JWID*T}J
zGvPQKZ{q_S()U+Z1byL-MCY6;K*{iJfv~qC;a-ky=iuFOar^>KLBRg{!I|iUI1T_4
zjCf}Sc68Fg2Q<eZ!?6xu1x{M1z*b5~g%e~ehNB+PxDHe`(FtBx7mAS>YKn6PP;q~h
zwu4d)s6N~#&KyX@*37A%F|E3m?886>wtR*oV8Rh-Qbn?s0JgO>A<n@EN=O9b71&$v
z3(lci5O4=8e5-vP+!4Al;*LN$mY}nKG}|Jr##&pAIm;jcx@9;EAxaNpBH*cr`Ea-e
zo+QJ4ChNw|8l(o&%a<nxb2z2~O{N{6gCBMr(!d`zcdWxRbS`y7>LIQKtL%(~e3kCy
z$w1Is8H<MWMC0NXr@gWR2MBm7YOK?%E71p;I7<R%SIIHJ<%-69Ii2#%`f$gf$U*&A
z_ZDwaG8_+dgnhKjR`=g<r~PW3cIaNodvi0qJ^0?}4#z+`5a>@<DM&<)%5rp)qp=*B
zq@1L>_JF?BTOsBjQZFfZsVAvLBpL)&*kYNCW}%y9GAW<}c2rsXL&0RpnYP*+L}pA-
zh20{PMqBM)!Pd3XRw8Q)504&<wyr(HmQ6?nvx@ZA9u%aAL!c&B^+TEV3rt~)2I>4w
zh9&_8?BncxmBO^@7z)R9WJvrX;0hbX_Kwkz8VG!bj;XI)Q_K}vL=~JY)Zo@QLiM0<
z4KBi$xizHpnI+Ndabdob{ku=9ABAPOOt3HiPIw`K2)E)QJDc4aQk2%iYMvg;wlrxR
zkam0!P)rk;{2n0jb&wj$58uv%KLlK1<FbFhD((DZ9{DYK@E@D0J_G)g>&l;#eM|~1
z%QU10LJ>94G%5Rcx-zE=c9?0vZxVk%;L;Bg{{zRzv)u0=*TcTaRGR&}St;-0xSIX@
zS&9Fe*Z<6V{tg0Taldj`L2nCOu0ip~;5>XFa5?Xii=*QLXP0PPj_<Qa>T_Hgn}Cj?
z<2)15`Kad@oZcuGs?wjA10O4e40H}~%Mf<x95u}#CzEt88a)HK;ody>_wwLB13pxJ
zXe-@N_?|p?zVpxa=aK&?4}K~SJ_f5jx{s_(<4wLBJcU|;(;hww{EwwUujM$021QHr
z;LGyhfjszadGK!FL!IkJ=Qj@J&+q2J>C4AZ=fKg&fuZdDBo9vCU51i>ArJn?Ja{?}
z{&^mp_9YHg?pb;8(meP@d2rhPG?e`-^5C{S_&nf4ovYW%@qzb+>_pcKe6L%R5U00r
z`=goMH`0K9Od-y3J#CEtArH>1m}&ZAQDzNe5n{D9wqOC%!K~gj;rN;mvtrp}6(2wc
zF|L@-c47geODBL}8t*1mk%|TrNJaP=Y*t^VI2W(e3rWc_Yr?l0Qhc^=<!d`DR~wj0
zGY;!33uYZOlj9}3Ljg?gJ0h^6mlO#_T@f4~7je^)$x7>=fG<w7K)E<_$;A=Yj(R5P
z>jmSeIA-<WL_B1{qHlS!17*_*I>2?7pX?{ImM@R_R>>S%9m+J;WHLEeozC&I!n;^0
zf_Bo$d_1L7xuD2d8DS-}9Ony$`68*=;hMM5*4pfHwk&FQ(b<FEh`!p@7ZzBRVNK)3
z0d_7|WcjUFTw1G<<WA<YVi64@B<1>yoq1$VIN}P0+))%9HLk}{A*|wnT8H+xi&qz=
z($dK_uyWTY-v*f1il3-Rw@)6^H{ZFS(dMM%brv<ZyV`Ay&Stc8@G4&z;TGkYj7A$d
z2Q=!<7ftuZBQBh;=f$$|@BHLO*A3ORRCWEP+To+N>KZzra5;7HMD>gygGikRr$I$R
z0IO%X;>l3kszt80z@jRbIPy`D|L-S1x?G|=yZn(D`b>}>#@yo|hgh8tg7^m~*Wib+
z`5HSO#C*>J*RWrIjjgq%$v7<kx6?G-(}mX|5Au9%dG5B{7bdc~tV|Q7jh`cMN&Sr`
z4cx%F&pO8;9?AKnL};Bf5|`AM`{T9>U_ahB6sQcUr(7rC%PtR-5o669k@|8!+Py`3
z{z9GLgQp&(oTT)76YX7-yxhO`rqFK?da{kV^`{>ajvMr|`}0Tvp-Ly&@@4%6%zsIv
zy2<k8{=YV%FZcgRdGUIdENexI>Xg+N`xD#9sR*=gq9EHZ%a{IJf;`f1Bq0R3zwxBd
zH|}r5NBcAU?*>M7s=-A;t{W_>&eJb*#vs2bNGyN^=}R$r9=SZHyj=<kn(P1Da`ff8
zdhZxrf$xkq*pTg)&<3F|Zg=+gKY0%PMfiQh7;@|XjU0XXy{G*`U5om+G33_27n#(4
zS-$)}6cqaOd>KQo{&$dR)L&n%aq;y+Uz3g%Msn-_AaH7;>>rz(G%hL6m(TT=<RyI!
z(sZk(zFa47xtN>D_7|NS=s!|_D>A4~QeW<8-TD>vcR-hXC-voeT%N1HS+rj~PTBfM
zKkS7r`A_Q0^I4Zp1&C0tlWZ*OErH)6VbssAryFz$S&o#?)&G5tzWiQc5&BY3QYy<B
z`oR#^CBR&LSw2>vJmi*N3}s{ewPhNY(;7Ho8f6+nu0H+TMws+pgInX$4K_~bJnTV1
z8^qx^Ip|XnRXMoSe<TMdSqhR)7Wz6m5Wiy+X(=dJ=?^Js&8Od}ORN|DN6P1zV|0#u
zE=ID;nstp$Ihi>ubL7!KGEdikJ4YdhW$i*g*MGlr==$wB1BTR>l)g*Jw63AYRrF#(
zSkL2uDA#|XJo;OO{@fhnIV@WZAjieLup{(=6QN!_xKofm50~tJ^8K|Q`m*nlybR^y
blQ5m1Zwxss`(_^fZS-P9csIv*uKs@lq3fIJ

diff --git a/nccl/Makefile b/nccl/Makefile
deleted file mode 100644
index d4423b4..0000000
--- a/nccl/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
-# See the top-level LICENSE file for details.
-# 
-# SPDX-License-Identifier: MIT
-
-CC = cc
-
-# perlmutter flags
-INC = -I/global/common/software/nersc9/nccl/2.19.4/include
-CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
-LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
-
-all: allgather.x allreduce.x reduce_scatter.x
-
-allgather.x: ../allgather.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
-
-allreduce.x: ../allreduce.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
-
-reduce_scatter.x: ../reduce_scatter.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
-
-clean: 
-	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh
deleted file mode 100644
index e9fc3ae..0000000
--- a/nccl/all-gather/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh
deleted file mode 100644
index a94a523..0000000
--- a/nccl/all-gather/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh
deleted file mode 100644
index f1ecd9f..0000000
--- a/nccl/all-gather/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 64))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh
deleted file mode 100644
index 357da9e..0000000
--- a/nccl/all-gather/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 32))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh
deleted file mode 100644
index 4bd249d..0000000
--- a/nccl/all-gather/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 256))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt
deleted file mode 100644
index c84792c..0000000
--- a/nccl/all-gather/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 32
-Global data size: 4096
-Number of GPUs: 128
-Message size range: 262144 - 33554432
-Number of iterations: 10
-262144 0.002247 seconds
-524288 0.002277 seconds
-1048576 0.002775 seconds
-2097152 0.004497 seconds
-4194304 0.007477 seconds
-8388608 0.015057 seconds
-16777216 0.028550 seconds
-33554432 0.056270 seconds
diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt
deleted file mode 100644
index 73e83d9..0000000
--- a/nccl/all-gather/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 256
-Global data size: 4096
-Number of GPUs: 16
-Message size range: 2097152 - 268435456
-Number of iterations: 10
-2097152 0.000532 seconds
-4194304 0.000982 seconds
-8388608 0.001976 seconds
-16777216 0.003447 seconds
-33554432 0.006826 seconds
-67108864 0.013190 seconds
-134217728 0.026196 seconds
-268435456 0.052567 seconds
diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt
deleted file mode 100644
index 72f0d07..0000000
--- a/nccl/all-gather/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 64
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 262144 - 67108864
-Number of iterations: 10
-262144 0.000622 seconds
-524288 0.000577 seconds
-1048576 0.000780 seconds
-2097152 0.001190 seconds
-4194304 0.002041 seconds
-8388608 0.003571 seconds
-16777216 0.006995 seconds
-33554432 0.013830 seconds
-67108864 0.027698 seconds
diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt
deleted file mode 100644
index db7919c..0000000
--- a/nccl/all-gather/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 32
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 262144 - 33554432
-Number of iterations: 10
-262144 0.001077 seconds
-524288 0.001154 seconds
-1048576 0.001399 seconds
-2097152 0.002078 seconds
-4194304 0.003777 seconds
-8388608 0.007711 seconds
-16777216 0.014418 seconds
-33554432 0.028471 seconds
diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt
deleted file mode 100644
index 1c654f3..0000000
--- a/nccl/all-gather/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 256
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 2097152 - 268435456
-Number of iterations: 10
-2097152 0.000286 seconds
-4194304 0.000523 seconds
-8388608 0.000954 seconds
-16777216 0.001696 seconds
-33554432 0.003150 seconds
-67108864 0.006500 seconds
-134217728 0.012278 seconds
-268435456 0.024449 seconds
diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh
deleted file mode 100644
index 623f0c2..0000000
--- a/nccl/all-reduce/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh
deleted file mode 100644
index af689e9..0000000
--- a/nccl/all-reduce/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh
deleted file mode 100644
index b672e7c..0000000
--- a/nccl/all-reduce/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh
deleted file mode 100644
index fc0416c..0000000
--- a/nccl/all-reduce/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh
deleted file mode 100644
index d9c0ef6..0000000
--- a/nccl/all-reduce/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt
deleted file mode 100644
index c8bc5f3..0000000
--- a/nccl/all-reduce/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 0.002252 seconds
-67108864 0.003958 seconds
-134217728 0.005696 seconds
-268435456 0.008861 seconds
-536870912 0.016701 seconds
-1073741824 0.035052 seconds
-2147483648 0.069582 seconds
diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt
deleted file mode 100644
index 8199a8f..0000000
--- a/nccl/all-reduce/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 0.000971 seconds
-67108864 0.001813 seconds
-134217728 0.003415 seconds
-268435456 0.007049 seconds
-536870912 0.013323 seconds
-1073741824 0.026322 seconds
-2147483648 0.052252 seconds
diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt
deleted file mode 100644
index fa6e736..0000000
--- a/nccl/all-reduce/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 8388608 - 2147483648
-Number of iterations: 10
-8388608 0.000589 seconds
-16777216 0.001015 seconds
-33554432 0.001352 seconds
-67108864 0.002146 seconds
-134217728 0.003621 seconds
-268435456 0.006997 seconds
-536870912 0.013742 seconds
-1073741824 0.027021 seconds
-2147483648 0.054364 seconds
diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt
deleted file mode 100644
index a773bf1..0000000
--- a/nccl/all-reduce/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.001196 seconds
-33554432 0.001740 seconds
-67108864 0.002970 seconds
-134217728 0.004544 seconds
-268435456 0.008213 seconds
-536870912 0.017505 seconds
-1073741824 0.035188 seconds
-2147483648 0.069951 seconds
diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt
deleted file mode 100644
index 4d60f0f..0000000
--- a/nccl/all-reduce/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.000511 seconds
-33554432 0.000916 seconds
-67108864 0.001663 seconds
-134217728 0.003137 seconds
-268435456 0.006408 seconds
-536870912 0.012493 seconds
-1073741824 0.024300 seconds
-2147483648 0.048155 seconds
diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh
deleted file mode 100644
index 8590821..0000000
--- a/nccl/reduce-scatter/128_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 32
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh
deleted file mode 100644
index 7a20fa6..0000000
--- a/nccl/reduce-scatter/16_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 4
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh
deleted file mode 100644
index 3d297ff..0000000
--- a/nccl/reduce-scatter/32_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 8
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh
deleted file mode 100644
index 6bbf97a..0000000
--- a/nccl/reduce-scatter/64_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 16
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh
deleted file mode 100644
index 21c0dc4..0000000
--- a/nccl/reduce-scatter/8_gpu_run.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-
-#SBATCH -A m4641_g
-#SBATCH -C gpu
-#SBATCH -q regular
-#SBATCH -t 10:00
-#SBATCH -N 2
-#SBATCH --ntasks-per-node=4
-#SBATCH -c 32
-#SBATCH --gpus-per-task=1
-#SBATCH --gpu-bind=none
-
-export CUDA_DEVICE_MAX_CONNECTIONS=1
-NNODES=$SLURM_JOB_NUM_NODES
-GPUS=$(( NNODES * 4 ))
-export WORLD_SIZE=$GPUS
-export MASTER_ADDR=$(hostname)
-export MASTER_PORT=29500
-export CUDA_VISIBLE_DEVICES=3,2,1,0
-export NCCL_NET_GDR_LEVEL=PHB
-export NCCL_CROSS_NIC=1
-export NCCL_SOCKET_IFNAME=hsn
-export NCCL_NET="AWS Libfabric"
-export FI_CXI_RDZV_THRESHOLD=0
-export FI_CXI_RDZV_GET_MIN=0
-export FI_CXI_OFLOW_BUF_SIZE=1073741824
-export FI_CXI_OFLOW_BUF_COUNT=1
-
-MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
-MAX_MSG_SIZE=$((1048576 * 2048))
-
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt"
-
-echo $run_cmd
-eval $run_cmd
-set +x
diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt
deleted file mode 100644
index 7c1c8f9..0000000
--- a/nccl/reduce-scatter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 0.028300 seconds
-67108864 0.028351 seconds
-134217728 0.028351 seconds
-268435456 0.028502 seconds
-536870912 0.028579 seconds
-1073741824 0.028650 seconds
-2147483648 0.028506 seconds
diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt
deleted file mode 100644
index 14acf87..0000000
--- a/nccl/reduce-scatter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 0.033170 seconds
-67108864 0.033280 seconds
-134217728 0.033220 seconds
-268435456 0.033291 seconds
-536870912 0.033217 seconds
-1073741824 0.033158 seconds
-2147483648 0.033275 seconds
diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt
deleted file mode 100644
index 7eecc67..0000000
--- a/nccl/reduce-scatter/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 8388608 - 2147483648
-Number of iterations: 10
-8388608 0.027121 seconds
-16777216 0.027661 seconds
-33554432 0.027766 seconds
-67108864 0.027992 seconds
-134217728 0.027914 seconds
-268435456 0.027912 seconds
-536870912 0.027777 seconds
-1073741824 0.027861 seconds
-2147483648 0.027551 seconds
diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt
deleted file mode 100644
index 8f8ddd0..0000000
--- a/nccl/reduce-scatter/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.028306 seconds
-33554432 0.028511 seconds
-67108864 0.028175 seconds
-134217728 0.027998 seconds
-268435456 0.027883 seconds
-536870912 0.027802 seconds
-1073741824 0.027954 seconds
-2147483648 0.028085 seconds
diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt
deleted file mode 100644
index 26c22b6..0000000
--- a/nccl/reduce-scatter/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.024231 seconds
-33554432 0.024389 seconds
-67108864 0.024167 seconds
-134217728 0.024047 seconds
-268435456 0.024293 seconds
-536870912 0.024031 seconds
-1073741824 0.024048 seconds
-2147483648 0.024241 seconds
diff --git a/rccl/Makefile b/rccl/Makefile
deleted file mode 100644
index aa0a7b9..0000000
--- a/rccl/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
-# See the top-level LICENSE file for details.
-# 
-# SPDX-License-Identifier: MIT
-
-CC = cc
-
-# frontier flags
-INC = -I${ROCM_PATH}/include
-CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
-LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
-
-all: allgather.x allreduce.x reduce_scatter.x
-
-allgather.x: ../allgather.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu
-
-allreduce.x: ../allreduce.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu
-
-reduce_scatter.x: ../reduce_scatter.cu
-	${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu
-
-clean: 
-	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/rccl/all-gather/allgather.x b/rccl/all-gather/allgather.x
deleted file mode 100755
index fc85917cfaeee3d0d9962cb061dab34a2359729d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 25736
zcmeHP3v`s#oxk5q5@I0vCcFe6^#cP$C6fms2IP^+119bSBP2pq947NfMkh1FJP_RS
zGGIFel-eHGcHPRlZp(4)S?addQVT>B(Czl%qsUfQsTMmSf)o_ku9E%z?|1Lyn<3M(
z-E;Pw?a77j{_p>H|NsAe-|yaUa_{;k#{!e0FzHNe9wTngWQ{|0+-EHRE)f8!W3{XR
z{!~`N^1$bCOpmW)8V^WXVbXC)^L3V)K}jzx;=?A5$4Gh&=NOYz=8!1q4LVH(b-Hu3
z&PvKGI!}7Jcy@{SU7~!F)(JUDje1nZ)01^Or|UHykhJbgI(}48sYk^kz5MI-`1}Tq
z2PEAg$}Op^m&M0p9x7k9^o|KVOQCL8)=R6<W5qhoiZwkTs6)V#$~wFqdX&%qsUv-Q
zwu=00740XfCJag{(<=}7J8G)S1D>*gKNw$Kwz{^atftx;4Oy$WJ!%Fj)51mV+=3at
zNP_d(Jkk+On<g>_oR)^@%w!OHGF>fzq40%Sa3}Dg()DM-S7pK1Wx>Ce1^-SK{Mjsc
zZx;OZEckD;;AgVnW;DQ1^-z!nACm>2lm)NJg3rr>JF?(QvfwMS;P+<1S7yOiXTjHI
z!4p~VZ)U-FWx-#_f|mm?W>>MDE)6Y*`H1_BH3@v5OXG;d9|vw>6Iq48c_h3iIerzZ
z%fSCh;D<7B8np`kk#A1>N2rX+4C(UZ%YV8A%?S$IhqN^0MV})S_V0oR#3Ix3REBh3
z2SzS_mH!IL!-s+=LgSj~Rd_J2Hi9W2tn}Kg*VLZba!q~HuRcDya@jRM`|#1PFsBnU
zmD?GOxgs%Vx62;{(%Bsf3d9LE;_dWDW8R3<7jboao&KON1k}CS<@EW3u7LjmFIbOv
zD6uYoxW(J;4zFPdE%C-0y{r6gFEI-uUQSzFfk4Pj$U+(25Q+z50%?jwLXjo0h(Fj#
zh_1h6P0-yH2?Z(TpxYgQW_!@TGVX2mNZzPNBDO$ap)1zqjc|d6P<MB8&>vgu3f{+}
zu}CZwU@fi9PQrMA1e_6yC75$jN-Y}mgyJ#g3nN1@AM4gL57ZlpFyH+Vf6U7@7RZVr
z^?J7}8e>Am=L^K6UCc+t=aH^R#E)!IcxlYv?PcE8$b{4Dam8Gy*N$jZZwf#`4`nv!
z?hflIhhq$l=j!&LX7rcKOsXVjHC{5*s`R?4VGAA2^$pG{Yn62-D-VTZ<?e`UO?lW`
z?sj|2D{HJ3)(RfPE6Pc)EPu!B^635LK2&fFIV+EbUGDqJyunq`@(x$jn=(-r4S1uK
zGs_<E24|3gvdWp<md6!|cDaIO5ws%8Q#Qj|ZLKJa)p!F{Ggn9M@AP%fytig%#1rX?
zL}#qLH(VXMHyRD=>1NMfnLFSLc9sujQUuH5!TW-t`-9>=+Zl|{PUY~@LSDWVs=x?3
z4%J~LF``(+R}lX|DyBQ^cT$~lfY&@Bow0zkE9y$gd$=hcy*k|O?M_9kjJrJD=$k2^
zJLHL@YPmT|$>&c6DH$otbLC4##Zw`7G#=I&#&$Mo82S1dO-vD--(|o}9MQcO3^=Vx
zq}yk}b0i3Q$bi$jQ#v^h6J}gz$a$FX>!mPg?Fx-^Bj6j;tux@pbnOP*m~N{9H?H@V
z8E{HNx)laoz8fg6%Yb8oO}VfEFG!(`tv28z4fqBFKFWZ9&47y~E|1-Az>5s>j~j5j
z08(y;0jKwcbUO{WTq;xCE(2~{$Gl*`X)P(;J_9~hf}n>C_&5W8*np2W;71Mkl?MEn
z0iR&NlLmaE0Y7cP>HQ|%1p|Jy1c|yd@)@<c++z`+@6w2H{8}E;J&Q}@0g{(n9Ren}
zG$LI7Iv&zJ&86`G$?GA`vQn4M5k5(W(tLXN59JgA4h(EqZ+xV*92*1c88bd|Y{;=d
zc8pItr=-?Lvd_!0LiyF*2Ba|f3=AMW!?P&f-QkUBA)gkGgxua})Em{h<I$Lg*~#6d
z1yj)rTic`70@eZ%SEpB_x2T2}Yo~X%HYMQE%E&KZ3tawy*E3tgTTsKRG!)UML}yR&
zXwwiot$;0RXmF(AQ=$OV(ttVGX%4Q!g2bcwgW-5g3%h8+f|U#9z-leW+q#o40<7*h
zqtvHZYkXSN>kb7yFxXBxgq^512l;n)hU3wB+MI5G(AgdBbW+ZVb*(nCc;QD}F@Go+
zomapX20|UKfJSd_UQk}d0_F(0hd}Mkt)fVJSt(&-p$0AMMQgD+9Je(%TUu<b&KBD;
z=UgU`w#CgY&NlnvrX}{<9F4#m7TdlI5s0_8Id5CAU@jU|x!QaS?TM+q_h3Tk0__L=
z3g``(2%Z4l0Qw@R1q1pp=<hHvO3;bsUB_4>=pN9m)T!X)T%bI#m?^6*%GIMr<ZnkD
z%|SHI7QuEI3D%FW5MP0(6WBb$Y(<ubObvy3YoG*{;#=`-M*L#XRJ^H$3y{q)o);0n
zNel#%--M?Y8F`zGz-E2Xg!&?_zNoamsKQoMyRc}3xo5=IyluG;=R9J5$YeI<71i2`
zD!2f|AZ)P}<u^bK`f!wwqj-LXKpmA+=F@zOLK$fUbOFy9)JrEQYb>%nVuGQE%}qtx
zwj3DVnrknrcqq?aRNFJ6rKm2#6SPr|M1u82`MmDP{*9Pm<hUhsMz=l9`L<V{Y)-s>
zk3I2$z2~>5TiYDla+7#%*tZqBIGH;gWf&N@2d~;s+wF<t>hrm`qN>&B_t`h^i<chA
zy#<_N-;(<*Bml+6+qccM1G8_NTX||=Ao(uL^efw!?-$yELG3vQa+eG3RYF^ZrRU~A
zJ8>}i9A#ijt~(XAZSMOgVeuJh@fYOmJr_;tgKJErbK3rT61*ikqVm<g>p9eG@A=II
zpBl=I^$!ey7EagcxE!5c`|gQ>fyy`3(kX5LjcVz&?*Q;Ms-Y=WB+;aP@`ex2s$NIE
zf-#?6UH4v-x^&-JhkD}ypBnBzMbIgA>B+Np^+wJdQJ0=0<{YB`z>$mU(g8v)lI$;`
zevZ`7HL7D@QOBN#Sl^{q4y1G%)o}8Z&w*5+d6G0wHLBweK>IVBI`#{M&)d|}b86eC
zAW60DW1qT;9u)YKPe4BfJ>P^VpG}SY$^xN_J~i^Tg^D@%CMvH@eTT~W&L@z%NO0k&
zeZ2G+{}bHB0jOMn5IA^WkSSDEVI5SP)V4lot~vp^{*#b9Wh2B!h}sXP^fZb73g&F;
z9lxiPzJTcar%3)2M4#huBf*5)QfS**%6g;PhH8BOGm3y{X%go9seFid1>pN9DJF^X
z9Jj#x2;P0C&Z_IDFSYl)l#jT#-X-rX@_s?y%l28XtD9$D92l_AIuvW5mPEt!`qXtV
zC61gwa{d6lo#KVQ&2PlUZ+wkjZ~X=8?hh;*-=No`PYu3AqP-_9zQowu>;t|1FYGt=
zk9ONq+a0!cTiX)*#*gh7jP}IY#ET^_qU+cbKPuS=+B1-=ZvH;Tm3_-!=U1NFP0PLH
zx3IWtPP`1rBB6~OItXZjgn}G807#S2U4U#opPSUJ<uKW_c|d)5Di54fAHJ3Xv(Bqu
z9ZRVt_HTYiefSCrZWB4$rspU*6-Mk}wjwKg@iZn&1VZ1#bYwD+##v975Tb4#0W%I1
zY!|qe#AnUD$F&ysIWqhLkYqawwPh2x_wDmKmUug<0&PqjZ&v^LRL_e%lEtg-Tjn}v
zld&`Q-V<Z@e8r-%%6-X~A&QtL#Ml!jc#Kz%c?L0##QEfNpv{ScjzrSlo1CO>N@Bn}
z_#DvOd$J@>bHF#!=Ya6<(5KP2OLaPq>Gay8ABj1@34rE+w*jDkhNf6ag3keex#OS8
zI>h)t&P)Es$3Ijrcl^WL|K0dUdH&P{P>l(oKV;uh_+K}pga1y|*dAK;@21ytvJ)?U
z`XI>qW>F8?GWQ6C>|4g{Dbc&&v{d{=9$)x88EGXW<tZbRA(SjjNq$0e0cM=W|HX`B
z6I0GlXq;_FN{+<2<gc;B-n>uU{Hz$*j{~<SP9<M5;#;KrMU1nb@<K*L{L|1&jmAQ<
zd<Yhj`w=afJsW`RE>xJK@>Ls+@5ZDD3wla*9kjRnh))VzzQu>MYS~m$ixkx2#*#2G
zrGi;O%ml%-5|b~OIv;VAECO3gjv{bY4SlO5sRy1YxxfSK?=Gpp)ZSL2`P5=Z38wnu
zoD$084n!yGMR{nO?{GiMXQouS%J;sB)@e+9gsJ7H5Zt=~tRry>wE^V&gzQAyCf)=0
zZE%$X$)`YCW_`A9nIkcf{6{c$QHjZA*bGtGf5;pE#S*IVWP<>I0x(%aCidAkzD#d`
zyKHycmfP;JIhXJEZEsPq;5yYj>*V^KG;_V4w9%xmw~ieZNwyrAIjKmF#0Nj6Zv;)1
zuO{1s`6pm7c_o<3QMyhjb&}EtSQ1{2(h0~>>8eQS2bZbzW1;i~&-Q1@_+{#?6?(5g
zFIfksc_U10U@y=<Ky5i{OT4GQeX3EE<a~H%)Z$INZ8yKOuGY8xa!NFTClYrg-$&Q5
z4g50xnr`zL@MJ$c*-g~V%P}pSQSa7MIHTV9uV>VIdiRb-*Fb04r6ei8eYFt60`-Tm
zws#K=dK2Pzzl|LCj|cwkbi5b1J<*GfMdjOzT4l+H5Q5}<l6(}(x`l_y!dt+nrQ?qf
z{vvQS9j6(f|7pAX(B3Cgxp<neZ$frVdM>^Xye~e$GkyqN)R62+in>#e`aycsDT<np
zC|bSSdp@p6to_)Yh##{j+K(P=`W&(BU=s~akWY1Tnuxt;?NO#~dIQ0}pAf&L>8O1R
zwhgp%=(*4k^Y>h6Q8!Zode(l*;x`I`-Niumyo4H|9Y7dGz%HW@wyD;7nm$+h-92kh
zv-ol;e<jJkCFJXL`MQHmr&IDXde&ZG@d_!=?SCxfExP;|6d@&_x4l1SJ2ou#o~EPu
zJ!?NVt+xnW>~7S}Gf)@Y;!Ql?>K>%e_I?gD`39<vXU;;Pwinny^5+1zZ+~?^->Z<9
zw*3M_%f;rz+q7bIY@703?3CN>+scS8u_eA}PMmYxKoy^F@BPA@Tmr-P%_n18<vVgc
z;z;}fFM-CyC$@pH@7jCzEB0Cc5kFz?c{%^C<#OJ^0yVXN$xQ$3erhe)-;(<qj5*rE
znFbD%{(c?jAu6Tr24+^_6*n-Y8oqj69YVx+QDomSD)N*YiKqJPJIwjX#G@Km-#*S?
zMLcSr_3h&PXyP#*Sl<rL7Z8tu&ib}<-b6g+Q`WbE^M9~{r_V`!Va}f+KA-p%oIgQ4
zO`CnKod1A$tQA>b9p`^dJl3+TuY&V$5I>4|jq|S&k3A;qvvB?;;){r9oZn5nO8jZe
z0|Qf@Bi=&%G0s0jJoU4_!<>JLc(f<V&-w2VPoF~jc5(hu;%U0;+rjy-6HlK$`?hob
zLE`C?XWs_SKR`V7`@S&eBg9W2eg)_6C7wQQ_O)`pgLqoo^wn|xPU7iPW?u#87ZZ;!
z6DU9DZzUdIB2a$L*AxFG;u-k<$r^(AwsXBse`nPxRR}p+=8zMF|K@1qucDTRlkqgw
z8Tp$tcv}X)B!lnF;3FBl{01Y_`$k6m-)Hc&iZbTs=NbIL4F2^Do_<p$XZ&_eJat%d
z7YZ~^sl$@nsS_rD3!sK1M-5DF$4HG+8bIV|mqSh@%d{05=cxmclj+caA-Bt<aY`MY
zoU~5^jNF6~8mBZs$h{{Fe|B*oM0_$&Bf@kafd)M^2|;qQ9P(Sb{JS3ctz5=a2PAi)
zUgMNH961?J1CZRy6&j~B0Li^8_|pS=ewK;+peyqH?G!vlBIjj2>Dt5oa19LCz;F!=
z*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7
za1H!<HIOfUhpQ03SM3s1o>$MbnpdfDXuP0P1+5WufuMH?>JxO8pbrW9n4sSi^m#!K
z2zo@&j|4p@XkL{_U(l(7)(E;l&^rY63A#$qhXj30(C-QQyr2gJJtF8wf;QlwnZ}RY
zX{&I40tXy4{ivX(mCkf|XI53z&hpHf;i_;|RaST0?49BBd1m@tuA174>Wb=_wKd+E
zwKw2Tqx`}9yph=&H|KCP3L|V`L&I#X6o=n$&?+meGprR_RYg@LRBN?TF54V*TW=6z
zOWKz-X|NPueS>zh6^9JbBGd8czai?sOv?KWZsDzUsYkzKnphKj`AcTH%p4xSDHSh&
z-=seXASZv{q(6Wd%IW#}9-?&nvVT1~lrS(oy?amuUH*vZhw}H{V;Ov=85;mmeud&>
z`Q`8a&m+=<=f{XuHkckkP?2|CNEt?zmBWuG<tv{-V$|OXtvkv3BzVe|v*2H0-d?1E
z;^*v~TpWC5IaS2a9~+oh8~l7_BLQ$zg#|s`A0hP|<^!Iu)WIJ`f51c8%B-Fb5oyZL
zH6zNr9G>ZDBaobG4v+W{NUokTgho-kCFd%Ty!#+P&j^;EzY_F{hu{_b6?r^F9-_8#
zpW4Et<Wct-IS&n~l&+xQQ!u35$b*L|IGqPAh*GQ+%sUK^9!lg0=%|rTp%_ZVD+rI=
zPr*tG@}I&L=N{2=vZp9d6eUTfVV;QSge*DbiD!YU^C`^+$}&0iG0sxH9mKxHRf&BZ
zkA+%Hxnn;3-{L7=DNIfQL+-nf9_?q!QmVKSV8(0{5@VHi(^@uGX)~>Zf4gZt1jb2q
z$_E=i6CORv&7x)30Y=NhuLInOj1BfErC71#6)U3^bJ5jBQ;No*-<qt<Tr__afn=pv
z0X&kTE%+k|Q#B%IkGy4M^O*S)P5A2ua|s(ca^6TYx_{18);xx*FiT#0-k?RJ5ttpp
z4GFy>m>8e3nKF_)Xf{t#n#U+GJYo`z=c`DMlDT4>GHoKk1-U%gk;a_O!QV%W8po!A
z6k6H*iF%%=O{CQ8Cj3o3@FJ@+1WpFk+*E!{D6(bHFtNoWQ6-~Ckl<*<<m;6|F=G-B
zDu~0Os;@tYbAc1SQIGl=2apvjl@K3KIhivtSfuA>$eOwXsEMDrQk4HHDqsp63F6dS
zbWPOb4M)X!X@(=@IO=a*qW@{n6(%`zm}e=Unk_fY*KWCW(xgb#Y+30w;Wh&0dYCaY
z`a>hKT$r*9p80WJ-s(maWB1@O@oD)_X22bZ`3KXNGp)IoDb11D(@nTaL19;z<cw>6
zlPRVgVBbLsX0{CeB^fD?M<e3Kl}@)?o(zx1qh9@3c=^rc{-8S$_jnoe27G0M`Nq+G
zkJo4IlC>cRsd+zB%%cyPgE8}Hjr%ckaE^KOtqbpFrX2He@(w;~DxGL9u<{)9Df6<+
z9~>;docYbu6g#hQlwHpL=9>g2ZfU^DZn|^9k9z^A2u|D`;9A4)1#mXshHC;mIID<4
zdzib+6=7V4@p}vKrqvoMEA`i>R@NPh2fcGUy+Lop?*{9NxVz@+SJY@_o&5i8RazMg
z&c%Ux=#}|EYiFV2aGi}8Y`1=%vn=2ZcE-BqRsf0z@An5iWieL=4zGiEMa4A@p?DOh
z^wT5J32~_a78ud4GMpfddEIn~k{gb6ddqN9vka#)Ar*{~tq87!K;b%()mRsJU0o=K
zBFHIjD?rA5VLE<FIiUP-o4C0k7Fjd5vSxZ^HQ9%OG8{1thQI_v(4>r{ECFn7Zh)VG
z52g?cM9XkO-y4`qP2hJ0%Dk(+ZnOwBjA#)kM`Cohk7iq<RavVmv1UmVpsO5kqe7Sl
zW6bX^i+FJf21b&hoyoefvj(Vu^z!9_bPktNpvZK@HGQz-ss}#QT#-)q(6Q7JsfRdY
zth_7K?Jak8#QgzJc_iGe2kP6K9gXFkxQ4)8R%M-0S&lx)#Eld%yGqUhPG>mc&FGYu
z=7(1dl1%HrdTsF%#e-3QXV6Orc=h@l?zETJX@~BWyf-(Yw$s~2uW(GHgO2`WmYhW7
ztSo0IIUCEFNy<s8YY*y6y?4a=L+TwDywsDFmRscNxKHnvb{Pz(pye_c7f=p6tF%8U
zm=w9xR$HBjj0mc*dt}gPtL-vu-7am#QnoO7bZ4}6`z5w)LMo6_q_=jOlOlG3npnk?
zGVBwW!j`1z{85G$0XgjVsdkmZwCflO$8uyyd@^u_jbg3E8bX7Puh6mdm3xYLB8sSj
z8;I&$8b>G}<QCx}e1%IxXt2G%j4;+aO#WRe<&WG7JSNzef5*I-K)8GHkewYa4Iv8a
zZZ!{&PW}CA$_@k+(|9KL{}Nvh`Jw88b__$|k8%50lr?AwVOhz2eRV#R{68RmD4hNt
zS}_gQr`#<(3wZ^LLdyzGSk?(&&6y^q{*G8iHoy)u4c42)?-02BJ6?&?p%-Qv%QC-r
zw4r`XrlQo}^-B3WIj*MmzY<@|^M7eRUxUDCv{&XX=+_dTtZ@@%+-K}J0+;J9xjFj1
zz%5fXF3HdMBXkar#>S&#=s1srcO~*!jN2OJMpf45RN$kfkb%<4H_H%q=)N$`ASZ*}
zz-bI*x}Gffqgn80fDe@)+9?l(_h!Lg&4Ry~1wWnzKc5AsP4-avxh4x<o&}$q1z*JR
zLbl<0jaTU=@Z?rzkzbt!-<Sob4<1A1|L=egbx+*WS>&J1g8z%Kv(Kebwo~)K?kw`X
zS@2(G!H;Ia-_L@7lm+k4f}hWVD;U>9)mt9$q3+8o<oMt`&npAoM1dc2X%gb}OWb}T
zljl+D(ErHI;J6+(`nP1knH4KUZzRmDL2MqZw)$plJUW@xvnCi_)6J~du2{u~%QVIr
z(b+C+Jap+85G=o4q$)z;Kn$S}zk$u_?GpFib$K8uIcAM{S3`=g(ye??X60&wOJK%f
zeMi8oX)7?Eva8#VMR{ikR`irY-C<`4_rrx;wAHcFzQym2(&|laYMgRYgFT-fN&0%i
zxG0WU-MF_7QLyOkh<74wx@iZv&hnf4WK>5-#Jfty(5_B~v6qs;balF?&x+c`4iB`8
z?#<&Noyv_s#*PL%kPh4*804FqCcAUNVp~g-)6u-7%}I9)dP4dxRo@U`H-){32iMm*
zouQ6<u_?seYkHI}D>lC%LQ?LlSlZn_!H~1N+Z9HdVdEAGdBMgDsCDS7cJiDeCED#c
z2RGaLdf5Q;R_B)`Qp?Hv`4&2EtG7Am#+@ZiZO%4Zy`u?5Oy8IbBiy3AZqaBX<L*VC
zklgZnq9G?P*7IPe_ZNRDqw~(nYAU?mBN|c5t(8@Dw_yj3k62|*fUZLP|8sAn(<%C-
z(-(@M;R7_7P=d6}9EaEyr$O}9!0AjML`{ntF_5x#Xs*%t<wdrZ<_6=m`e%n}x~2!O
z5ANhW*7AO9d0tFJb6FWCO6TWB;F0=UO&Ykt`;~Q$L)??=IEm2yWF#J`FVD^G62MNp
zFUV0Elux-n#}`%Zu7i)gYDntKb85@-^my8zkP~K7=}7tMh@&%Yl9%V&b_)GEQihXd
z%*;Q1z&~fum*@79LcfA*52i2kFJS%)7UfN*FV6)|pakLMxj>!M>p+Hyw&0;WNqu?F
z@q##5xRZq7Wcg+Kvi_DLj`ZtE2u_}R+(C^3XFT_akL8!tzY7?(OBEh+a=pGC-}A`@
zWyW+CYmRj12W#wqOzuaEbvZ}6IM1sQx<5l-?q^$t{xULdaI*Xo+K2#&h~`TD?k4YB
zA1%ru<umjD)eL?4o#kas*P#Awbea8U8zPPA%kMJpYPuGUFQd!U|0W`h`lrSBnA4ij
z`V;m4IK-%kvVX|^y1d6dv%VxR>C@0v4f=9_dSNm^I4P%dgS^CdA%f~urZ4y5?0Q{8
z%4O5viwJ5vsW11z^8WfAqWoewrSc=&@ep*W{-nOVUv;PW39b}zMwgkNe?z2EKedmo
z)3sze(nhBKn;H7@JAp;$OFc;`Eu-rPLs*XhGxcTqg(5yPeYy(9n15}B#wE2nUBN<;
z2A8Q%KXVZ#>#xqGap?*hCv;zRn$yxa{1yg%N}?hIm-^q$z)6;zq}K_39Ub%!+eBCj
z3Rc#Kl(c5kuh%6ui2ftxGt9BsS@c_*bPc*g(daT*Mi<p&VkCcTfv)v_Mq~!d=sOQ#
znf15VuIsmDOc+vMQm4?@r3ZbDUMz4MxIgG+)*pRGk+ue<@E#HRbkDufWw2~DfQ%~U
z;>W~``?Q#tX*gbuNA^GY{@MV2+4qd;9B$V+`fg)%87%vH7X1!-F~U8SVLVg+zX7OP
BLl^)6

diff --git a/rccl/all-reduce/allreduce.x b/rccl/all-reduce/allreduce.x
deleted file mode 100755
index a21c76bfb1f099fdcb0b0fde0a1a598eac47e83d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 25840
zcmeHP3v^V~x!z|c2{DkI2`@pk93&t<GMNN|0eNKdfJvKRBtfjNVKR?oWHK|(1HpUc
zVZb`&A=kFns<-8Oz1Hh;^|EfQwOkh@qCBjvXsy!PR(#Qk5k#OstC0Ks`|LeAGh}*k
zb*-*-yRzY&fB)b9|9|iOpS}0l`(*ZBQ{TA2q$o@V6Pw3~J9oLpp=y3-EPRWIfK;<8
zR)G6dR>JbY=Wt9<uVxyLNNO|bxTN_y%gmsp7Z&MZlg3jdy_9o|Nh(W7l=KFICZak$
zHd1FLWfq+$y<A)pKPKuY=>Z`psZo#W*gaXd(_O3ah@>ko)A9VT=v3-awMg%v&^svf
zB;6+JEvfX&;*Vk;s$aJJ4hg;L5qf^oFU>;FR;=T;VvR>6ZPj5Wsr2Cu(4%sG*+BYq
zZ4%|#B%Yt7nlLD-%&#oqZ<{%zEZ`{(_=EA4r7Npumd>1EjfSih+#Wp!s?)+nOSlCy
z?jj4$XS<P(c=Tx^qrhoth|XLNLRV&}0x%f9FbnPkK3Kl~Ecl8n`06b9{aNrwvf$5T
z!F#gcuV=wO%z_`!f}8OG2J=Hf7JOtDd_ord$}IT2Eck*fcxx8?mMnNj7Ce##UzG)4
zn+4yL1%D_D{%98b`7C&M7W`V^#q1LHluN@CWlWAgO#<(BX#}V6ZNM#TJgahP1gF9e
zIerPNc4;Y$B0m=RK9@$UA()tge{?se{UcOQ(Iupm#ATWZy7X@V--o<3<e-lo3VToR
zlxgIOJe47z-vFZ!zr=qLjVOJBCo&lq(MxclzpVpPK-jd)HeFhEeB-4x^}qh);GH*J
z`ad5$_|MGg#0chgMq{o>%-QMk2Z6MAhJpfdf{l3F{n3~=;`BvaonEIu=nDaLuXH(m
z{-7)1zuODe;~h+_!yj()cDlo>7-B8nSe<u;-|Z!4LBz{xlPeGixd~Y)lWRlqU`!zO
zkw_@g5{vkQ?S$z1Evtg=jz}m-IS1YD05q2b{ddN_4IarG^+?1X2rTw`;>er0+E8a_
zL(m^v><TVt(O4uF3J4NlP0bBXO5+g{bVf*uV9rG;*J#WWipQ8QjDp2{tW&Q9P;Vr{
ze0N3sF)!CxARR>NHEvfl#)OK`7l=nYxKNEN67hQ@JiavM@ANVsImf#ag>ZU3u9yq{
zZHq?rM*=A5q0$E3onbxaaE#$0x;j1ZkpA+iBxgBi;3Y&)mtIFR*}}$#np$UtwZdA-
z%0l5-nLFZIRTlP^x!vBf@|jkf)yAWEg*oY!=5L!_7QL&?2S>+Hys~K6<z8Ou4X%im
zwYj3+l!?-4z#A>EEWO(soK6Nx%PYApk1G=Ga0N>vc$%nG>2&K1tF1IP(;KL$Tp78m
z-Pc}u$IQxzC(;p#PQUYx@QlzM(P&uDH+%6)-2qpyy=<V6B3c>`E)Rz83X1n|doVsb
zRl@TtdEr*bfe|zr@?kkKqFTh;h`&3P(i!$U$)_CPo+qR|7I1b%T`74FH^q}zhC98T
zsf0V@E>EW`){z1_L!LNX%gs?vK7T4o*+^MlDqkuoo{G7n@vzP?wsL`n5kQ~AiJ^k?
zYYn(@UUI(yH*-n-w$*^kIS{2hY`|$=DuW!438OJchHjU}qlB06n0~8Trt!!y+>QBG
z8*pR34g+q?x7mOj=YuyHaLPl5Wd>ZnJ1DKgfMcLdg|GoHNTKkv0UvI_*BWp!)#a)8
z8}LGd{8j^AWWXOb;CK<F!Zrggrv5zbDFZI1(j4Dzz>V{q=M6Z`HD%~F;G-o7y3c@*
zG2jOb_*ese(12fTzz-SlaRxkTz{eZ#V+Newb26MU;1eWB)TPm#(KDB8FyixF8WE0*
zr4Q#VE{#)?mrEf6#<?^iTz(>t={L=#@d(N5G0w7SE}bKMk`D2^&82ZlxN&|?Gl{{1
zBEbIswQG!*l$L#^e+^^COZFAnH^`3hO6Qc+{7Ciz*>|X%dRU7b2Cn{o<Y%}R#XH-)
z5iR7?!jX{M8;yFST4y{O(=b}OJG5Xbd13RCsI`DKLB!SW)#$CN;YHi-ovlp?c(hWw
z7qA5`f57XRt>N{l;nf<7Xj7uIr+Bn0ka|S{TU1-yn1)Y@0!+&T=3u8GxB?RukLC}C
z<1sDlqKOP<G*kkswG40X2%j8S-El^_PqEJQX;H5`6!gI05-K6=M7236zq36YkIvKP
zboztN&S<-nN=~e6rIE!1AmWPoL&5000=6&^YI6lNdXw{t@+uav#*lju)X~r^s-)MI
zGB#Ffz_MPoCVOM!^|j8XCVR89$$pb_E)z)W;)W(?t7CC}i{tvnI^ea7?O%ro#G6~4
z*DqKw7Y|gKV7`vl&@{IF5(7jS^mEYlptCSAJOR2D^d(RY9sD5ipr3%QM#q?g#`MdH
zjNJ;lNP|2zE(AG7D0eSr%1VneVZ^Zftw^J>iTdA-$fJ}5Ylc~fx8Vu_n@5<v$a0^l
zwlHrMl)zGYGp@}@UksW`H#KnqvKhwp5|vwY2*Rii)hI{^ltLq}hjBdy>?j#VJ~c(-
zYKpX)qG>fnHhWRk!lJe2u3?+<Hs@~1dBA+1$^5~vqAGiljSD~w!WMf`z8zwExsot5
z8v{o*)mxU^e4RoCs{@n|{0{i39dTAyWO={@BU{Y%McU>Z7~PcXD6-v`=P0V`8rD=)
z9pRbTsWc+9nxcH}%VyYl9CmWZjv;JK3%;%8M;a2Z-|k4f@96sQSaWOR=G+dvJ{+41
z-^1I*v3c%eD0lyuMUxpjxx|t9NPRB104`Ub>vnADj!)Z@`#Bng;@Frw2NHl{V;!3-
z^C1D%E06Z~CvSzdK4t4|yM^{O1KM-;<X#4jwDW|vig;uWv=e)i6;zImxhF8%kzo}U
z-#!A1S4oSnQgC#gGpYBk!c%l4jyYaWg11D6mGAGJ%%L7f*L!DtYADy!*WV9Xc&Sdu
zocx&cm%R#F{)RejN(&O|)M=OfoD%BP(3HzbvR?i44IhG4ok*>NF`q+S{fByWY4?dn
z^{PESHQaZUprh*2BPSf{Rh)TCU3!X`Q%L>`N6x8B`w2NmvcH7-DN;XGr;dJE9eo;N
zz2{rmlhUbE!^xw*M&ts`Bcyq>P93`k+JCaEqt7CK+OAGJrM8{~Nvf@%_|z42p~9bj
z3VIUsbUl)Mb~W;A3xv-3)X3Wws^;7YRA0OL4%PLYPa$=V;KC1kdF{{r8Qi&ksGNZi
zINZ*XDY&Z82Gx4CwHKNzK89T15y&026JjSs?E^FI7>T|Nb9VK{&nTy}5PkP3$$yIE
zQyi`%m{5BPZ9PFnuTxv$#&`cj2@su@g!w+I9}->$`0f!(NuoX<S#Udy+w!9))HT;E
zb#%R$kF+;`N4GcW_A1?8a?E;N-B5Y1zuz%yU#ym%BpytUPhI_D;;qwfo!&!lt9YSr
z!yB=&>t3PPU0;E^<9*AzH|RC#Q-d#(XwSzMUt;w2j{cs$=XV?1N7rn&+F#z?ceOfk
zEwvYQ$9tA_uM1VmrKhJCsv{OS`nw(O0+ivN+3HV6EbMs?B@gaLbb%i2IicRf<?V@m
z_R|OKr@MK<{>!&vf9&~n`+32h*}Zj%eX0FM`x1L=i(}m<4s`m4#OcHfCC?&jN8;HM
zD6y{oTy?`Y(2pB7uE{SywS%Ug$tN(`Y)HHe$ReSC1LR2T1~g7WTe#F7K$?V>1G0CW
zHL06am~?D-N8OT#NYjQ>>Xx(UbB(i3sNXn=NJC<8{f2&ZOOiU`W>KWgdXbX#loyyy
zD9bKfb;(+RJd;A~K<Z{aQ$mQk;jl2X9b9q+>acM=*Z$t=7fJ|Ayq&ZG?RtTuEI!?_
zaqiw(C|KfTL(eCpcRpj$SYzV-^6uoHA%?`Kk+}0aDf9!NjfvCAKG255OAUNn`H(8a
z#})AUxU!kbkw}dzBmRhfg8uT-VNS<9rqjz}pnP2U91$8<qLe`6$}1G1aYdbo0skLv
zJo))YsNVVG$+rovrx7Ikcyb6b7dW28A^P_lPf(vV*R=lC0c8dTl)jK-W8v~^(V{O#
zv)W0^j2-lvPyP^-2l_zCdS~N(jJCG`LJb=)-q{K{w8-{U`XxNQa4H#DtHMZG%19f8
zlAS5ZW7PXFg4O*EBbYrgx{-#kpHlzaikun~r;-Jjx^L)KH+(=D0H+xc;m0wxPvIM-
z{NcvL@t^TZJ}%PVfnKT?7LtX1u#h}~WXbGY3uH&3!Wzr>+n-0gF8PGW^(WBY_$=@9
z8^6uFvT9jhQiUAU;<}PBG1COIjF@qPX(omyPb60DBaX5~WK#*vK~JcmZ<i$X$YUjE
zcx273B{sY&T1zyaTHIKI0k}A)gvz)L$;pVQ&$Y(-*mjxm$m$YD@zfL)dDnJ4BuC;8
z@B)xep|a~B7dQv@5ZLnmWG@KbA^Q8kp}a#Q<O63=R7c_<W-N{Fq(ivVftqu7Z`{Po
z;Mki}f}+6vAODOd-1RW}bF9qhF<gRiD7gj9mr*8pQvN-1Pu}n)l}VnIw~_MbFR4uO
zr0gc;w|Ohqo6v>SndC`%Dk(og%IiMI+mJm^tA;wY=}cYXto|0+jBJt*<91vvUSC2@
z3-6%SUmKXeeHm{p6WM+jW|EV^^#3BhOV4Kt<zvH5wsdvF<ro2utGDX89=EGky>?u^
zy=PYqngbfhb|v{bEcAYHw!gnI@yBEibY)qR^m3;9{Sg@vNNpe)5#SMG;?OPccC34e
zUh%isZ?)fMzuoS<Z8zr0PeX3kQ5q{MkkyXE5bqlc{K0hm0NGCTpanI+)-IS~$p;XF
zWEDxi2xZ;E_sGJVz_oPz5yD>po}Z3CO865F_r6`PrAqN*!oCIB5$UCP4tQ_8zrP<z
zJ!rv(Y`>tSoAsn8)056p(tIS*BEr%2i7j!@Cyqq?kR!3=;NE&tWP9tWPlKG)$uT01
zu6qtLb^RNN_WqRkjr9i|8?kz!1x(kO+L*uVOq05SM!v3lPO|t_LSRQRke$c*liQnI
zg(_gNQwZDSwXXUvls<RYJ;zx5HYtBG$>$Vn@bKPbgDzjaxBgg4etOqEXIR`O<+=S!
zh5UG3eiW*ZlF!@Pm$Mb?CP!EO!Thd!zA&w^2wkj&)D6?&3vTghUT$?K@@Kog0GfOU
zuH%KX5UA|})}P!1aO>9nyI*}ZZTSa=rjHGYw`m61xOvKtuySAG*j!4q#hy6ZkT}(N
zC3!sG(R0?EY=K3`h9fbp{2jR-YE1kEFNeCsr}qBQzjJi$Rvfc_AOG0V^-}&Vx5;LT
z336)vl$rmD-Sn7Xe`D@v=*_g!GxZ-J{oOjwV^lpo^jAi#=!jEdxT_PZ5hK2X5_=C(
zU8meZJUO!W0O#9?NBvoEH|K9A9xi9S+d03Ocyt8TyN&Z-B_6$#^={>S4e=PlS?^lT
z&n6y2KkE&1zJmCC;+Jv$D&lFh?rrA$<-}uB$$G0fe+luJ+Ol37=SLDhf_RPd1;o>*
z-ChglO~e-w&p7`Vd<N;DPr1FvFyi!2IZnKV_(Pokn0RVny$3k|KJj>-s6Xd_OZ+I}
zw{!jt;%Ol5-NyM>h^J4py<0i|BJuP|ws$S(cMwkvzBkPI|0aGM@yj^>H1YK5wYQn`
zj}uRGq26lFKSKNj;%%INka+rJ)2ng*o5bS_3F^=Jdx`%F@r?6#6F-soV;D=Xo~$9t
zm|xHzmFYY=h{9=V2?&C)eHr}a)DjHoWf}bZ44yi-QT~<;K9s@Vox$Iq!GAA<|IZBm
z*$n>W4F0tY{x=ys{VkY+@mo9b)PO0B8?JFm4Vc1HI^o~<C`u2O0zGgF-9nx^6b~Xl
zL728o<1jTS3Njz+Koqu{G)}1lP>}Yi!%?se(>SFLL*cM6{HJsM`e!+juJKgH4%F(g
zNrIR4paw`m{!WP+90i$94U)n>p-&Bvf=s6lNnw4t#%UqH<6EHMkM-;23FqnQXqvoy
z-GWEY<b29cLVt)1d0@x`Lmn9Nz>o)qJTT;eArB0BV8{bQ9vJe#kOzi5Fyw(D4-9!=
z$OA(j81lf72ZlT_<bfd%{Czx-FaB<I%%<@S!{YBea=&_JDfWo;w*);b=qW+-Ds($z
z1)VDBOhFe2dZVB|L01TRpP&y3`lO)G3A#tnw*);b=qW+-ri=Uqohs-|K^F*mqo6)P
zR|tBapbrW9q@d3UT8j;38sA!{t-!trY+TUvErFUgt<vSKtguzh^30m<vbieCXS7}G
zo$m8_Dt#{3%qrUq+l<Ppncm8(EAh)Fe{i`sGF#*38XN0`5w@_lcD6PRo8Yh1%5B!^
zR-0B~t0;$Rl{SsbHU!<)D}`9gl9qZ6mf|a~)ULH+Goj&OU4yv(H#q$-d-A-4>kKl+
zDE%GK#OiUEe+e|EV*dh9U!O{se;*tv#N^)x=?4Xa1-(2^B1x1-w6_Nb69%T|cRQ+}
z%ReOAm;8I_whTVgj2(cezd~uU{_^kU&mqx+>nBK6)|wtbRFP+2NEt@X%Hi9Q@|8b9
zV#K3D>t?b(2{+|p{F;_sWbQ7~K(Q4sCl?!CSxyBp^v8P>KY1ZvSw{fERAE8S_gUng
z!+gN=m1^8a&@W-ASh71x_k4gvQ+}=)N#@&dGaal0k~4$D!#)6#tLF@%5tMGpp&!cS
zEr$SI!&rX)ouC)phg-qd=*DAoL((RGr_4-B9yO!k^Y9>*Y0D`3I1DLQ@#q1HUc;jn
zBq>&k<{iL|F3RLB&=JEQM>Q1N%ZLx(P0@0S@;@aj&ei1_n1LM{%40=IlBqo>nnXKK
z2J+Z5z}5MbXDt<(g8C3=soagkzR6XIeHfR8o|tmueB3|8Ra`FengWKx_aQyf&y=O)
zxM5&MH3^B)$`aE(Y_!s9T8;Y+rZo^4Bh{%KY-}ZNbSc-0C%YOjo-FRwfa_4Offl6{
zE0(-sWu#&*nou;QXcXEle$Zkrnm>X-vQn%79!|*?{K&vG1BtVTUpKsA)co-#{5rr~
z!iEo@H{6WopEH#;jN&THlD8ypz@pIz%nsv*gdTo5V;-BbnKF_)U^Y)t8b&EFJZut-
z=c~w%vbkuCa>aOp3vzk3!;K}IgWng77{jgrDYUZr<MlFMF`ja-9{1fE;6+wt5S$FE
zxvBD)P-V-2VPcDi!zCk!k>E(A<m=9$lu?O$6{KOa)Hm<NzQ1w*QUm{t0mzD#a)^(m
zlFS(&EYeFesHbiOYT`So6y@`x0;V94Aa=DyS4BPEa8&G%X4o>0E&kRP{iib*ndHb}
zo~4{LTdtn3UH8>VlOj>G<xZ~&XAvlqVaCkphdX4sFl8A$^J9O!)r};^ZpUR}Gz|Wq
z1#m}V{(=1ENNet4N<(D!H6|Q^ps<Tfa>O<NiYcZXVDvjhGrLLTkXDegcr+qTU}<-|
z<xcQuJnGfAftOud<`239agUcVZ@^bNP;PA1_jrBQ4(ScqNzJ>NVjj8A9E_PqYWyBE
z2j`eaes$rk%#>sPh;DlyG))_CF0k?v^C9zs>mMAbza06^S11ndaMWFn{^qL%CQfI-
zPHsAD!H;tQs0vOT7~opP&jD~YT#sV`JlKnfN_&{Q!xdp%hVgR>@W$0@(cSrLR4eU_
z#e?3t?cSg_;&+2}Mcf^8^&@Mv(sus;`U<TS2Ipb}J@iU_pjETrI2>=|1>33b*DMWq
zgYB`7xi&!Y;9dTpr!?kj!)A5xuBbSkAry~dSAKdT8X*o5z(gb3QHmX+F|V68Npi!H
zc5f+mVwPgpC8UBevK7Hm5U5-`iW=(xud55iPy{8#=>#aaFHGA@sRUFWZWAXJ#3HNa
zme0JVd<NNvfl_QC4Tiu3L(rsxq$~k!Zm7jQ10TpC7KoN&N4_^OmmY!N6)5$t^t$mx
z=)s650_8}I&ic{pEn0<jMmgp#X##X~15Qu~Q)i6%-K7yP4!=N8GCXIpZfvXpsvy05
zc_f{~Arz=GZDCFC>^RzicQsd}-930JHAHG5&KN7}2z7ePTy1fGz*80pcj}RvB@K;r
zW$ie2z+GBlonBssHps*Y6fnC&jsZ?*IO5G{l;@X+I|f;%wO`#^yhib0)ZZTT(gt1K
ze?yJ-!W!-1t&+FqdU!ki+2{_(KspfUC#V!8B1dI8I?2&kj!aTcQeAsMU+O(7<{wh;
zKLjuJB(;c0wV(=HBBS9HbeoLE1(d^1C`*1Im=rnRR%^9Lj0mc*+hx>ftMvkG-5_nn
zQnt`}^kB4g!+EysLMo6_q_=ijkRlF&nwaeeGVT+Y!dlXF{whP0fE@N&>bXi`T6GMD
zV>&V@emQW3jbQf|Ylsa5K10XUSFS1Mi6o*5P9CauX&j+)P*{YE@MSIyF@0u9^y|1V
z-%0(hlgdY787>p-%ijSnCJ^BcTx4gPOGAv}dRWQhBU8WsOxb~eVj9cj_W+5nf&5^8
zpcTVl_(R<OsMPOMrJaYf$p3p5e3O~VGvH6Tu6%~<V^U~YrXe;E_-xKJKJ`0XnZkeZ
zZW{2L#QzEQ8Ss6znuIvr$??%F^ZN&#A7(NYrG6JHm2T#^n)-dL#2@G7Kfj!>Kwu=E
zSLQ0{w*r@IQ2f}Nhxa6&u{ion;N4R-F30!jBX$ax#>S#y=r~Wr?P8R(7^gSNg{t%?
zZ7mupg$%R`xMdJK^}scQoQ!q?r#_Gwy0YL8X2G8ZK3I8Zr92qklLg<O1^-nR{G%*5
z{f>UHd})zA7=CFMyetboHw(Uq<Apd6QsXXN51zuE0&f=2o&VSx_!f?Hk+j=)vf%%k
z1*aWIgVpQVEclDS2Rl!0e-`=Iv)~7Xoo<)L%P+@`ceBVJ&VrxFg3}i5!RnQRemWRF
zJPUqt7JO0`d}<c_YT$#Nt5?DCf%k%ZnDYd_PxwdTwcLIollwoa(OxOs#Bn`tyf4dw
zGb?74-bk2PgIH`>?KKTpsI)VyXH_t|s*_o<ys?T8o@tCTqO%=XsOZu$AegSZNL9qb
zff!;TenOko+ab=!>+nEQa?Bd@u7nhy)m!-*&dSvWrp%1P`U-+s(`IZuXIG~mllb-!
ztmrv~I>XKo&W{VZXlZ1nb&lT~rP-cb1Ucm*2x~+=k@WS9agZFdx^b=@l3>x>7H>!1
zbn*^xo#iL|$)vWnh<Aldp;e%aW9=oQ>FRWzpB3K4iV(De&e!8Hoyvtn#tH~4mNuL-
z803qgdWUntVtZ4)v$3J2)k$X%dP4fDR$oA1^@X*I2glnvouRfnu!zLjY<iL|D;CQj
zLQ<~7SlU@X!H~1F(-lUZVdIhuWx+xXsC95xJ9$Zw6RnP%1B-Kger<sHQ|E^&(vy?t
z_bqI^zQ*22C-Ah?w>n$xHI4PCV){Z}7~vM>v5ZC=8D})|jN~T*Pc-Dj!FwL84FBN|
zZ*<;VK7%T+w}?7;xw*W8&OU6T{t+vm8DJ2pbmIuBP$$6hna*guvw1~}v)SKbbJ97A
z|NjnubUH=Db^1aPw2lDvIMgKVM94vw+G!Adif}s9ds_XXI`p<|t)6SteSML=siD?5
z_Wmt#8rJE;Ym5hZUbZ|(Tkhi$$y`>(iPFZ*VYsCJCX)tk;2dY2;}E~e`JqH;-7_4Q
z)R+6&whN#e?-dG^2bEL4zwxD(hl#jj4IGmCa{pL3Pfu^w2@_^g`AB*Ca7ug4B;TE<
zagA=FU(LCJI%bxi{xERLpwEVBx&{>(!KRZ`{<8c6=6@NZvdR4Aem$Gem;3dkym$>u
zmNg+pWs>@G|KT`tD#BAbIWK?dzokeg{Td-B_ak--eW^%g#7Fq^{O<xrWvakML9PS3
zX@@OBP!>#QF*ivMez2r3#cX1oE6*KoO&8~7H9~h~=*xBT{l&Tz-??m1$n^g@p)VeH
z>hF2-T=xe>J*0`u@_!>kUw*$?KS|f1_H7KA?PoI*ss1v5`Mu{cp-=tG7&7(0g+!zN
z_HvDjZx{L&=~!VTv;4GvrYg$*(Os)?$?nOVq@K6V4LpD8$0v}W;*$Du-MjAs^|wQp
z$|UvW{?h{&sJ{!k<U6S^*VhMy{x(s6(VbHDk$%_*UGksQm*=#uoC*+Onod%8S#Jsa
z0uiHrYJFU-OUQhre5U@dGW6y52aC{`dXiFJ#?S|bur2{+>dX8KMS5ob`A{~NUt6Ye
zNv&E}7)ObQkf~39J|j%}uiB+?=_)%XbpCZ((9$^kjSc#gg)IY@`rpsMNtS}76NSEx
z4%`pfMO+FBR{BFqTC?fb=n`v1`;qb)=Gg2k`pfEd4LWnt7&2Jq$fAF2fv)v#hC&9*
z==%|2nf`mtq3gG13>Z>hQm4>2mXYE%da)p^<@bSGrvK>sjI=c%g?;HlpU%@ah76Xi
z1d!okE`E$YSR!bm<I-Q+C*YFpPrkp_LSME$l9$0EKJn7`AY;g2**CN3KTa=3guNNY
HGxh%$>otm9

diff --git a/rccl/reduce-scatter/reduce_scatter.x b/rccl/reduce-scatter/reduce_scatter.x
deleted file mode 100755
index d2657f4967ef5b24773d2d3a26a0addf76226bc0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 25848
zcmeHP3wT_`b)LJc7qX?hvW;W&h%Z<M^U!KnwuJlwtseG@tUy>Y1_G?F_8~23AMC@}
zw0<F*u#rt1LK2dYgz`~J8``*S0)$d5W3UY&i6I2skOY$eSt0xgHntP6)&I<$SzWEQ
zDSY|b?`x~E?>%$=Ip@sGxifQTu6AZ!i)XP#QJ4%Cb~z)ie5S^s27YI3)#pV7q=D75
zQrs_Km8=MS0mqE=2Bz_dqz;RLOIl*E%nC|+F_9j#Xgo#Ib2!JCq_TuWNpB=*A}Z))
z!Afc~c+xARsKjS##4#r5ULhwbGwD$s_s-<fY;TjsBa-%<Yv3mYm3mYy(#r_FjL?(x
zK@vcaRQhG<J4rv)FJFH9g`Q&~SLe@)V(mh&xx&DkD>NRFw9|l@q|%4iK#$7#X#?rg
zwON#Bvv_`zYQmtTGQaBHP*>gD>Rx|UZzz&lRkf<VuBvXXJrT9naC_u-dR9xWTE;C{
zaTi%|KHrUWCXJCuk6Rl9KOY!fxuG7wSoo4WxEJ_X`G)e~EA!xM^Wb;q!M~XYe=HBa
zBM<&^9{lY*_=kD$VmyGc{7{|;pPC1skq58JgE!{EugrsAmk0Oe!F%)IEA!xM^57fu
z;ORX0SM%WC&4WLk2Y)UPz5sXyJBvN4Yj~oJ$?@kJfe+{!!CCxa;5Ig$)$1C;+3-D%
zpT!z<EsIg)34!m?HDXP{g8r?bPb+u_nEw%Kq!<!XY6O?57HE&iDC{NVr6C7n>`>S{
zf~QR57Z#}u`TQCfh16N0GiXFPBzPh-aS=TW7y8=<Fr|deK6mpu^&j+~)7bKh_xIg)
z-8ujEyL<nddA%6HeBMM-k0-rhJrn`b9gao>;sqNIbcYhjK-?RQ>*0Vm6bVLw`c~=Q
zU?`&ZhVBS}^#{fh>j}l$0%2clHAAc;kZcaD4EX}YERF{_ZPR;uqdr2G$mFJIDv}gP
zOFSNpcO>JXNH-yde#h#FuO}XjP|gvbuNRujBB9$-fmXld&3Yu#6b*-4BcbF{J#s6T
zSsL)Ce1Q(1o<z-9A{kFcdj;ubZSAdIlI9Uo^~On$V9q5d=S0#UO(j_{hQcL-ENoN+
zXdoVE!Q11ZWPocdmQEt|MxUNYGNBR-_NEd&EJ&{4iF!O9LTM<zJQ)fHSYQ<j;SKop
zqz)f<B@)JC0Tl65X(PUH%*Z*GWO$Hz*bgrmFQ8g-n0GE-MD&E|l~l);cv>5qyfyY3
zdo8Pu#*)>(xW2kN7O3|50@cnsyTk6_QM|^y^jZsb&96?}ULAz9lPFnrBBuLptqMd|
zCaSyiL?COTD$yHAIBTo!2t?+PfhuP$x8>L4i5@*t6~_}reX8cz=h_`r$+|#qP3@}q
z?cKrd+FR;s<NkP0JTd3CTVivgw<HoVBj5bRtMc{gk?!h|LW*crDspQidV55?i@PJK
z`PmYlUdc~xg&Y`1BO)I<i4oNz-a-5w*_3cB<RzbSfP0>h?qsjGC!uHM{oE8!UKI-m
z!r6q|Qo28^CwsC$IO<QqwcH%#6bxmfl#P_-r3z+~QrVa<k%}1%1^|S+7i*l-97zlr
zoZo7~&GVD{O}Ldy8n*{cxSR`7%A+Qn&rgMp9G3~Bu}FpiUE@*0D|yVg)vwTaq!@Q|
zz6~baoUhx2oAYfq;pTbabtathkYR-hm+uft>oMUNaI+z1!b`I#))yvxf(c)5!o`%A
zr`~PC%S`fHO?bHpzu$!8g^&#ons71Y=V_0ca51Ik_%;)6p6@(q!sQZ!6b4NA6mG$|
z?J?nJn()0Qe5wiGXTm>Y!uOl-X(l{l!l#??!zP^GcQTwX;b%*bsIJkS(KDB8G2%;f
zjR?o((ua7Pt`Xt*XL-!HP17|VAv-g8%(!W~#v_CqG0w8ty1@~Co&oW@L)SPZe5`po
z%`C<WiU5a)*RL~QQd;(z;dP9eFWF~gA0a#DYc!{1=S#9L$Ua2nHNtx2FmeqKBR|u1
zRVv&Sh-=ZH7K=xHfkYyZ(88%iQp3pQ>(L_F<R$IP682Ko1`)kGpwSyv!%Me2FkhS1
z>({F2Udk5hq27RhzJ@oXhSzK~uFXo!pXJvsMCyg5?5d_F&nSFW0^q1TU=DU#BP%g+
z@oS+-ES1z^I!$aa51<lIZnNxlK`jyRMI(L~SVkp+UP4=l@_M^tsl?^l!f+_!4JW$2
zR8nH~Rc011|G1tEMI(vJOWBg%XqVor(OaBXj900Yd7{2CP<Ly)sDe>9%D9x-?bUHU
zWAP@8DU02gGSzG(JEMMWE|2HxCU0AttKHk?y3V_Z38ZsrYn!*zy|ksneYK|<c+*nX
z^$>x0d#Cs6#fulA1)ObN(2j>o<J^lFEP6qYgRTdy#USw@=w{FX&~wn)UnL$i1Nv9!
zB=u-eFV0|$)+SeKkO!4xg>uJIrmV6lXHP6H*$SM-DC&pT!hRJfYb>@A@4yuWb~$0L
za@$>&rm~{dPy$QoRK6`pUkaK{x3qBqvKhnm9MU(64nf#PTn#8l6qJGo*ZsI20CtEB
zBcI0dX^rJtWBKgHa)+zDeo6UyYhUr^qAi8@6nx2gm&H0%SYGcccW?oSLD=RhFL6Q4
zC>Qx)0S1r;)Gb>s>jH%e)(ofw_$}~JH{z_h-1a34jND^wDc80X!06^ece&%PB6oRx
zUvXP`L!4*kqSA=W8p}(#FWX_~A=r`qnaokxIw}OWIuEp_U%uI${;j+3?ZfSzo-Ktv
zc#XKXl)Z(wjC;$X2T<<eGq0M-*wJO~^gHU~g?~Z|P#+&~ZyHF=-c>k6?ZMq&xDXP6
zl2hGVYEJ-z>O}{Khch?A+K{sKrkz6j96T_gy>M4y8934|651-_T??U|-kqtTa`YD-
z!Kg=uRakue04!cCExts--FM2O-nkl2(Vaf*emMi)mMC_<FgTM#JKTM5od~MYLjTb4
zFlgC120ioWdz?S_B~a%p>g-t^NN84PpZjA<XjY@M&L_zh_0TIp1iLzeS_fl6x4QN>
zE$Z@tBOdkQT|qTAbdaEf>hc3e-0H=gc}-n@jF@9c{u4(|smq56IYqKRh59j4Kh~^H
zd0w4z9Abl~TiKP>X;x#IgFz2+f#w0yJlL#G-39FrUFwvRh#z;UvyZ8rM?o@b=lel*
zC0(fSp+lfYL65f}Dd<w;zpz2*R8Wn-Zlh{0I-Bb2Qs1DuzHtarrwA^4dyv=u)E~i}
z8ivXV2!X@xB$<M%${bK_Q9B2rx$-^84IO~oK^Gw|LewEJvk#N#^DyU9ul)n%bP}R(
z9whlgNIu5lW`YTIWzo(fRP<)G6K;I-LrQ??><r8gQT>qcJis>(P)Y{%dB=v^ySUwY
z@QAwZ(&g^HXG@Uw>aXbbD&1b9+jH)DFRPnsPYn;d=j};0(UZi3*%4INKAV2+_-n^^
z(c3Cj7Tokoa_WW`>2)_$sy_8w+lE)@H5pVR&ywhl_iVxRl&jstJBFUzX>K2V^V!;v
zb71Hab;f#XFX~fo**3f^R4JDoI|iXTaj|=N!0jtV8P?2KpP0C0$6F|Q<OM_*8_^v{
z)a$ssE4{~ce6Q>H05910gPUGRKDprqUa-e@Ze8YD?z+~s%+=Z9-tfK~oxU}FJpFX#
z50JGx{ewy<vA*F#b<-Enk6ZiKl{k+*MbpsC!<clork@97lhEG)a;J9!nkJ!pxYRB{
znuKlz<mx+VQ8%kF>E85)dQTA|ZJUm%_nbtZ^UOP<e&Hx0t?At@n}*eUGSm^bh$3w<
zij--gyufTmS+?V9&NK<+;Vj|;(me06N<!34?+P>9z-3mV4*eUs_BW0{T}fE_^^5~(
z-_smrsX6ZcMNiB_!O}-tcf3F4na6Az^Q3?49LW3;Vo3ZZ5})~M7X2MiPx^Rf2(&f*
zTq_?}-lhuiaRt0Fu56)lq_gA7#NVTzpuar#E~jTcV9;}upnP2U2oV}r5|lvW%8L}C
zaYdcc{C{vf`SClb-s$7XeFV4A2$Fw1*$<gdIG&^+`ad_Gpg!v^?fkO?%3KU6Ls55s
z+2=1qi@p%e>KR&YJVmeh%->=1@Fc|s8>#j5FRFu3YyW4S>4Y3wWOp|Ge4bu5m5gjv
zVWc{1qzgisa8~jJ?9m9;{1=R1uJjZS4Pj4E|J;h4Jn3VZQcT@94XB&mrtS@#W<Z30
zfT?{J@0arLdeR^Kh*$C<k^U>_WqV;6S=a*$nIlM+%ronOJXNMJkMjlBlZZEG9u~Pi
z4(<LQ@IK#vAMeVlZDS?P7}bjA$`~=T1+#*fX@Y4dh9*xW)(|9)vPEQb<vv7?sL}f>
zGe+cr$`d@Y?#4<7UKO2{T2QU<RAK<GD5#_|K8oZ_m#9xYN@R=yG}O>wlzu*2KgagT
zc!bUAgLsO6fWr0xJT!OuH}DFOHwihw1x|t83)VTD`413`l50QbN&i0cD==>1&CEKi
zy2!=*c`GnnpSeYVGyrD8WMaU*;W--WKkvHHb(8C6m-nWf9^ZkXiRz37=2qT-?R5`J
zn{GIhKf(UZyjt$v1(m1*YV_WZX#U=Ua@{R**aMrH1z<jn^5=wdKPjIo_;kwULis9E
z-tj4wUxqw+tAdm_eoE#0gz~HKL1qalZ}^B-Fi&Ex&zjY?6V2(9#*67nlrggmw-3~c
zjg{0|@mgEkEZ*;*##I{6Gv&#=jwUtylhjTlgF(QVqqxadrEaRjX!3!2qmkVQF7@J{
zeW2dFV|yc-5*pVwC9?_QgEmNEqWt%;w*3?h7W<I?)B%)mXe#h8kH%jIju#i6w;Rga
z;WU=H2Qf%qMv~7$*|6{pvhXVK8Kd!U68<#s;?elG2!Gh^+q3=6Y$^Vgu&+RN!st>w
z4ty{*%nLq%7H!JbiXsi(U?lzR=%fNlx&leGig5S6??|tC-<?kFcc+)_+uiapY_i=g
z)Tu#^8ssn$ci)<QOx^ejqJvKm-`}#&-H!zbtwH)uG$liQC)(6aH1zeYIm%KO3xTI9
zfIL%NLGIq2S%oTKwNnP$<h8z*kCh=`-<rcLb(56;49QOy@@_-EVRy^nto)q5H78ig
zA?3M!r;wj!$WKNUvhqb+hYGf0;pFaX*;mrH=3~n`o6yB#NZm9CzTg%w;pJAJLH=y}
z$3Qc`gzI?WYy@iCfemM#2e@_X3p-yLwfqA^)5q5I>okM(Y?<{>ShX*6Z>b{M;Yy!u
zO&{}IL=G=;?>K4Abikl{(}ASsd_%5>Jn28-<<Oiy<QksxD|g>c#XaxWsrTG{&y{@s
zCOJG|f}CAH<>r56Cp{(D?=SoVx*o0cEW>+Af2V=-7}d@Q!?j&@be>s$+-bZUo<)2Q
zB@XVVs?NHBc=F@mUd}Hi9@S@q1DwB-czB!*ZsU9-@#qI^@IlVcCm!9M4Q}Op4e=Ou
z+2DH4UraoG-WrT?{(Ry~h+o0^vxujmb+DcDlZeNpk_|R+zLa=OZP}oM^A_UiQ}3Y0
z`9I+^M-LX?=--_GfcSFa8Ry?4o<8Xg9!CEkp7mSeZN%^A{4a^8_BFVd^REz(=ZX4r
z{zc*^6Tgk~&k|3g-r$3re~NheL_4^Z^Z!CTeVQFy&-s5Kp4$6hjPnl>KaKbmoc|{A
z^a*ybo%8n+PjjKc2F`zx__K+3aQ;r>>66W%#`!ylKbLqL=i|hGmUzbbTZo@Q{Na<s
z!wY6=h%&ZX(CyU*FTZ(zL@fb95dODQGk*cK1O%BrCx>s!;oEX}Zw{Zx;n(Kyn{)VY
z=kWiS!#|nBzmUWKJcs{v4o`m@reOZ&PCPYW3fcsXQ)<8zXzD|O8ZZTVuoUQlQ`jTq
zsYCG~@)Lw<D>M#MgQ6hwp$<f0n?>W4IsgS}pE?``N3q5!br=fo3Bw<r8a6)5iFA#p
zGPb|Th@B^RSr2M}6y)!gsKHT?>C_-8>=XLb04d0H>W~!fb84KH@jGLW3I6b~QJz?l
zk&dRx%Qql+^i0lY{Ur3q$+!o`JuvQpaSx1pVB7=a9vJt)xCh2PFz$hI4~%<Y+ymnt
z827-q2gW@x?tyU+jC)|*1LGbT_rQOT2TH`>shIeC&U!)Re)Zg3UlZx?3VKY?q8g2h
zP8IY5LF)uvEa<g@1_fOy=v{(-P0&XKeO%C8g1#o`yMi7Qv}lgVU(gE#trK*ypw|i-
z6m+GacM1A6K_3zHaY1(p+Jwzy8sAo@t;9YEY+TTc?SPs#yH*d>);Q|t`RC2i9eRy(
zZr5djIl-X6HmK`$^^UoYxwZ9mf!g|u@Ux>(<kmoZzQ)aYJk7!gThi1tUz?3h@E2)L
zhkcISq18BQoKUUTW^>urh|hkJ5bIdh(W1dpYSl&BWp->NG(D_K5jXzkCjOE))9}v%
zlZ-h^fA_Pn7To1u{>-V2E#&DNv+45hfs=%o{Cgn%fMBd(l;;s73Hzem+&h*qFeAU4
zQ3XT(5g{-CKDsT3&o$!$AnLF1G+BQ^A4j4e*AJ1Ztham#QAM78A^(=n$XNw^8&Zk#
zAtWY#OK9Cd*3ZLD`3!zN%g(UwE!RM?1+SnGn_O8z4Keh`dJ8{!p+wn00Kx^rf|2hJ
zkb40O0xwY-aGyv&e4%2=?kL^!J3y9_LMxK2H{oX4*9@d!E{BVM2c*!*8A1~&-BxfG
zNYSkjpsSdbl-ve-#$C9TevWQDMmHpF=6A}>q7+dxns7NDq%wO2MIVA8<zgP)OVLYt
z)P^L*PSK*hxY0$KyaqaP!b7Nr;&>kM2|Fq3q$vN>vEseQseu{Uk)b?Lo*|j0a*@!@
zlYu<&7;yCp%Cm`zOhNq`XQ|vCVqfK|#NLm~Mo&z+_6pqJ##P}GCTD@6@Eu4`3Nd9l
zIgXAln0%9vn4&DRtYK4>PRm-{ud%Fyz?o8=%E6}A;zpNpnRv2m0prQy-T=561siEm
zN`+!8s!%2=*7CE<XO&MzyTuP&tmRisB#^9BD1awWvJF2fu*^l`{0R#tv`)Tax&^-m
zuvW4O6E2@%Me{GXfVEEMD$G{2tZ2ld*$B)Qb3;N8zm%~~&DzWwDI76dq$sVE6&Nl)
z55`MW<VV?@ai((Nbb?C@dA1YGC0mH!6HGjlT?kTUXID%&%6#E;%DrLQ*BgPC+m$hJ
zGN=}2%VR;6Z6k(>t(X9pOe!Y9Nk}O%oIxp*(|0OJ!{(?j-idvF)4tpY|C|YsRVYq~
zPo<J9oE|ARN;9UXt_5o0JEs)oqw-RwAdn#TizQYk{DD|P?2TsFE{-k!_73BxFlSig
zNio)W%2BKBk}I?YSDtrXJYlun7O>!C0%a!5SQ-6Lhb)(6ErVwv?2Whkki^){xGao@
z!T+)VzIZY;lD`~jtvi^~8lQiu1xFw#><o(>ajidRilqP;{VdVSt`j+oDoAxI5f>-0
zbo+dA7kDC-2pC(yt1qh#MSQ&}e}J(-Z?I~l+}Nh?4+QN!(i^gqT6Z$VI%$tJlC)0J
z_&sTjEVNF#a><R%Qeb_DZoBWb%${y7weu45A@dX0KQdB(Ir3XCRNUO*sJk5ft(OQ)
zoX&t<+;rAL2xkCL6}&hnKwr(z0Pwb6jUxg4*o%lt`<bstk25aA_!$Lw<7!P#r|}xq
zs=~=sB(SJE5DCOXKCpV+*R#ktvPP@w=Krs*(W+o@5jM|5uPO*yKM#(>@iqalVPk)0
zRc|2Do$OiU0F;W{9*X#@l6n_5s)N@P;&_H=DuEsOqZ82xaex3O8i}4N><CQ;e6&fD
z8;*Aes&Ixv6?R=hDv~5yaU2DK%5|ft$sX{Ax=@V9QBs^tfPx2Ow4IbnK;_{!aY{il
zzIu_f?o#JmvJV4Q*fJW4f{8?-Nd?JT0@&W#gnJG?l0&jLQH9<3f!;;*2ts;qRbW-X
zhbKZ0Mm!NH$CGr{kLK#oYV32Jn7fP;pko_wdP0mkV>0BciU)A`1$vU{Ig@pBW9_91
z(#w}eMsqlT0#&BXp`$xHj&<N&O^<i`#!jV%NG-&hWYs;<aG+Z6N`-p;)$v%^h%_#1
z^)y#^<H!MDRgHa)vl?xXiPI-wcBLExyxv$mkkcqnFAsMNvK-ZZ4R7%pr6P$?cO*a?
zat;5DH`-6uXvc1qyfwGL+oPY2;cyJ3BY}Q$N<kuWRF<QY9F673B;_PEv`6%%-uJ}(
zL+brd@KR4wn}{?Bs<34;8p}dA$!JPI1?-5j?7M==lGAN<Hi*QypbEQLM$NW5KY^`l
zq^)Gu7CMg+%(kvM&6Z0@^=1|6tvxD85w}1s%<)|r4+>0S9iw#qEJKrk0``aObCtrh
z>KF^hbYx8YeBcV3$nLMu5E}`6hK{MPTvJ>wl87oed8k3xI6~#1a1}1XSLhmI#>|rF
z^|&zK$^MR$%12=ZE(`3--~BEn5aAYFWamL$LyY1^SjFR$vcLC~b^;JkEK`~M9w6~`
zkRQtrU(SPni`$=^{XMF*^X)wHkLJO@YNhgw_*1Sc|C#J#QfOPDAvO|<sDhU1+27sD
zl-pp3Sw{RO@r?rC*Q7~^)7>1O!g9ZVTo3yeOL_Kpuu_T653^X*?C)VE{ykp))64lY
z2u#BB%3TG$DR8+4#gDsrcvs?1jq`GGbWGrTFEDVvFCW-PxHL8u4a2~}@^B8yS%K3V
z<w8~Z^Ag~bq>za&0&W|_4xO*2ndD?N4xIWxZn!%S{`EZg_koX99@<+s7QQPF{?k18
zzC8GQdGL?(;1yWyja8oW^5Apx;4OLZ<s2`=d5{`+=|=Dr?hyD2@!a{3tAT%=<6LCa
z?K^q!NAlp`&x8Ma9{k6^$2w2$l|1sl%!9ur><s7{FTWf&XxH>uem;~3|2Pj$d$Y%~
zQ<evxk_Vrj2cMM(ug-(d1wPifdJ8x{@?MY+bBVzB2>(dj!|j(bx!<D!?UjPgaU*WN
zhw|Xeju~Yj9%J?h78`a~V=ERa-OTP^9Z9SXGdq?ycJaY;6yuE>Y!4PHhIA4Lrt3PX
zidd{SiCC1M&Snqvi1YD!{E(C!vnK<qAjN0(cD{zQbG4BvGvlzaf?)PhGd7;H9u8p=
z-yMY&Bd2IM=8fX)xTsD`BRj2gLV*O$_T(bSD;GgnBN~aMZ)A)^<e1%uGwqNBi-E3G
zH}a;Fc7Pi!KjBX%b#=u9D`g6;0%aU)FBu)JPUrdA;a#i<L3`+IJsvZtTsY*cfUsie
z!a0Kxz8Gq8dlxTtwY7LXtsR|SI)l(3HCDC80s^ZqtX=##+Rp2ZcHM$SB+g|sk_=g~
zSOyW2avjD_KSd`J^@hWG3>nAF>n;=st2LnZvHk7k#YJ|sMDmWT(2Ys93FeQUpR7oa
zPafd6#B+6{%R?vbbhLDOJ6(;Q7Swa}YF-%O7UkKDW*a#NHS&z)cLIMR>cz=>ek>3F
zU!36Ry}>z`9AGq!W_Y^YSwjaPc2P%3I_r8FL~4CFdny_R=&bW5QsMTM9p3g(hr>&U
zDgN&}0n+Ogjn^BD#?eB0sq3L8qmG3fW6eDZqHhvj@8~Yqa#b_BUA~sjHJY!#%GK7|
zWFCb7f;bKDbm29}gFH7|o~JGMZ;50sE8|3IKU*;_slVBxfg3r`+2A<DZ*ra}5nBID
zz$NwNKDKQF7{L350_8#Fl<#qTvE^X~?pO;)rM}!Z7ArE++YN#bnMROulF|oM+G8g9
zfg+6y4ha1Q&W+SDxBT=6f@3CqR;(EsRA2;$L9+SF@(Y;%g^bE3^OyVd97132)06Vz
zbu3xdh8UGe>dSqH8aWk#)<YCz{bl~rf6I|Z`i&%nAon5e5&BY*^zkwNH2>?ss7y7u
zD9H7|UfNlUkdOs4Sj<sIhY(oOmtyjqae4lD=V)<WRx@;aj=o$k-%mSr5%`W}Bg*fx
z{u0`N2<0yxclI22dA|F-q8?H{xBOqo(U;$G?mN%Wp!RJJx$S2Q63zL`?>>(Sed=H4
zkgNX{B%1XHoEjG&5c)RhSYaf${PzQ=D$4dD_jSs1<8%Ebc}X9Ju4>Yk>)(Ad0U}5_
zgB#%`z6}XfCYk^KCXI{k|AhM6Ax+Ot>dSSuJTL!2QGd~$vgMI}*aKbipVXJ<wBCII
zK!n)_$=+qTCGZnO%=+1Na)Tiu^O5qo`ajRnm){|5LSO1hN_m;X5E#Px1emKY^Dh(W
zx%rns*<5~Yg~lbd219|)fiZ_%efl#RVbXsMy2i!pU7XN4*rS3rio@U9pifyia&W2t
zogAEGDM&g)=o{$BeZNb@rJ!J?Kcu8RpMIkuv0k(vDW7AG&CjF1qQ%glvlh)EhsC)v
zBXu#$9bRm3Z{{fEu&h()=lbtwZbQE_XTXs9lG67k88_$epcf0mdVU|d<@zt2M}M2p
zzdXly4$D>n$Z;{x>@a<}M9@UXrN6kJjZ3ya`TklDecAR%UIv@^)O&AEhB++zVjlg6
O=*5VzH^+Fc{(k~WGn2>w

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
deleted file mode 100644
index 99fc950..0000000
--- a/reduce_scatter.cu
+++ /dev/null
@@ -1,269 +0,0 @@
-/* \file reduce_scatter.cu
- * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
- * See the top-level LICENSE file for details.
- * 
- * SPDX-License-Identifier: MIT
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <mpi.h>
-#include <stdint.h>
-
-#ifdef USE_CUDA
-  #include <cuda_bf16.h>
-  #define bfloat16 nv_bfloat16
-#elif USE_ROCM
-  #define __HIP_PLATFORM_AMD__
-  #include <hip/hip_bfloat16.h>
-  #include <hip/hip_runtime.h>
-  #include <hip/hip_runtime_api.h>
-  #define bfloat16 hip_bfloat16
-#endif
-
-#ifdef USE_NCCL
-  #include "nccl.h"
-#elif USE_RCCL
-  #include <rccl/rccl.h> 
-#endif
-
-#define NUM_WARMUP_ITERATIONS		5
-
-#define MPI_CHECK(cmd) do {                         \
-  int64_t e = cmd;                                      \
-  if( e != MPI_SUCCESS ) {                          \
-    printf("Failed: MPI error %s:%d '%ld'\n",        \
-        __FILE__,__LINE__, e);                      \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define CUDA_CHECK(cmd) do {                        \
-  cudaError_t e = cmd;                              \
-  if(e != cudaSuccess) {                            \
-    printf("CUDA error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, cudaGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-#define HIP_CHECK(cmd) do {                        \
-  hipError_t e = cmd;                              \
-  if(e != hipSuccess) {                            \
-    printf("HIP error  %s:%d: %s\n",               \
-        __FILE__, __LINE__, hipGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-// NCCL_CHECK is used to validate RCCL functions as well
-#define NCCL_CHECK(cmd) do {                        \
-  ncclResult_t e = cmd;                             \
-  if (e != ncclSuccess) {                           \
-    printf("NCCL error %s:%d %s\n",                 \
-        __FILE__, __LINE__, ncclGetErrorString(e)); \
-    exit(EXIT_FAILURE);                             \
-  }                                                 \
-} while(0)
-
-void initializeData(bfloat16 *data, int64_t size) {
-    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
-        #ifdef USE_CUDA
-        data[i] = __float2bfloat16((float)i);
-        #elif USE_ROCM
-        // ROCm doesn't have a float2bfloat16 method
-        data[i] = (bfloat16) ((float) i);
-        #endif
-    }
-}
-
-void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
-    bfloat16* in = (bfloat16*) invec;
-    bfloat16* inout = (bfloat16*) inoutvec;
-    for (int i = 0; i < *len; i++) {
-        #ifdef USE_CUDA
-        inout[i] = __hadd(in[i], inout[i]);
-        #elif USE_ROCM
-        inout[i] = in[i] + inout[i];
-        #endif
-    }
-}
-
-int main(int argc, char *argv[]) {
-    if (argc != 5) {
-        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
-        return EXIT_FAILURE;
-    }
-
-    int num_gpus = atoi(argv[1]);
-    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
-    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
-    int iterations = atoi(argv[4]);
-
-    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
-        fprintf(stderr, "Invalid input parameters.\n");
-        return EXIT_FAILURE;
-    }
-
-    int my_rank, num_pes;
-    int num_gpus_per_node;
-    int msg_count;
-
-    MPI_Init(&argc, &argv);
-    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
-
-    if (num_pes != num_gpus) {
-        fprintf(stderr, "Number of processes must match number of GPUs.\n");
-        MPI_Finalize();
-        return EXIT_FAILURE;
-    }
-
-    // Initialize GPU context
-    #if USE_CUDA
-    cudaGetDeviceCount(&num_gpus_per_node);
-    cudaSetDevice((my_rank % num_gpus_per_node));
-    #elif USE_ROCM
-    hipGetDeviceCount(&num_gpus_per_node);
-    hipSetDevice((my_rank % num_gpus_per_node));
-    #endif
-
-    int64_t local_data_size = max_msg_size; // Size of local data
-    int64_t global_data_size = local_data_size; // Size of global data
-
-    if (my_rank == 0) {
-        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
-        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
-    }
-
-    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
-    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
-
-    // Initialize local data
-    initializeData(local_data, local_data_size);
-
-    bfloat16 *d_local_data, *d_global_data;
-    #ifdef USE_CUDA
-    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
-    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
-    // Copy local data to GPU
-    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
-
-    #elif USE_ROCM
-    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
-    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
-    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
-    #endif
-
-    #ifdef USE_MPI
-    // create 2-byte datatype (send raw, un-interpreted bytes)
-    MPI_Datatype mpi_type_bfloat16;
-    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
-    MPI_Type_commit(&mpi_type_bfloat16);
-
-    // define custom reduce operation for nv_bfloat16 types
-    MPI_Op CUSTOM_SUM;
-    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
-
-    #elif defined(USE_NCCL) || defined(USE_RCCL)
-    ncclUniqueId nccl_comm_id;
-    ncclComm_t nccl_comm;
-
-    if (my_rank == 0) {
-        /* Generates an Id to be used in ncclCommInitRank. */
-        ncclGetUniqueId(&nccl_comm_id);
-    }
-
-    /* distribute nccl_comm_id to all ranks */
-    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
-                        0, MPI_COMM_WORLD));
-
-    /* Create a new NCCL/RCCL communicator */
-    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
-    #endif
-
-    // init recvcounts to send an equal portion of data from the reduce operation
-    int num_elements = local_data_size / sizeof(bfloat16);
-    int portion = num_elements / num_pes;
-    int *recvcounts = (int*) malloc(sizeof(int) * num_pes);
-    for (int i = 0; i < num_pes; i++) 
-        recvcounts[i] = portion;
-
-    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
-    double total_time, start_time;
-    MPI_Request request;
-    MPI_Status status;
-
-    // Print benchmark results
-    if (my_rank == 0) {
-        printf("Number of GPUs: %d\n", num_gpus);
-        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
-        printf("Number of iterations: %d\n", iterations);
-    }
-    fflush(NULL);
-
-    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
-	msg_count = msg_size / sizeof(bfloat16);
-	// warmup iterations
-	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
-            #ifdef USE_MPI
-            MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16,
-                CUSTOM_SUM, MPI_COMM_WORLD, &request));
-
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL) || defined(USE_RCCL)
-            NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
-            #endif
-            
-            #ifdef USE_CUDA
-            cudaDeviceSynchronize();
-            #elif USE_ROCM
-            hipDeviceSynchronize();
-            #endif
-        }
-
-	if(msg_size >= 8388608)
-	    iterations = 20;
-
-        MPI_Barrier(MPI_COMM_WORLD);
-        start_time = MPI_Wtime();
-	for (int i = 0; i < iterations; ++i) {
-            #ifdef USE_MPI
-            MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16,
-                CUSTOM_SUM, MPI_COMM_WORLD, &request));
-
-            MPI_CHECK(MPI_Wait(&request, &status));
-            #elif defined(USE_NCCL) || defined(USE_RCCL)
-            NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
-            #endif
-            
-            #ifdef USE_CUDA
-            cudaDeviceSynchronize();
-            #elif USE_ROCM
-            hipDeviceSynchronize();
-            #endif
-        }
-        MPI_Barrier(MPI_COMM_WORLD);
-        total_time = MPI_Wtime() - start_time;
-	if (my_rank == 0)
-	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
-    }
-
-    // Cleanup
-    free(local_data);
-    free(global_data);
-    #ifdef USE_CUDA
-    CUDA_CHECK(cudaFree(d_local_data));
-    CUDA_CHECK(cudaFree(d_global_data));
-    #elif USE_ROCM
-    HIP_CHECK(hipFree(d_local_data));
-    HIP_CHECK(hipFree(d_global_data));
-    #endif
-
-    #ifdef defined(USE_NCCL) || defined(USE_RCCL)
-    ncclCommDestroy(nccl_comm);
-    #endif
-
-    MPI_Finalize();
-    return EXIT_SUCCESS;
-}

From f84dd26084d452099721e2aad433e2b275542dcd Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Thu, 11 Jul 2024 17:19:45 -0700
Subject: [PATCH 42/52] update .gitignore to ignore .x and .out files

---
 .gitignore                                    |   2 +
 LICENSE                                       |  20 ++
 README.md                                     |  15 +
 allgather.cu                                  | 248 ++++++++++++++++
 allreduce.cu                                  | 262 +++++++++++++++++
 mpi/Makefile                                  |  30 ++
 mpi/all-gather/frontier/128_gcd_run.sh        |  21 ++
 mpi/all-gather/frontier/16_gcd_run.sh         |  21 ++
 mpi/all-gather/frontier/32_gcd_run.sh         |  21 ++
 mpi/all-gather/frontier/64_gcd_run.sh         |  21 ++
 mpi/all-gather/frontier/8_gcd_run.sh          |  21 ++
 .../frontier/benchmarks/128_gcd.txt           |  13 +
 mpi/all-gather/frontier/benchmarks/16_gcd.txt |  13 +
 mpi/all-gather/frontier/benchmarks/32_gcd.txt |  15 +
 mpi/all-gather/frontier/benchmarks/64_gcd.txt |  14 +
 mpi/all-gather/frontier/benchmarks/8_gcd.txt  |  14 +
 mpi/all-gather/perlmutter/128_gpu_run.sh      |  37 +++
 mpi/all-gather/perlmutter/16_gpu_run.sh       |  37 +++
 mpi/all-gather/perlmutter/32_gpu_run.sh       |  37 +++
 mpi/all-gather/perlmutter/64_gpu_run.sh       |  37 +++
 mpi/all-gather/perlmutter/8_gpu_run.sh        |  37 +++
 .../perlmutter/benchmarks/128_gpu.txt         |  12 +
 .../perlmutter/benchmarks/16_gpu.txt          |  21 ++
 .../perlmutter/benchmarks/32_gpu.txt          |  14 +
 .../perlmutter/benchmarks/64_gpu.txt          |  13 +
 .../perlmutter/benchmarks/8_gpu.txt           |  13 +
 mpi/all-reduce/frontier/128_gcd_run.sh        |  21 ++
 mpi/all-reduce/frontier/16_gcd_run.sh         |  21 ++
 mpi/all-reduce/frontier/32_gcd_run.sh         |  21 ++
 mpi/all-reduce/frontier/64_gcd_run.sh         |  21 ++
 mpi/all-reduce/frontier/8_gcd_run.sh          |  21 ++
 .../frontier/benchmarks/128_gcd.txt           |  12 +
 mpi/all-reduce/frontier/benchmarks/16_gcd.txt |  12 +
 mpi/all-reduce/frontier/benchmarks/32_gcd.txt |  14 +
 mpi/all-reduce/frontier/benchmarks/64_gcd.txt |  13 +
 mpi/all-reduce/frontier/benchmarks/8_gcd.txt  |  13 +
 mpi/all-reduce/perlmutter/128_gpu_run.sh      |  37 +++
 mpi/all-reduce/perlmutter/16_gpu_run.sh       |  37 +++
 mpi/all-reduce/perlmutter/32_gpu_run.sh       |  37 +++
 mpi/all-reduce/perlmutter/64_gpu_run.sh       |  37 +++
 mpi/all-reduce/perlmutter/8_gpu_run.sh        |  37 +++
 .../perlmutter/benchmarks/16_gpu.txt          |  11 +
 .../perlmutter/benchmarks/32_gpu.txt          |  13 +
 .../perlmutter/benchmarks/8_gpu.txt           |  12 +
 mpi/reduce-scatter/frontier/128_gcd_run.sh    |  21 ++
 mpi/reduce-scatter/frontier/16_gcd_run.sh     |  21 ++
 mpi/reduce-scatter/frontier/32_gcd_run.sh     |  21 ++
 mpi/reduce-scatter/frontier/64_gcd_run.sh     |  21 ++
 mpi/reduce-scatter/frontier/8_gcd_run.sh      |  21 ++
 .../frontier/benchmarks/128_gcd.txt           |  13 +
 .../frontier/benchmarks/16_gcd.txt            |  13 +
 .../frontier/benchmarks/32_gcd.txt            |  15 +
 .../frontier/benchmarks/64_gcd.txt            |  17 ++
 .../frontier/benchmarks/8_gcd.txt             |  14 +
 mpi/reduce-scatter/perlmutter/128_gpu_run.sh  |  37 +++
 mpi/reduce-scatter/perlmutter/16_gpu_run.sh   |  37 +++
 mpi/reduce-scatter/perlmutter/32_gpu_run.sh   |  37 +++
 mpi/reduce-scatter/perlmutter/64_gpu_run.sh   |  37 +++
 mpi/reduce-scatter/perlmutter/8_gpu_run.sh    |  37 +++
 .../perlmutter/benchmarks/128_gpu.txt         |  12 +
 .../perlmutter/benchmarks/16_gpu.txt          |  12 +
 .../perlmutter/benchmarks/32_gpu.txt          |  14 +
 .../perlmutter/benchmarks/64_gpu.txt          |  13 +
 .../perlmutter/benchmarks/8_gpu.txt           |  13 +
 nccl/Makefile                                 |  25 ++
 nccl/all-gather/128_gpu_run.sh                |  37 +++
 nccl/all-gather/16_gpu_run.sh                 |  37 +++
 nccl/all-gather/32_gpu_run.sh                 |  37 +++
 nccl/all-gather/64_gpu_run.sh                 |  37 +++
 nccl/all-gather/8_gpu_run.sh                  |  37 +++
 nccl/all-gather/benchmarks/16_gpu.txt         |  13 +
 nccl/all-gather/benchmarks/32_gpu.txt         |  14 +
 nccl/all-gather/benchmarks/64_gpu.txt         |  13 +
 nccl/all-gather/benchmarks/8_gpu.txt          |  13 +
 nccl/all-reduce/128_gpu_run.sh                |  37 +++
 nccl/all-reduce/16_gpu_run.sh                 |  37 +++
 nccl/all-reduce/32_gpu_run.sh                 |  37 +++
 nccl/all-reduce/64_gpu_run.sh                 |  37 +++
 nccl/all-reduce/8_gpu_run.sh                  |  37 +++
 nccl/all-reduce/benchmarks/16_gpu.txt         |  12 +
 nccl/all-reduce/benchmarks/32_gpu.txt         |  14 +
 nccl/reduce-scatter/128_gpu_run.sh            |  37 +++
 nccl/reduce-scatter/16_gpu_run.sh             |  37 +++
 nccl/reduce-scatter/32_gpu_run.sh             |  37 +++
 nccl/reduce-scatter/64_gpu_run.sh             |  37 +++
 nccl/reduce-scatter/8_gpu_run.sh              |  37 +++
 nccl/reduce-scatter/benchmarks/128_gpu.txt    |  12 +
 nccl/reduce-scatter/benchmarks/16_gpu.txt     |  12 +
 nccl/reduce-scatter/benchmarks/32_gpu.txt     |  14 +
 nccl/reduce-scatter/benchmarks/64_gpu.txt     |  13 +
 nccl/reduce-scatter/benchmarks/8_gpu.txt      |  13 +
 rccl/Makefile                                 |  25 ++
 reduce_scatter.cu                             | 269 ++++++++++++++++++
 93 files changed, 2842 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 allgather.cu
 create mode 100644 allreduce.cu
 create mode 100644 mpi/Makefile
 create mode 100644 mpi/all-gather/frontier/128_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/16_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/32_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/64_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/8_gcd_run.sh
 create mode 100644 mpi/all-gather/frontier/benchmarks/128_gcd.txt
 create mode 100644 mpi/all-gather/frontier/benchmarks/16_gcd.txt
 create mode 100644 mpi/all-gather/frontier/benchmarks/32_gcd.txt
 create mode 100644 mpi/all-gather/frontier/benchmarks/64_gcd.txt
 create mode 100644 mpi/all-gather/frontier/benchmarks/8_gcd.txt
 create mode 100644 mpi/all-gather/perlmutter/128_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/16_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/32_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/64_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/8_gpu_run.sh
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 mpi/all-reduce/frontier/128_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/16_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/32_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/64_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/8_gcd_run.sh
 create mode 100644 mpi/all-reduce/frontier/benchmarks/128_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/16_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/32_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/64_gcd.txt
 create mode 100644 mpi/all-reduce/frontier/benchmarks/8_gcd.txt
 create mode 100644 mpi/all-reduce/perlmutter/128_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/16_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/32_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/64_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/8_gpu_run.sh
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 mpi/reduce-scatter/frontier/128_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/16_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/32_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/64_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/8_gcd_run.sh
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
 create mode 100644 mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/128_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/16_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/32_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/64_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/8_gpu_run.sh
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 nccl/Makefile
 create mode 100644 nccl/all-gather/128_gpu_run.sh
 create mode 100644 nccl/all-gather/16_gpu_run.sh
 create mode 100644 nccl/all-gather/32_gpu_run.sh
 create mode 100644 nccl/all-gather/64_gpu_run.sh
 create mode 100644 nccl/all-gather/8_gpu_run.sh
 create mode 100644 nccl/all-gather/benchmarks/16_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/32_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/64_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/8_gpu.txt
 create mode 100644 nccl/all-reduce/128_gpu_run.sh
 create mode 100644 nccl/all-reduce/16_gpu_run.sh
 create mode 100644 nccl/all-reduce/32_gpu_run.sh
 create mode 100644 nccl/all-reduce/64_gpu_run.sh
 create mode 100644 nccl/all-reduce/8_gpu_run.sh
 create mode 100644 nccl/all-reduce/benchmarks/16_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt
 create mode 100644 nccl/reduce-scatter/128_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/16_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/32_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/64_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/8_gpu_run.sh
 create mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt
 create mode 100644 rccl/Makefile
 create mode 100644 reduce_scatter.cu

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7882514
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.x
+*.out
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..9943369
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2024, Parallel Software and Systems Group, University of
+Maryland.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..526fb95
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+Before compiling do these:
+
+### Perlmutter
+```sh
+module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl
+export CRAY_ACCEL_TARGET=nvidia80
+export MPICH_GPU_SUPPORT_ENABLED=1
+```
+### Frontier
+```sh
+module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05
+export MPICH_GPU_SUPPORT_ENABLED=1
+export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+```
+
diff --git a/allgather.cu b/allgather.cu
new file mode 100644
index 0000000..8c357bb
--- /dev/null
+++ b/allgather.cu
@@ -0,0 +1,248 @@
+/* \file allgather.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <stdint.h>
+
+#ifdef USE_CUDA
+  #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #define __HIP_PLATFORM_AMD__
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do {                         \
+  int64_t e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
+#define NCCL_CHECK(cmd) do {                        \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(bfloat16 *data, int64_t size) {
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
+        #ifdef USE_CUDA
+        data[i] = __float2bfloat16((float)i);
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
+    }
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10); // 64-bit parse: byte sizes may exceed INT_MAX
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10); // 64-bit parse: byte sizes may exceed INT_MAX
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Initialize GPU context
+    #if USE_CUDA
+    CUDA_CHECK(cudaGetDeviceCount(&num_gpus_per_node)); // fail fast: the count is used as a modulus below
+    CUDA_CHECK(cudaSetDevice((my_rank % num_gpus_per_node)));
+    #elif USE_ROCM
+    HIP_CHECK(hipGetDeviceCount(&num_gpus_per_node)); // fail fast: the count is used as a modulus below
+    HIP_CHECK(hipSetDevice((my_rank % num_gpus_per_node)));
+    #endif
+
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size * num_gpus; // Size of global data
+
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
+
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    // Allocate memory on GPU
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        NCCL_CHECK(ncclGetUniqueId(&nccl_comm_id));
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL/RCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+    #endif
+
+    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark results
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(bfloat16);
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
+                
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
+            #endif
+        
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
+            #endif
+        }
+
+	if(msg_size >= 8388608)
+	    iterations = 20;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+	    MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
+		d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
+                
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
+            #endif
+        
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+	if (my_rank == 0)
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
+
+    #if !defined(USE_MPI) && (defined(USE_NCCL) || defined(USE_RCCL)) // same condition under which nccl_comm is declared
+    ncclCommDestroy(nccl_comm);
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
+
diff --git a/allreduce.cu b/allreduce.cu
new file mode 100644
index 0000000..111b254
--- /dev/null
+++ b/allreduce.cu
@@ -0,0 +1,262 @@
+/* \file allreduce.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <stdint.h>
+
+#ifdef USE_CUDA
+  #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #define __HIP_PLATFORM_AMD__
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do { /* exit on any non-success MPI return code */ \
+  int mpi_rc_ = (cmd); /* MPI routines return int */ \
+  if (mpi_rc_ != MPI_SUCCESS) {                     \
+    printf("Failed: MPI error %s:%d '%d'\n",        \
+        __FILE__, __LINE__, mpi_rc_);               \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do { /* exit on any CUDA runtime error */ \
+  const cudaError_t cuda_rc_ = (cmd);               \
+  if (cuda_rc_ != cudaSuccess) {                    \
+    printf("CUDA error  %s:%d: %s\n", __FILE__,     \
+        __LINE__, cudaGetErrorString(cuda_rc_));    \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define HIP_CHECK(cmd) do { /* exit on any HIP runtime error */ \
+  const hipError_t hip_rc_ = (cmd);                 \
+  if (hip_rc_ != hipSuccess) {                      \
+    printf("HIP error  %s:%d: %s\n", __FILE__,      \
+        __LINE__, hipGetErrorString(hip_rc_));      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK exits on any non-success ncclResult_t; it is used for RCCL calls as well
+#define NCCL_CHECK(cmd) do {                        \
+  const ncclResult_t nccl_rc_ = (cmd);              \
+  if (nccl_rc_ != ncclSuccess) {                    \
+    printf("NCCL error %s:%d %s\n", __FILE__,       \
+        __LINE__, ncclGetErrorString(nccl_rc_));    \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(bfloat16 *data, int64_t size) {
+    // `size` is a byte count; element k is set to the value k converted to bfloat16.
+    const size_t num_elems = size / sizeof(bfloat16);
+    for (int64_t k = 0; k < num_elems; ++k) {
+        #ifdef USE_CUDA
+        data[k] = __float2bfloat16((float)k);
+        #elif USE_ROCM
+        data[k] = (bfloat16) ((float) k);  // ROCm doesn't have a float2bfloat16 method
+        #endif
+    }
+}
+
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
+    // Element-wise bfloat16 sum over *len entries: inoutvec[k] = invec[k] + inoutvec[k].
+    bfloat16 *src = (bfloat16 *)invec, *acc = (bfloat16 *)inoutvec;
+    for (int k = 0; k < *len; ++k) {
+        #ifdef USE_CUDA
+        acc[k] = __hadd(src[k], acc[k]);
+        #elif USE_ROCM
+        acc[k] = src[k] + acc[k];
+        #endif
+    }
+}
+
+int main(int argc, char *argv[]) {
+    // Benchmark driver: times an all-reduce over bfloat16 device buffers for message sizes
+    // doubling from <min_msg_size> to <max_msg_size> bytes; rank 0 prints the mean time per call.
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes, num_gpus_per_node, msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Bind this rank to a GPU: rank modulo the device count reported by the runtime
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaGetDeviceCount(&num_gpus_per_node));
+    CUDA_CHECK(cudaSetDevice((my_rank % num_gpus_per_node)));
+    #elif USE_ROCM
+    HIP_CHECK(hipGetDeviceCount(&num_gpus_per_node));
+    HIP_CHECK(hipSetDevice((my_rank % num_gpus_per_node)));
+    #endif
+
+    int64_t local_data_size = max_msg_size; // Input buffer size in bytes (largest message)
+    int64_t global_data_size = local_data_size; // Output buffer size in bytes (same as input)
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
+
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
+    if (local_data == NULL || global_data == NULL) {
+        fprintf(stderr, "Failed to allocate host buffers.\n");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+    }
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    // define custom reduce operation for bfloat16 values (registered as commutative)
+    MPI_Op CUSTOM_SUM;
+    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        NCCL_CHECK(ncclGetUniqueId(&nccl_comm_id));
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL/RCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+    #endif
+
+    // Perform MPI_Iallreduce, NCCL allreduce, or RCCL allreduce
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print benchmark configuration
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+        msg_count = msg_size / sizeof(bfloat16);
+        // warmup iterations (not timed)
+        for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            #ifdef USE_CUDA
+            CUDA_CHECK(cudaDeviceSynchronize());
+            #elif USE_ROCM
+            HIP_CHECK(hipDeviceSynchronize());
+            #endif
+        }
+
+        if (msg_size >= 8388608)  // from 8 MiB upwards the timed loop always runs 20 iterations
+            iterations = 20;
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+        for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            #ifdef USE_CUDA
+            CUDA_CHECK(cudaDeviceSynchronize());
+            #elif USE_ROCM
+            HIP_CHECK(hipDeviceSynchronize());
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+        if (my_rank == 0)
+            printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup
+    free(local_data);
+    free(global_data);
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
+
+    #ifdef USE_MPI
+    MPI_Op_free(&CUSTOM_SUM);
+    MPI_Type_free(&mpi_type_bfloat16);
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
+    NCCL_CHECK(ncclCommDestroy(nccl_comm));
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
diff --git a/mpi/Makefile b/mpi/Makefile
new file mode 100644
index 0000000..12ed3bf
--- /dev/null
+++ b/mpi/Makefile
@@ -0,0 +1,30 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC = cc
+
+# perlmutter flags (CUDA build of the MPI variant: -DUSE_CUDA -DUSE_MPI)
+INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI
+LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+
+# frontier flags (ROCm build of the MPI variant; uncomment to replace the block above)
+# INC = -I${ROCM_PATH}/include
+# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI
+# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+
+all: allgather.x allreduce.x reduce_scatter.x
+
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} -o all-gather/allgather.x ../allgather.cu ${LDFLAGS}
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} -o all-reduce/allreduce.x ../allreduce.cu ${LDFLAGS}
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu ${LDFLAGS}
+
+clean: 
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh
new file mode 100644
index 0000000..4e8c955
--- /dev/null
+++ b/mpi/all-gather/frontier/128_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 15:00
+#SBATCH -N 16
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 16))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh
new file mode 100644
index 0000000..bb2429f
--- /dev/null
+++ b/mpi/all-gather/frontier/16_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 128))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh
new file mode 100644
index 0000000..e630b97
--- /dev/null
+++ b/mpi/all-gather/frontier/32_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 15:00
+#SBATCH -N 4
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh
new file mode 100644
index 0000000..e7c707f
--- /dev/null
+++ b/mpi/all-gather/frontier/64_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 15:00
+#SBATCH -N 8
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh
new file mode 100644
index 0000000..563f933
--- /dev/null
+++ b/mpi/all-gather/frontier/8_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 10:00
+#SBATCH -N 1
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt
new file mode 100644
index 0000000..824b380
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/128_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10
+  0: Local data size: 16
+  0: Global data size: 2048
+  0: Number of GPUs: 128
+  0: Message size range: 262144 - 16777216
+  0: Number of iterations: 10
+  0: 262144 0.003748 seconds
+  0: 524288 0.005048 seconds
+  0: 1048576 0.008068 seconds
+  0: 2097152 0.014084 seconds
+  0: 4194304 0.026981 seconds
+  0: 8388608 0.051879 seconds
+  0: 16777216 0.255600 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt
new file mode 100644
index 0000000..35a9e26
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/16_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10
+ 0: Local data size: 128
+ 0: Global data size: 2048
+ 0: Number of GPUs: 16
+ 0: Message size range: 2097152 - 134217728
+ 0: Number of iterations: 10
+ 0: 2097152 0.002249 seconds
+ 0: 4194304 0.003148 seconds
+ 0: 8388608 0.006062 seconds
+ 0: 16777216 0.011871 seconds
+ 0: 33554432 0.023485 seconds
+ 0: 67108864 0.046822 seconds
+ 0: 134217728 0.139763 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt
new file mode 100644
index 0000000..f758360
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/32_gcd.txt
@@ -0,0 +1,15 @@
+srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10
+ 0: Local data size: 64
+ 0: Global data size: 2048
+ 0: Number of GPUs: 32
+ 0: Message size range: 262144 - 67108864
+ 0: Number of iterations: 10
+ 0: 262144 0.000783 seconds
+ 0: 524288 0.001513 seconds
+ 0: 1048576 0.002953 seconds
+ 0: 2097152 0.003404 seconds
+ 0: 4194304 0.006485 seconds
+ 0: 8388608 0.012489 seconds
+ 0: 16777216 0.024484 seconds
+ 0: 33554432 0.048460 seconds
+ 0: 67108864 0.185884 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt
new file mode 100644
index 0000000..3eed822
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/64_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10
+ 0: Local data size: 32
+ 0: Global data size: 2048
+ 0: Number of GPUs: 64
+ 0: Message size range: 262144 - 33554432
+ 0: Number of iterations: 10
+ 0: 262144 0.001685 seconds
+ 0: 524288 0.003350 seconds
+ 0: 1048576 0.003938 seconds
+ 0: 2097152 0.006864 seconds
+ 0: 4194304 0.013037 seconds
+ 0: 8388608 0.025167 seconds
+ 0: 16777216 0.049414 seconds
+ 0: 33554432 0.211224 seconds
diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt
new file mode 100644
index 0000000..7856a16
--- /dev/null
+++ b/mpi/all-gather/frontier/benchmarks/8_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10
+0: Local data size: 256
+0: Global data size: 2048
+0: Number of GPUs: 8
+0: Message size range: 2097152 - 268435456
+0: Number of iterations: 10
+0: 2097152 0.000505 seconds
+0: 4194304 0.000856 seconds
+0: 8388608 0.001645 seconds
+0: 16777216 0.003223 seconds
+0: 33554432 0.006379 seconds
+0: 67108864 0.012691 seconds
+0: 134217728 0.025316 seconds
+0: 268435456 0.053944 seconds
diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh
new file mode 100644
index 0000000..30fd2fc
--- /dev/null
+++ b/mpi/all-gather/perlmutter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # bytes; 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 16))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt"
+
+echo "$run_cmd"
+eval "$run_cmd"
+set +x
diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh
new file mode 100644
index 0000000..813b192
--- /dev/null
+++ b/mpi/all-gather/perlmutter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # bytes; 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 128))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt"
+
+echo "$run_cmd"
+eval "$run_cmd"
+set +x
diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh
new file mode 100644
index 0000000..aad7f68
--- /dev/null
+++ b/mpi/all-gather/perlmutter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # bytes; 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt"
+
+echo "$run_cmd"
+eval "$run_cmd"
+set +x
diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh
new file mode 100644
index 0000000..4897de4
--- /dev/null
+++ b/mpi/all-gather/perlmutter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # bytes; 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt"
+
+echo "$run_cmd"
+eval "$run_cmd"
+set +x
diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh
new file mode 100644
index 0000000..3a454cf
--- /dev/null
+++ b/mpi/all-gather/perlmutter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # bytes; 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt"
+
+echo "$run_cmd"
+eval "$run_cmd"
+set +x
diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..3c16468
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 16
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 262144 - 16777216
+Number of iterations: 10
+262144 0.003218 seconds
+524288 0.005240 seconds
+1048576 0.008649 seconds
+2097152 0.015703 seconds
+4194304 0.030562 seconds
+8388608 0.060407 seconds
+16777216 0.190813 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..9dc96cf
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,21 @@
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+srun: error: nid002072: tasks 8-11: Exited with exit code 2
+srun: Terminating StepId=27970493.0
+srun: error: nid002073: tasks 12-15: Exited with exit code 2
+srun: error: nid001572: tasks 4-7: Exited with exit code 2
+srun: error: nid001569: tasks 0-3: Exited with exit code 2
diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..754e581
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 64
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 262144 - 67108864
+Number of iterations: 10
+262144 0.000744 seconds
+524288 0.001397 seconds
+1048576 0.002723 seconds
+2097152 0.003728 seconds
+4194304 0.007619 seconds
+8388608 0.014516 seconds
+16777216 0.030634 seconds
+33554432 0.063410 seconds
+67108864 0.172556 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..cd13b86
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.001523 seconds
+524288 0.003143 seconds
+1048576 0.004237 seconds
+2097152 0.008015 seconds
+4194304 0.015194 seconds
+8388608 0.029697 seconds
+16777216 0.063139 seconds
+33554432 0.184281 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..e010f99
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000888 seconds
+4194304 0.001690 seconds
+8388608 0.003195 seconds
+16777216 0.006815 seconds
+33554432 0.013828 seconds
+67108864 0.028031 seconds
+134217728 0.055406 seconds
+268435456 0.104231 seconds
diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh
new file mode 100644
index 0000000..5c6baf5
--- /dev/null
+++ b/mpi/all-reduce/frontier/128_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 16
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh
new file mode 100644
index 0000000..e1ad604
--- /dev/null
+++ b/mpi/all-reduce/frontier/16_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 2
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
+#SBATCH -C nvme
+
+## calculating the number of nodes and GPUs
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine
+export GPUS=$(( NNODES * GPUS_PER_NODE )) 
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" 
+
+echo $run_cmd
+eval $run_cmd
+set +x
diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh
new file mode 100644
index 0000000..be7bdd9
--- /dev/null
+++ b/mpi/all-reduce/frontier/32_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 4
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh
new file mode 100644
index 0000000..a8e13d2
--- /dev/null
+++ b/mpi/all-reduce/frontier/64_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 8
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh
new file mode 100644
index 0000000..81ffbc4
--- /dev/null
+++ b/mpi/all-reduce/frontier/8_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 1
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
new file mode 100644
index 0000000..56c18aa
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt
@@ -0,0 +1,12 @@
+srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10
+  0: Local data size: 1024
+  0: Global data size: 1024
+  0: Number of GPUs: 128
+  0: Message size range: 33554432 - 1073741824
+  0: Number of iterations: 10
+  0: 33554432 0.240206 seconds
+  0: 67108864 0.476990 seconds
+  0: 134217728 1.041500 seconds
+  0: 268435456 2.951969 seconds
+  0: 536870912 5.990606 seconds
+  0: 1073741824 12.004613 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
new file mode 100644
index 0000000..609afbd
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt
@@ -0,0 +1,12 @@
+srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10
+ 0: Local data size: 1024
+ 0: Global data size: 1024
+ 0: Number of GPUs: 16
+ 0: Message size range: 33554432 - 1073741824
+ 0: Number of iterations: 10
+ 0: 33554432 0.133082 seconds
+ 0: 67108864 0.267616 seconds
+ 0: 134217728 0.634895 seconds
+ 0: 268435456 1.928400 seconds
+ 0: 536870912 3.973167 seconds
+ 0: 1073741824 7.913018 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
new file mode 100644
index 0000000..b92c437
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10
+ 0: Local data size: 1024
+ 0: Global data size: 1024
+ 0: Number of GPUs: 32
+ 0: Message size range: 8388608 - 1073741824
+ 0: Number of iterations: 10
+ 0: 8388608 0.043066 seconds
+ 0: 16777216 0.084259 seconds
+ 0: 33554432 0.167705 seconds
+ 0: 67108864 0.336696 seconds
+ 0: 134217728 0.773389 seconds
+ 0: 268435456 2.284815 seconds
+ 0: 536870912 4.693147 seconds
+ 0: 1073741824 9.356859 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
new file mode 100644
index 0000000..122c83e
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10
+ 0: Local data size: 1024
+ 0: Global data size: 1024
+ 0: Number of GPUs: 64
+ 0: Message size range: 16777216 - 1073741824
+ 0: Number of iterations: 10
+ 0: 16777216 0.101777 seconds
+ 0: 33554432 0.203258 seconds
+ 0: 67108864 0.406569 seconds
+ 0: 134217728 0.913391 seconds
+ 0: 268435456 2.633732 seconds
+ 0: 536870912 5.375804 seconds
+ 0: 1073741824 10.708706 seconds
diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
new file mode 100644
index 0000000..a9b69c1
--- /dev/null
+++ b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10
+0: Local data size: 1024
+0: Global data size: 1024
+0: Number of GPUs: 8
+0: Message size range: 16777216 - 1073741824
+0: Number of iterations: 10
+0: 16777216 0.049728 seconds
+0: 33554432 0.099497 seconds
+0: 67108864 0.202129 seconds
+0: 134217728 0.500335 seconds
+0: 268435456 1.560791 seconds
+0: 536870912 3.265382 seconds
+0: 1073741824 6.500534 seconds
diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh
new file mode 100644
index 0000000..3438061
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 20:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh
new file mode 100644
index 0000000..33962b7
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 15:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh
new file mode 100644
index 0000000..fcad983
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 20:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh
new file mode 100644
index 0000000..cd5b8fa
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 20:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh
new file mode 100644
index 0000000..ddf1050
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 15:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 1024))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..76b174e
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,11 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 16
+Message size range: 33554432 - 1073741824
+Number of iterations: 10
+33554432 0.145773 seconds
+67108864 0.327744 seconds
+134217728 0.680940 seconds
+268435456 2.172019 seconds
+536870912 4.377939 seconds
+1073741824 8.740797 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..c7d90db
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 32
+Message size range: 8388608 - 1073741824
+Number of iterations: 10
+8388608 0.050947 seconds
+16777216 0.093279 seconds
+33554432 0.183651 seconds
+67108864 0.368861 seconds
+134217728 0.804120 seconds
+268435456 2.351269 seconds
+536870912 4.727807 seconds
+1073741824 9.445482 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..43c1c73
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 8
+Message size range: 16777216 - 1073741824
+Number of iterations: 10
+16777216 0.056679 seconds
+33554432 0.108849 seconds
+67108864 0.216523 seconds
+134217728 0.510124 seconds
+268435456 1.547371 seconds
+536870912 3.104556 seconds
+1073741824 6.214916 seconds
diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh
new file mode 100644
index 0000000..b6505f8
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/128_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 16
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh
new file mode 100644
index 0000000..eb6b2ba
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/16_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 2
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh
new file mode 100644
index 0000000..4ed3437
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/32_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 4
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh
new file mode 100644
index 0000000..a5a9957
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/64_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 8
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh
new file mode 100644
index 0000000..9d4191c
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/8_gcd_run.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#SBATCH -p batch
+#SBATCH -A CSC569
+#SBATCH -t 20:00
+#SBATCH -N 1
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
+#SBATCH -C nvme
+
+## Size the run from the Slurm allocation: total task/GPU count = nodes * GPUS_PER_NODE.
+export NNODES=$SLURM_JOB_NUM_NODES
+export GPUS_PER_NODE=8 ## change as per your machine; the per-node srun flags below follow it
+export GPUS=$(( NNODES * GPUS_PER_NODE ))
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=$GPUS_PER_NODE --gpus-per-node=$GPUS_PER_NODE $SCRIPT"
+
+echo "$run_cmd" # log the exact launch line ahead of the benchmark output
+eval "$run_cmd"
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
new file mode 100644
index 0000000..af5e98a
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10
+  0: Local data size: 2048
+  0: Global data size: 2048
+  0: Number of GPUs: 128
+  0: Message size range: 33554432 - 2147483648
+  0: Number of iterations: 10
+  0: 33554432 5.046207 seconds
+  0: 67108864 5.031027 seconds
+  0: 134217728 5.063647 seconds
+  0: 268435456 5.054240 seconds
+  0: 536870912 5.047598 seconds
+  0: 1073741824 5.051536 seconds
+  0: 2147483648 5.057082 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
new file mode 100644
index 0000000..fa9c67a
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt
@@ -0,0 +1,13 @@
+srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 10
+ 0: Local data size: 2048
+ 0: Global data size: 2048
+ 0: Number of GPUs: 16
+ 0: Message size range: 33554432 - 2147483648
+ 0: Number of iterations: 10
+ 0: 33554432 5.091016 seconds
+ 0: 67108864 5.092117 seconds
+ 0: 134217728 5.082377 seconds
+ 0: 268435456 5.103443 seconds
+ 0: 536870912 5.102289 seconds
+ 0: 1073741824 5.116191 seconds
+ 0: 2147483648 5.115768 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
new file mode 100644
index 0000000..23a0ace
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt
@@ -0,0 +1,15 @@
+srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10
+ 0: Local data size: 2048
+ 0: Global data size: 2048
+ 0: Number of GPUs: 32
+ 0: Message size range: 8388608 - 2147483648
+ 0: Number of iterations: 10
+ 0: 8388608 5.006776 seconds
+ 0: 16777216 4.981770 seconds
+ 0: 33554432 5.014587 seconds
+ 0: 67108864 4.994224 seconds
+ 0: 134217728 4.977063 seconds
+ 0: 268435456 4.980235 seconds
+ 0: 536870912 5.007770 seconds
+ 0: 1073741824 5.013561 seconds
+ 0: 2147483648 5.015718 seconds
diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
new file mode 100644
index 0000000..560c383
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt
@@ -0,0 +1,17 @@
+srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10
+ 0: Local data size: 2048
+ 0: Global data size: 2048
+ 0: Number of GPUs: 64
+ 0: Message size range: 16777216 - 2147483648
+ 0: Number of iterations: 10
+ 0: 16777216 5.006610 seconds
+ 0: 33554432 4.998351 seconds
+ 0: 67108864 5.003749 seconds
+ 0: 134217728 5.066133 seconds
+ 0: 268435456 4.980950 seconds
+ 0: 536870912 4.982830 seconds
+ 0: 1073741824 5.023178 seconds
+ 0: 2147483648 4.988750 seconds
+ 0: 
+ 0: MPICH Slingshot Network Summary: 4 network timeouts
+ 0: 
diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
new file mode 100644
index 0000000..493d5ee
--- /dev/null
+++ b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt
@@ -0,0 +1,14 @@
+srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10
+0: Local data size: 2048
+0: Global data size: 2048
+0: Number of GPUs: 8
+0: Message size range: 16777216 - 2147483648
+0: Number of iterations: 10
+0: 16777216 5.130130 seconds
+0: 33554432 5.120491 seconds
+0: 67108864 5.115654 seconds
+0: 134217728 5.128319 seconds
+0: 268435456 5.111989 seconds
+0: 536870912 5.115996 seconds
+0: 1073741824 5.127237 seconds
+0: 2147483648 5.116940 seconds
diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
new file mode 100644
index 0000000..469aeaf
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
new file mode 100644
index 0000000..e66b9f4
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
new file mode 100644
index 0000000..07d6020
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 30:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
new file mode 100644
index 0000000..e51945a
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
new file mode 100644
index 0000000..1b51537
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 30:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 )) # 4 per node, matching --ntasks-per-node in the header and --gpus-per-node below
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt"
+
+echo "$run_cmd" # log the exact launch line, including the output redirection
+eval "$run_cmd" # eval is what makes the '>&' stored in run_cmd act as a redirection
+exit $? # hand srun's exit status to Slurm so a failed run is not recorded as a success
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..d696072
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 3.352414 seconds
+67108864 3.323000 seconds
+134217728 3.331817 seconds
+268435456 3.327162 seconds
+536870912 3.345694 seconds
+1073741824 3.326455 seconds
+2147483648 3.321790 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..b71477d
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 3.368300 seconds
+67108864 3.361940 seconds
+134217728 3.367816 seconds
+268435456 3.360722 seconds
+536870912 3.363088 seconds
+1073741824 3.392373 seconds
+2147483648 3.375325 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..38e09b1
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 3.368554 seconds
+16777216 3.367485 seconds
+33554432 3.376475 seconds
+67108864 3.381592 seconds
+134217728 3.384111 seconds
+268435456 3.375780 seconds
+536870912 3.371542 seconds
+1073741824 3.379895 seconds
+2147483648 3.381470 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..d982100
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 2.220629 seconds
+33554432 2.201147 seconds
+67108864 2.196879 seconds
+134217728 2.199449 seconds
+268435456 2.194973 seconds
+536870912 2.196809 seconds
+1073741824 2.196212 seconds
+2147483648 2.201029 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..d2bdd9a
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 3.558431 seconds
+33554432 3.553477 seconds
+67108864 3.562137 seconds
+134217728 3.556267 seconds
+268435456 3.551567 seconds
+536870912 3.599067 seconds
+1073741824 3.608635 seconds
+2147483648 3.624090 seconds
diff --git a/nccl/Makefile b/nccl/Makefile
new file mode 100644
index 0000000..d4423b4
--- /dev/null
+++ b/nccl/Makefile
@@ -0,0 +1,25 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC = cc
+
+# perlmutter flags (LDFLAGS also carries the -l libraries, so recipes pass it after the source file)
+INC = -I/global/common/software/nersc9/nccl/2.19.4/include
+CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL
+LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl
+.PHONY: all clean allgather.x allreduce.x reduce_scatter.x
+all: allgather.x allreduce.x reduce_scatter.x
+# Each *.x recipe writes its binary into a subdirectory, so the target name itself is never a file here.
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} -o all-gather/allgather.x ../allgather.cu ${LDFLAGS}
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} -o all-reduce/allreduce.x ../allreduce.cu ${LDFLAGS}
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu ${LDFLAGS}
+
+clean:
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh
new file mode 100644
index 0000000..82998f7
--- /dev/null
+++ b/nccl/all-gather/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh
new file mode 100644
index 0000000..47b5f7c
--- /dev/null
+++ b/nccl/all-gather/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh
new file mode 100644
index 0000000..5459a34
--- /dev/null
+++ b/nccl/all-gather/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 64))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh
new file mode 100644
index 0000000..2ad7e3a
--- /dev/null
+++ b/nccl/all-gather/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 32))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh
new file mode 100644
index 0000000..55e05f8
--- /dev/null
+++ b/nccl/all-gather/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 256))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..22b1d19
--- /dev/null
+++ b/nccl/all-gather/benchmarks/16_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 4096
+Number of GPUs: 16
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000546 seconds
+4194304 0.000963 seconds
+8388608 0.001810 seconds
+16777216 0.003587 seconds
+33554432 0.006843 seconds
+67108864 0.013602 seconds
+134217728 0.026932 seconds
+268435456 0.052715 seconds
diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..da3b81b
--- /dev/null
+++ b/nccl/all-gather/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 64
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 262144 - 67108864
+Number of iterations: 10
+262144 0.000531 seconds
+524288 0.000602 seconds
+1048576 0.000700 seconds
+2097152 0.001056 seconds
+4194304 0.001907 seconds
+8388608 0.003960 seconds
+16777216 0.006958 seconds
+33554432 0.014047 seconds
+67108864 0.027585 seconds
diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..f05957a
--- /dev/null
+++ b/nccl/all-gather/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.001041 seconds
+524288 0.001212 seconds
+1048576 0.001357 seconds
+2097152 0.002122 seconds
+4194304 0.003750 seconds
+8388608 0.007686 seconds
+16777216 0.014414 seconds
+33554432 0.028307 seconds
diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..9d9c99f
--- /dev/null
+++ b/nccl/all-gather/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 256
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 2097152 - 268435456
+Number of iterations: 10
+2097152 0.000298 seconds
+4194304 0.000477 seconds
+8388608 0.000903 seconds
+16777216 0.001661 seconds
+33554432 0.003230 seconds
+67108864 0.006674 seconds
+134217728 0.012419 seconds
+268435456 0.024550 seconds
diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh
new file mode 100644
index 0000000..591cdf3
--- /dev/null
+++ b/nccl/all-reduce/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh
new file mode 100644
index 0000000..9232407
--- /dev/null
+++ b/nccl/all-reduce/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh
new file mode 100644
index 0000000..7130fa8
--- /dev/null
+++ b/nccl/all-reduce/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh
new file mode 100644
index 0000000..057637f
--- /dev/null
+++ b/nccl/all-reduce/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh
new file mode 100644
index 0000000..be7f5f1
--- /dev/null
+++ b/nccl/all-reduce/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..a866d54
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.001007 seconds
+67108864 0.001788 seconds
+134217728 0.003634 seconds
+268435456 0.006935 seconds
+536870912 0.013610 seconds
+1073741824 0.027019 seconds
+2147483648 0.052864 seconds
diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..a20b1cd
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.001052 seconds
+16777216 0.001220 seconds
+33554432 0.001356 seconds
+67108864 0.002028 seconds
+134217728 0.003714 seconds
+268435456 0.007242 seconds
+536870912 0.013809 seconds
+1073741824 0.027274 seconds
+2147483648 0.054261 seconds
diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh
new file mode 100644
index 0000000..8590821
--- /dev/null
+++ b/nccl/reduce-scatter/128_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 32
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+BENCH_ROOT="${BENCH_ROOT:-/global/homes/a/adityat/gpu-benchmarks}" # checkout root; set BENCH_ROOT to run from another location
+SCRIPT="$BENCH_ROOT/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $BENCH_ROOT/nccl/reduce-scatter/benchmarks/128_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh
new file mode 100644
index 0000000..7a20fa6
--- /dev/null
+++ b/nccl/reduce-scatter/16_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 4
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+BENCH_ROOT="${BENCH_ROOT:-/global/homes/a/adityat/gpu-benchmarks}" # checkout root; set BENCH_ROOT to run from another location
+SCRIPT="$BENCH_ROOT/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $BENCH_ROOT/nccl/reduce-scatter/benchmarks/16_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh
new file mode 100644
index 0000000..3d297ff
--- /dev/null
+++ b/nccl/reduce-scatter/32_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 8
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+BENCH_ROOT="${BENCH_ROOT:-/global/homes/a/adityat/gpu-benchmarks}" # checkout root; set BENCH_ROOT to run from another location
+SCRIPT="$BENCH_ROOT/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $BENCH_ROOT/nccl/reduce-scatter/benchmarks/32_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh
new file mode 100644
index 0000000..6bbf97a
--- /dev/null
+++ b/nccl/reduce-scatter/64_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 16
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+BENCH_ROOT="${BENCH_ROOT:-/global/homes/a/adityat/gpu-benchmarks}" # checkout root; set BENCH_ROOT to run from another location
+SCRIPT="$BENCH_ROOT/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $BENCH_ROOT/nccl/reduce-scatter/benchmarks/64_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh
new file mode 100644
index 0000000..21c0dc4
--- /dev/null
+++ b/nccl/reduce-scatter/8_gpu_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#SBATCH -A m4641_g
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 10:00
+#SBATCH -N 2
+#SBATCH --ntasks-per-node=4
+#SBATCH -c 32
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=none
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 4 ))
+export WORLD_SIZE=$GPUS
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export CUDA_VISIBLE_DEVICES=3,2,1,0
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn
+export NCCL_NET="AWS Libfabric"
+export FI_CXI_RDZV_THRESHOLD=0
+export FI_CXI_RDZV_GET_MIN=0
+export FI_CXI_OFLOW_BUF_SIZE=1073741824
+export FI_CXI_OFLOW_BUF_COUNT=1
+
+MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
+MAX_MSG_SIZE=$((1048576 * 2048))
+BENCH_ROOT="${BENCH_ROOT:-/global/homes/a/adityat/gpu-benchmarks}" # checkout root; set BENCH_ROOT to run from another location
+SCRIPT="$BENCH_ROOT/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" # args: GPU count, min/max message size, iterations
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $BENCH_ROOT/nccl/reduce-scatter/benchmarks/8_gpu.txt" # stdout+stderr go to the results file
+
+echo "$run_cmd" # log the exact command line
+eval "$run_cmd" # eval so the >& redirection stored in the string takes effect
+exit $? # report srun's exit status as the job's status
diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..7c1c8f9
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.028300 seconds
+67108864 0.028351 seconds
+134217728 0.028351 seconds
+268435456 0.028502 seconds
+536870912 0.028579 seconds
+1073741824 0.028650 seconds
+2147483648 0.028506 seconds
diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..14acf87
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.033170 seconds
+67108864 0.033280 seconds
+134217728 0.033220 seconds
+268435456 0.033291 seconds
+536870912 0.033217 seconds
+1073741824 0.033158 seconds
+2147483648 0.033275 seconds
diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..7eecc67
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.027121 seconds
+16777216 0.027661 seconds
+33554432 0.027766 seconds
+67108864 0.027992 seconds
+134217728 0.027914 seconds
+268435456 0.027912 seconds
+536870912 0.027777 seconds
+1073741824 0.027861 seconds
+2147483648 0.027551 seconds
diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..8f8ddd0
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.028306 seconds
+33554432 0.028511 seconds
+67108864 0.028175 seconds
+134217728 0.027998 seconds
+268435456 0.027883 seconds
+536870912 0.027802 seconds
+1073741824 0.027954 seconds
+2147483648 0.028085 seconds
diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..26c22b6
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.024231 seconds
+33554432 0.024389 seconds
+67108864 0.024167 seconds
+134217728 0.024047 seconds
+268435456 0.024293 seconds
+536870912 0.024031 seconds
+1073741824 0.024048 seconds
+2147483648 0.024241 seconds
diff --git a/rccl/Makefile b/rccl/Makefile
new file mode 100644
index 0000000..aa0a7b9
--- /dev/null
+++ b/rccl/Makefile
@@ -0,0 +1,25 @@
+# Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+# See the top-level LICENSE file for details.
+# 
+# SPDX-License-Identifier: MIT
+
+CC = cc
+
+# frontier flags (LDFLAGS also carries the -l libraries, so recipes pass it after the source file)
+INC = -I${ROCM_PATH}/include
+CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL
+LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl
+.PHONY: all clean allgather.x allreduce.x reduce_scatter.x
+all: allgather.x allreduce.x reduce_scatter.x
+# Each *.x recipe writes its binary into a subdirectory, so the target name itself is never a file here.
+allgather.x: ../allgather.cu
+	${CC} ${CFLAGS} ${INC} -o all-gather/allgather.x ../allgather.cu ${LDFLAGS}
+
+allreduce.x: ../allreduce.cu
+	${CC} ${CFLAGS} ${INC} -o all-reduce/allreduce.x ../allreduce.cu ${LDFLAGS}
+
+reduce_scatter.x: ../reduce_scatter.cu
+	${CC} ${CFLAGS} ${INC} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu ${LDFLAGS}
+
+clean:
+	rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x
diff --git a/reduce_scatter.cu b/reduce_scatter.cu
new file mode 100644
index 0000000..99fc950
--- /dev/null
+++ b/reduce_scatter.cu
@@ -0,0 +1,269 @@
+/* \file reduce_scatter.cu
+ * Copyright 2024 Parallel Software and Systems Group, University of Maryland.
+ * See the top-level LICENSE file for details.
+ * 
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <stdint.h>
+
+#ifdef USE_CUDA
+  #include <cuda_bf16.h>
+  #define bfloat16 nv_bfloat16
+#elif USE_ROCM
+  #define __HIP_PLATFORM_AMD__
+  #include <hip/hip_bfloat16.h>
+  #include <hip/hip_runtime.h>
+  #include <hip/hip_runtime_api.h>
+  #define bfloat16 hip_bfloat16
+#endif
+
+#ifdef USE_NCCL
+  #include "nccl.h"
+#elif USE_RCCL
+  #include <rccl/rccl.h> 
+#endif
+
+#define NUM_WARMUP_ITERATIONS		5
+
+#define MPI_CHECK(cmd) do {                         \
+  int64_t e = cmd;                                      \
+  if( e != MPI_SUCCESS ) {                          \
+    printf("Failed: MPI error %s:%d '%ld'\n",        \
+        __FILE__,__LINE__, e);                      \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define CUDA_CHECK(cmd) do {                        \
+  cudaError_t e = cmd;                              \
+  if(e != cudaSuccess) {                            \
+    printf("CUDA error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, cudaGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define HIP_CHECK(cmd) do {                        \
+  hipError_t e = cmd;                              \
+  if(e != hipSuccess) {                            \
+    printf("HIP error  %s:%d: %s\n",               \
+        __FILE__, __LINE__, hipGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+// NCCL_CHECK is used to validate RCCL functions as well
+#define NCCL_CHECK(cmd) do {                        \
+  ncclResult_t e = cmd;                             \
+  if (e != ncclSuccess) {                           \
+    printf("NCCL error %s:%d %s\n",                 \
+        __FILE__, __LINE__, ncclGetErrorString(e)); \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void initializeData(bfloat16 *data, int64_t size) {  // fills a host buffer; size is a byte count, not an element count
+    for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {  // one pass over every bfloat16 slot
+        #ifdef USE_CUDA
+        data[i] = __float2bfloat16((float)i);  // each element holds its own index, converted through float
+        #elif USE_ROCM
+        // ROCm doesn't have a float2bfloat16 method
+        data[i] = (bfloat16) ((float) i);
+        #endif
+    }
+}
+
+void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {  // reduction callback passed to MPI_Op_create in main; datatype is unused
+    bfloat16* in = (bfloat16*) invec;
+    bfloat16* inout = (bfloat16*) inoutvec;
+    for (int i = 0; i < *len; i++) {  // *len is treated as the number of bfloat16 elements
+        #ifdef USE_CUDA
+        inout[i] = __hadd(in[i], inout[i]);  // element-wise sum, accumulated in place into inout
+        #elif USE_ROCM
+        inout[i] = in[i] + inout[i];
+        #endif
+    }
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 5) {
+        fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]);
+        return EXIT_FAILURE;
+    }
+
+    int num_gpus = atoi(argv[1]);
+    int64_t min_msg_size = strtoll(argv[2], NULL, 10);
+    int64_t max_msg_size = strtoll(argv[3], NULL, 10);
+    int iterations = atoi(argv[4]);
+
+    if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
+        fprintf(stderr, "Invalid input parameters.\n");
+        return EXIT_FAILURE;
+    }
+
+    int my_rank, num_pes;
+    int num_gpus_per_node;
+    int msg_count;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
+
+    if (num_pes != num_gpus) {
+        fprintf(stderr, "Number of processes must match number of GPUs.\n");
+        MPI_Finalize();
+        return EXIT_FAILURE;
+    }
+
+    // Select this rank's GPU: rank modulo the device count reported by the runtime (checked, since the count feeds the modulo)
+    #if USE_CUDA
+    CUDA_CHECK(cudaGetDeviceCount(&num_gpus_per_node));
+    CUDA_CHECK(cudaSetDevice((my_rank % num_gpus_per_node)));
+    #elif USE_ROCM
+    HIP_CHECK(hipGetDeviceCount(&num_gpus_per_node));
+    HIP_CHECK(hipSetDevice((my_rank % num_gpus_per_node)));
+    #endif
+
+    int64_t local_data_size = max_msg_size; // Size of local data
+    int64_t global_data_size = local_data_size; // Size of global data
+
+    if (my_rank == 0) {
+        fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
+        fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
+    }
+
+    bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
+    bfloat16 *global_data = (bfloat16*)malloc(global_data_size);
+
+    // Initialize local data
+    initializeData(local_data, local_data_size);
+
+    bfloat16 *d_local_data, *d_global_data;
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
+    CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));
+    // Copy local data to GPU
+    CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));
+
+    #elif USE_ROCM
+    HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
+    HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
+    HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
+    #endif
+
+    #ifdef USE_MPI
+    // create 2-byte datatype (send raw, un-interpreted bytes)
+    MPI_Datatype mpi_type_bfloat16;
+    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
+    MPI_Type_commit(&mpi_type_bfloat16);
+
+    // define custom reduce operation for nv_bfloat16 types
+    MPI_Op CUSTOM_SUM;
+    MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM);
+
+    #elif defined(USE_NCCL) || defined(USE_RCCL)
+    ncclUniqueId nccl_comm_id;
+    ncclComm_t nccl_comm;
+
+    if (my_rank == 0) {
+        /* Generates an Id to be used in ncclCommInitRank. */
+        NCCL_CHECK(ncclGetUniqueId(&nccl_comm_id));
+    }
+
+    /* distribute nccl_comm_id to all ranks */
+    MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
+                        0, MPI_COMM_WORLD));
+
+    /* Create a new NCCL/RCCL communicator */
+    NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));
+    #endif
+
+    // init recvcounts to send an equal portion of data from the reduce operation
+    int num_elements = local_data_size / sizeof(bfloat16);
+    int portion = num_elements / num_pes;
+    int *recvcounts = (int*) malloc(sizeof(int) * num_pes);
+    for (int i = 0; i < num_pes; i++) 
+        recvcounts[i] = portion;
+
+    // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
+    double total_time, start_time;
+    MPI_Request request;
+    MPI_Status status;
+
+    // Print the run configuration header (rank 0 only)
+    if (my_rank == 0) {
+        printf("Number of GPUs: %d\n", num_gpus);
+        printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
+        printf("Number of iterations: %d\n", iterations);
+    }
+    fflush(NULL);
+
+    for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
+	msg_count = msg_size / sizeof(bfloat16);
+	// warmup iterations
+	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
+            #endif
+        }
+
+	if(msg_size >= 8388608)
+	    iterations = 20; // NOTE: never reset, so this replaces the CLI iteration count for this and every larger size
+
+        MPI_Barrier(MPI_COMM_WORLD);
+        start_time = MPI_Wtime();
+	for (int i = 0; i < iterations; ++i) {
+            #ifdef USE_MPI
+            MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16,
+                CUSTOM_SUM, MPI_COMM_WORLD, &request));
+
+            MPI_CHECK(MPI_Wait(&request, &status));
+            #elif defined(USE_NCCL) || defined(USE_RCCL)
+            NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL));
+            #endif
+            
+            #ifdef USE_CUDA
+            cudaDeviceSynchronize();
+            #elif USE_ROCM
+            hipDeviceSynchronize();
+            #endif
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        total_time = MPI_Wtime() - start_time;
+	if (my_rank == 0)
+	    printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
+    }
+
+    // Cleanup: host buffers, device buffers, then the NCCL/RCCL communicator
+    free(local_data);
+    free(global_data);
+    #ifdef USE_CUDA
+    CUDA_CHECK(cudaFree(d_local_data));
+    CUDA_CHECK(cudaFree(d_global_data));
+    #elif USE_ROCM
+    HIP_CHECK(hipFree(d_local_data));
+    HIP_CHECK(hipFree(d_global_data));
+    #endif
+
+    #if !defined(USE_MPI) && (defined(USE_NCCL) || defined(USE_RCCL)) // same condition under which nccl_comm is declared above
+    NCCL_CHECK(ncclCommDestroy(nccl_comm));
+    #endif
+
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}

From 559f4bb99318595ac37e53bac77871d588eefcf3 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Thu, 11 Jul 2024 17:44:21 -0700
Subject: [PATCH 43/52] fix reduce_scatter bug

---
 reduce_scatter.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/reduce_scatter.cu b/reduce_scatter.cu
index 99fc950..b2dc99e 100644
--- a/reduce_scatter.cu
+++ b/reduce_scatter.cu
@@ -183,11 +183,8 @@ int main(int argc, char *argv[]) {
     #endif
 
     // init recvcounts to send an equal portion of data from the reduce operation
-    int num_elements = local_data_size / sizeof(bfloat16);
-    int portion = num_elements / num_pes;
     int *recvcounts = (int*) malloc(sizeof(int) * num_pes);
-    for (int i = 0; i < num_pes; i++) 
-        recvcounts[i] = portion;
+    int portion;
 
     // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
     double total_time, start_time;
@@ -204,6 +201,11 @@ int main(int argc, char *argv[]) {
 
     for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
 	msg_count = msg_size / sizeof(bfloat16);
+
+    portion = msg_count / num_pes;
+    for (int i = 0; i < num_pes; i++)
+        recvcounts[i] = portion;
+
 	// warmup iterations
 	for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
             #ifdef USE_MPI

From d70e475861f02a871b9f56a758f845a53b1543e8 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Thu, 11 Jul 2024 18:10:23 -0700
Subject: [PATCH 44/52] push latest benchmarks

---
 mpi/reduce-scatter/perlmutter/128_gpu_run.sh       |  4 ++--
 mpi/reduce-scatter/perlmutter/16_gpu_run.sh        |  4 ++--
 mpi/reduce-scatter/perlmutter/32_gpu_run.sh        |  4 ++--
 mpi/reduce-scatter/perlmutter/64_gpu_run.sh        |  4 ++--
 mpi/reduce-scatter/perlmutter/8_gpu_run.sh         |  6 +++---
 .../perlmutter/benchmarks/128_gpu.txt              | 12 ------------
 .../perlmutter/benchmarks/16_gpu.txt               | 12 ------------
 .../perlmutter/benchmarks/32_gpu.txt               | 14 --------------
 .../perlmutter/benchmarks/64_gpu.txt               | 13 -------------
 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt | 13 -------------
 nccl/all-reduce/benchmarks/64_gpu.txt              | 13 +++++++++++++
 nccl/all-reduce/benchmarks/8_gpu.txt               | 13 +++++++++++++
 nccl/reduce-scatter/128_gpu_run.sh                 |  4 ++--
 nccl/reduce-scatter/16_gpu_run.sh                  |  4 ++--
 nccl/reduce-scatter/32_gpu_run.sh                  |  4 ++--
 nccl/reduce-scatter/64_gpu_run.sh                  |  4 ++--
 nccl/reduce-scatter/8_gpu_run.sh                   |  4 ++--
 nccl/reduce-scatter/benchmarks/128_gpu.txt         | 12 ------------
 nccl/reduce-scatter/benchmarks/16_gpu.txt          | 12 ------------
 nccl/reduce-scatter/benchmarks/32_gpu.txt          | 14 --------------
 nccl/reduce-scatter/benchmarks/64_gpu.txt          | 13 -------------
 nccl/reduce-scatter/benchmarks/8_gpu.txt           | 13 -------------
 22 files changed, 47 insertions(+), 149 deletions(-)
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
 delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/8_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt
 delete mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt

diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
index 469aeaf..28c8479 100644
--- a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
+++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
index e66b9f4..c3b9e32 100644
--- a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
+++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
index 07d6020..1681d65 100644
--- a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
+++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
index e51945a..f932006 100644
--- a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
+++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
index 1b51537..977ba91 100644
--- a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
+++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh
@@ -3,7 +3,7 @@
 #SBATCH -A m4641_g
 #SBATCH -C gpu
 #SBATCH -q regular
-#SBATCH -t 30:00
+#SBATCH -t 20:00
 #SBATCH -N 2
 #SBATCH --ntasks-per-node=4
 #SBATCH -c 32
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
deleted file mode 100644
index d696072..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 3.352414 seconds
-67108864 3.323000 seconds
-134217728 3.331817 seconds
-268435456 3.327162 seconds
-536870912 3.345694 seconds
-1073741824 3.326455 seconds
-2147483648 3.321790 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
deleted file mode 100644
index b71477d..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 3.368300 seconds
-67108864 3.361940 seconds
-134217728 3.367816 seconds
-268435456 3.360722 seconds
-536870912 3.363088 seconds
-1073741824 3.392373 seconds
-2147483648 3.375325 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
deleted file mode 100644
index 38e09b1..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 8388608 - 2147483648
-Number of iterations: 10
-8388608 3.368554 seconds
-16777216 3.367485 seconds
-33554432 3.376475 seconds
-67108864 3.381592 seconds
-134217728 3.384111 seconds
-268435456 3.375780 seconds
-536870912 3.371542 seconds
-1073741824 3.379895 seconds
-2147483648 3.381470 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
deleted file mode 100644
index d982100..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 2.220629 seconds
-33554432 2.201147 seconds
-67108864 2.196879 seconds
-134217728 2.199449 seconds
-268435456 2.194973 seconds
-536870912 2.196809 seconds
-1073741824 2.196212 seconds
-2147483648 2.201029 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
deleted file mode 100644
index d2bdd9a..0000000
--- a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 3.558431 seconds
-33554432 3.553477 seconds
-67108864 3.562137 seconds
-134217728 3.556267 seconds
-268435456 3.551567 seconds
-536870912 3.599067 seconds
-1073741824 3.608635 seconds
-2147483648 3.624090 seconds
diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..07fbe6d
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.001536 seconds
+33554432 0.001953 seconds
+67108864 0.002903 seconds
+134217728 0.004239 seconds
+268435456 0.007382 seconds
+536870912 0.014722 seconds
+1073741824 0.028043 seconds
+2147483648 0.055311 seconds
diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..c9bda12
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.000507 seconds
+33554432 0.000855 seconds
+67108864 0.001697 seconds
+134217728 0.003146 seconds
+268435456 0.006394 seconds
+536870912 0.012162 seconds
+1073741824 0.024174 seconds
+2147483648 0.047715 seconds
diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh
index 8590821..e37f70b 100644
--- a/nccl/reduce-scatter/128_gpu_run.sh
+++ b/nccl/reduce-scatter/128_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh
index 7a20fa6..0ea1f3b 100644
--- a/nccl/reduce-scatter/16_gpu_run.sh
+++ b/nccl/reduce-scatter/16_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh
index 3d297ff..0bccbb2 100644
--- a/nccl/reduce-scatter/32_gpu_run.sh
+++ b/nccl/reduce-scatter/32_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh
index 6bbf97a..79dd4cb 100644
--- a/nccl/reduce-scatter/64_gpu_run.sh
+++ b/nccl/reduce-scatter/64_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh
index 21c0dc4..6fba196 100644
--- a/nccl/reduce-scatter/8_gpu_run.sh
+++ b/nccl/reduce-scatter/8_gpu_run.sh
@@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 2048))
 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
-run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt"
+SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt"
 
 echo $run_cmd
 eval $run_cmd
diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt
deleted file mode 100644
index 7c1c8f9..0000000
--- a/nccl/reduce-scatter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 0.028300 seconds
-67108864 0.028351 seconds
-134217728 0.028351 seconds
-268435456 0.028502 seconds
-536870912 0.028579 seconds
-1073741824 0.028650 seconds
-2147483648 0.028506 seconds
diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt
deleted file mode 100644
index 14acf87..0000000
--- a/nccl/reduce-scatter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 16
-Message size range: 33554432 - 2147483648
-Number of iterations: 10
-33554432 0.033170 seconds
-67108864 0.033280 seconds
-134217728 0.033220 seconds
-268435456 0.033291 seconds
-536870912 0.033217 seconds
-1073741824 0.033158 seconds
-2147483648 0.033275 seconds
diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt
deleted file mode 100644
index 7eecc67..0000000
--- a/nccl/reduce-scatter/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 8388608 - 2147483648
-Number of iterations: 10
-8388608 0.027121 seconds
-16777216 0.027661 seconds
-33554432 0.027766 seconds
-67108864 0.027992 seconds
-134217728 0.027914 seconds
-268435456 0.027912 seconds
-536870912 0.027777 seconds
-1073741824 0.027861 seconds
-2147483648 0.027551 seconds
diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt
deleted file mode 100644
index 8f8ddd0..0000000
--- a/nccl/reduce-scatter/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.028306 seconds
-33554432 0.028511 seconds
-67108864 0.028175 seconds
-134217728 0.027998 seconds
-268435456 0.027883 seconds
-536870912 0.027802 seconds
-1073741824 0.027954 seconds
-2147483648 0.028085 seconds
diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt
deleted file mode 100644
index 26c22b6..0000000
--- a/nccl/reduce-scatter/benchmarks/8_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 8
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.024231 seconds
-33554432 0.024389 seconds
-67108864 0.024167 seconds
-134217728 0.024047 seconds
-268435456 0.024293 seconds
-536870912 0.024031 seconds
-1073741824 0.024048 seconds
-2147483648 0.024241 seconds

From 4e7ac6fa4eeb774e236ca375df37565230b9a1a3 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 07:32:04 -0700
Subject: [PATCH 45/52] add benchmarks so far

---
 .../perlmutter/benchmarks/128_gpu.txt          | 12 ------------
 .../perlmutter/benchmarks/16_gpu.txt           | 10 +++++-----
 .../perlmutter/benchmarks/32_gpu.txt           | 18 +++++++++---------
 .../perlmutter/benchmarks/64_gpu.txt           | 16 ++++++++--------
 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt | 16 ++++++++--------
 .../perlmutter/benchmarks/16_gpu.txt           | 12 ++++++------
 .../perlmutter/benchmarks/32_gpu.txt           | 16 ++++++++--------
 .../perlmutter/benchmarks/64_gpu.txt           | 12 ++++++++++++
 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt | 14 +++++++-------
 .../perlmutter/benchmarks/128_gpu.txt          | 12 ++++++++++++
 .../perlmutter/benchmarks/16_gpu.txt           | 12 ++++++++++++
 .../perlmutter/benchmarks/32_gpu.txt           | 14 ++++++++++++++
 .../perlmutter/benchmarks/64_gpu.txt           | 13 +++++++++++++
 .../perlmutter/benchmarks/8_gpu.txt            | 13 +++++++++++++
 nccl/all-gather/benchmarks/16_gpu.txt          | 16 ++++++++--------
 nccl/all-gather/benchmarks/32_gpu.txt          | 18 +++++++++---------
 nccl/all-gather/benchmarks/64_gpu.txt          | 16 ++++++++--------
 nccl/all-gather/benchmarks/8_gpu.txt           | 16 ++++++++--------
 nccl/all-reduce/benchmarks/16_gpu.txt          | 14 +++++++-------
 nccl/all-reduce/benchmarks/32_gpu.txt          | 14 --------------
 nccl/all-reduce/benchmarks/64_gpu.txt          | 13 -------------
 nccl/all-reduce/benchmarks/8_gpu.txt           | 16 ++++++++--------
 nccl/reduce-scatter/benchmarks/8_gpu.txt       | 13 +++++++++++++
 23 files changed, 188 insertions(+), 138 deletions(-)
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
 create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
 delete mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt
 delete mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt

diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
deleted file mode 100644
index 3c16468..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Local data size: 16
-Global data size: 2048
-Number of GPUs: 128
-Message size range: 262144 - 16777216
-Number of iterations: 10
-262144 0.003218 seconds
-524288 0.005240 seconds
-1048576 0.008649 seconds
-2097152 0.015703 seconds
-4194304 0.030562 seconds
-8388608 0.060407 seconds
-16777216 0.190813 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
index 9dc96cf..ca685cf 100644
--- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
+++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
@@ -10,12 +10,12 @@ slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/a
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
+srun: error: nid003924: tasks 12-15: Exited with exit code 2
+srun: Terminating StepId=27986453.0
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-srun: error: nid002072: tasks 8-11: Exited with exit code 2
-srun: Terminating StepId=27970493.0
-srun: error: nid002073: tasks 12-15: Exited with exit code 2
-srun: error: nid001572: tasks 4-7: Exited with exit code 2
-srun: error: nid001569: tasks 0-3: Exited with exit code 2
+srun: error: nid003732: tasks 4-7: Exited with exit code 2
+srun: error: nid003628: tasks 0-3: Exited with exit code 2
+srun: error: nid003920: tasks 8-11: Exited with exit code 2
diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
index 754e581..fca9dfb 100644
--- a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
+++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt
@@ -3,12 +3,12 @@ Global data size: 2048
 Number of GPUs: 32
 Message size range: 262144 - 67108864
 Number of iterations: 10
-262144 0.000744 seconds
-524288 0.001397 seconds
-1048576 0.002723 seconds
-2097152 0.003728 seconds
-4194304 0.007619 seconds
-8388608 0.014516 seconds
-16777216 0.030634 seconds
-33554432 0.063410 seconds
-67108864 0.172556 seconds
+262144 0.000814 seconds
+524288 0.001392 seconds
+1048576 0.002735 seconds
+2097152 0.003736 seconds
+4194304 0.007699 seconds
+8388608 0.014426 seconds
+16777216 0.030468 seconds
+33554432 0.063086 seconds
+67108864 0.172433 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
index cd13b86..fd082e7 100644
--- a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
+++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 2048
 Number of GPUs: 64
 Message size range: 262144 - 33554432
 Number of iterations: 10
-262144 0.001523 seconds
-524288 0.003143 seconds
-1048576 0.004237 seconds
-2097152 0.008015 seconds
-4194304 0.015194 seconds
-8388608 0.029697 seconds
-16777216 0.063139 seconds
-33554432 0.184281 seconds
+262144 0.001616 seconds
+524288 0.003051 seconds
+1048576 0.004224 seconds
+2097152 0.008058 seconds
+4194304 0.015085 seconds
+8388608 0.029593 seconds
+16777216 0.063129 seconds
+33554432 0.185107 seconds
diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
index e010f99..d027526 100644
--- a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
+++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 2048
 Number of GPUs: 8
 Message size range: 2097152 - 268435456
 Number of iterations: 10
-2097152 0.000888 seconds
-4194304 0.001690 seconds
-8388608 0.003195 seconds
-16777216 0.006815 seconds
-33554432 0.013828 seconds
-67108864 0.028031 seconds
-134217728 0.055406 seconds
-268435456 0.104231 seconds
+2097152 0.000804 seconds
+4194304 0.001514 seconds
+8388608 0.003268 seconds
+16777216 0.006800 seconds
+33554432 0.013764 seconds
+67108864 0.027832 seconds
+134217728 0.055076 seconds
+268435456 0.103476 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
index 76b174e..7536923 100644
--- a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
+++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt
@@ -3,9 +3,9 @@ Global data size: 1024
 Number of GPUs: 16
 Message size range: 33554432 - 1073741824
 Number of iterations: 10
-33554432 0.145773 seconds
-67108864 0.327744 seconds
-134217728 0.680940 seconds
-268435456 2.172019 seconds
-536870912 4.377939 seconds
-1073741824 8.740797 seconds
+33554432 0.142862 seconds
+67108864 0.282599 seconds
+134217728 0.635635 seconds
+268435456 1.893851 seconds
+536870912 3.800098 seconds
+1073741824 7.591759 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
index c7d90db..f210edf 100644
--- a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
+++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 1024
 Number of GPUs: 32
 Message size range: 8388608 - 1073741824
 Number of iterations: 10
-8388608 0.050947 seconds
-16777216 0.093279 seconds
-33554432 0.183651 seconds
-67108864 0.368861 seconds
-134217728 0.804120 seconds
-268435456 2.351269 seconds
-536870912 4.727807 seconds
-1073741824 9.445482 seconds
+8388608 0.050115 seconds
+16777216 0.093747 seconds
+33554432 0.182627 seconds
+67108864 0.363477 seconds
+134217728 0.777837 seconds
+268435456 2.348574 seconds
+536870912 4.726795 seconds
+1073741824 9.478696 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..0052be4
--- /dev/null
+++ b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 64
+Message size range: 16777216 - 1073741824
+Number of iterations: 10
+16777216 0.120696 seconds
+33554432 0.238777 seconds
+67108864 0.470335 seconds
+134217728 0.963299 seconds
+268435456 2.857795 seconds
+536870912 5.742566 seconds
+1073741824 11.495248 seconds
diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
index 43c1c73..def3166 100644
--- a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
+++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt
@@ -3,10 +3,10 @@ Global data size: 1024
 Number of GPUs: 8
 Message size range: 16777216 - 1073741824
 Number of iterations: 10
-16777216 0.056679 seconds
-33554432 0.108849 seconds
-67108864 0.216523 seconds
-134217728 0.510124 seconds
-268435456 1.547371 seconds
-536870912 3.104556 seconds
-1073741824 6.214916 seconds
+16777216 0.056844 seconds
+33554432 0.108090 seconds
+67108864 0.215626 seconds
+134217728 0.502310 seconds
+268435456 1.519484 seconds
+536870912 3.075941 seconds
+1073741824 6.121168 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..7306758
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.410163 seconds
+67108864 0.429161 seconds
+134217728 0.544002 seconds
+268435456 0.679339 seconds
+536870912 0.981913 seconds
+1073741824 1.583797 seconds
+2147483648 3.678590 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..190422f
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.056117 seconds
+67108864 0.092396 seconds
+134217728 0.169070 seconds
+268435456 0.331578 seconds
+536870912 0.641127 seconds
+1073741824 1.270086 seconds
+2147483648 3.735213 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..7b9f084
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.053765 seconds
+16777216 0.064537 seconds
+33554432 0.084740 seconds
+67108864 0.133787 seconds
+134217728 0.220573 seconds
+268435456 0.377243 seconds
+536870912 0.683938 seconds
+1073741824 1.321649 seconds
+2147483648 3.716915 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..675dc8f
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.157345 seconds
+33554432 0.205494 seconds
+67108864 0.216133 seconds
+134217728 0.316748 seconds
+268435456 0.476547 seconds
+536870912 0.776507 seconds
+1073741824 1.387122 seconds
+2147483648 3.688627 seconds
diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..c7ca325
--- /dev/null
+++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.024237 seconds
+33554432 0.043589 seconds
+67108864 0.083173 seconds
+134217728 0.153300 seconds
+268435456 0.300631 seconds
+536870912 0.598284 seconds
+1073741824 1.190578 seconds
+2147483648 3.832743 seconds
diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt
index 22b1d19..1afafc0 100644
--- a/nccl/all-gather/benchmarks/16_gpu.txt
+++ b/nccl/all-gather/benchmarks/16_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 4096
 Number of GPUs: 16
 Message size range: 2097152 - 268435456
 Number of iterations: 10
-2097152 0.000546 seconds
-4194304 0.000963 seconds
-8388608 0.001810 seconds
-16777216 0.003587 seconds
-33554432 0.006843 seconds
-67108864 0.013602 seconds
-134217728 0.026932 seconds
-268435456 0.052715 seconds
+2097152 0.000643 seconds
+4194304 0.000944 seconds
+8388608 0.001838 seconds
+16777216 0.003452 seconds
+33554432 0.007084 seconds
+67108864 0.013794 seconds
+134217728 0.026821 seconds
+268435456 0.052760 seconds
diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt
index da3b81b..03e6ee9 100644
--- a/nccl/all-gather/benchmarks/32_gpu.txt
+++ b/nccl/all-gather/benchmarks/32_gpu.txt
@@ -3,12 +3,12 @@ Global data size: 2048
 Number of GPUs: 32
 Message size range: 262144 - 67108864
 Number of iterations: 10
-262144 0.000531 seconds
-524288 0.000602 seconds
-1048576 0.000700 seconds
-2097152 0.001056 seconds
-4194304 0.001907 seconds
-8388608 0.003960 seconds
-16777216 0.006958 seconds
-33554432 0.014047 seconds
-67108864 0.027585 seconds
+262144 0.000528 seconds
+524288 0.000604 seconds
+1048576 0.000701 seconds
+2097152 0.001044 seconds
+4194304 0.002055 seconds
+8388608 0.004240 seconds
+16777216 0.006949 seconds
+33554432 0.014221 seconds
+67108864 0.027622 seconds
diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt
index f05957a..c0872ab 100644
--- a/nccl/all-gather/benchmarks/64_gpu.txt
+++ b/nccl/all-gather/benchmarks/64_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 2048
 Number of GPUs: 64
 Message size range: 262144 - 33554432
 Number of iterations: 10
-262144 0.001041 seconds
-524288 0.001212 seconds
-1048576 0.001357 seconds
-2097152 0.002122 seconds
-4194304 0.003750 seconds
-8388608 0.007686 seconds
-16777216 0.014414 seconds
-33554432 0.028307 seconds
+262144 0.001230 seconds
+524288 0.001226 seconds
+1048576 0.001381 seconds
+2097152 0.002098 seconds
+4194304 0.003764 seconds
+8388608 0.007649 seconds
+16777216 0.014257 seconds
+33554432 0.027941 seconds
diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt
index 9d9c99f..8fc4917 100644
--- a/nccl/all-gather/benchmarks/8_gpu.txt
+++ b/nccl/all-gather/benchmarks/8_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 2048
 Number of GPUs: 8
 Message size range: 2097152 - 268435456
 Number of iterations: 10
-2097152 0.000298 seconds
-4194304 0.000477 seconds
-8388608 0.000903 seconds
-16777216 0.001661 seconds
-33554432 0.003230 seconds
-67108864 0.006674 seconds
-134217728 0.012419 seconds
-268435456 0.024550 seconds
+2097152 0.000325 seconds
+4194304 0.000482 seconds
+8388608 0.000881 seconds
+16777216 0.001679 seconds
+33554432 0.003206 seconds
+67108864 0.006338 seconds
+134217728 0.012452 seconds
+268435456 0.024147 seconds
diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt
index a866d54..26fc256 100644
--- a/nccl/all-reduce/benchmarks/16_gpu.txt
+++ b/nccl/all-reduce/benchmarks/16_gpu.txt
@@ -3,10 +3,10 @@ Global data size: 2048
 Number of GPUs: 16
 Message size range: 33554432 - 2147483648
 Number of iterations: 10
-33554432 0.001007 seconds
-67108864 0.001788 seconds
-134217728 0.003634 seconds
-268435456 0.006935 seconds
-536870912 0.013610 seconds
-1073741824 0.027019 seconds
-2147483648 0.052864 seconds
+33554432 0.000969 seconds
+67108864 0.001819 seconds
+134217728 0.003596 seconds
+268435456 0.006813 seconds
+536870912 0.013459 seconds
+1073741824 0.026683 seconds
+2147483648 0.052290 seconds
diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt
deleted file mode 100644
index a20b1cd..0000000
--- a/nccl/all-reduce/benchmarks/32_gpu.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 32
-Message size range: 8388608 - 2147483648
-Number of iterations: 10
-8388608 0.001052 seconds
-16777216 0.001220 seconds
-33554432 0.001356 seconds
-67108864 0.002028 seconds
-134217728 0.003714 seconds
-268435456 0.007242 seconds
-536870912 0.013809 seconds
-1073741824 0.027274 seconds
-2147483648 0.054261 seconds
diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt
deleted file mode 100644
index 07fbe6d..0000000
--- a/nccl/all-reduce/benchmarks/64_gpu.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-Local data size: 2048
-Global data size: 2048
-Number of GPUs: 64
-Message size range: 16777216 - 2147483648
-Number of iterations: 10
-16777216 0.001536 seconds
-33554432 0.001953 seconds
-67108864 0.002903 seconds
-134217728 0.004239 seconds
-268435456 0.007382 seconds
-536870912 0.014722 seconds
-1073741824 0.028043 seconds
-2147483648 0.055311 seconds
diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt
index c9bda12..e5a5769 100644
--- a/nccl/all-reduce/benchmarks/8_gpu.txt
+++ b/nccl/all-reduce/benchmarks/8_gpu.txt
@@ -3,11 +3,11 @@ Global data size: 2048
 Number of GPUs: 8
 Message size range: 16777216 - 2147483648
 Number of iterations: 10
-16777216 0.000507 seconds
-33554432 0.000855 seconds
-67108864 0.001697 seconds
-134217728 0.003146 seconds
-268435456 0.006394 seconds
-536870912 0.012162 seconds
-1073741824 0.024174 seconds
-2147483648 0.047715 seconds
+16777216 0.000635 seconds
+33554432 0.000887 seconds
+67108864 0.001639 seconds
+134217728 0.003232 seconds
+268435456 0.006303 seconds
+536870912 0.011998 seconds
+1073741824 0.024143 seconds
+2147483648 0.047652 seconds
diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt
new file mode 100644
index 0000000..5cee721
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 8
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.000363 seconds
+33554432 0.000450 seconds
+67108864 0.000876 seconds
+134217728 0.001650 seconds
+268435456 0.003169 seconds
+536870912 0.006491 seconds
+1073741824 0.012103 seconds
+2147483648 0.024166 seconds

From e878a751e8d4b5b5fdaca83187d832fdde7686ad Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 07:36:45 -0700
Subject: [PATCH 46/52] add mpi all-gather 128gpu benchmarks

---
 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt

diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..295c6c0
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 16
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 262144 - 16777216
+Number of iterations: 10
+262144 0.003072 seconds
+524288 0.005233 seconds
+1048576 0.008462 seconds
+2097152 0.015449 seconds
+4194304 0.030325 seconds
+8388608 0.060131 seconds
+16777216 0.190401 seconds

From 75c8208ed9dd229c3c43329ded95d74f25840896 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 07:47:39 -0700
Subject: [PATCH 47/52] update benchmarks

---
 mpi/all-gather/perlmutter/16_gpu_run.sh       |  2 +-
 .../perlmutter/benchmarks/16_gpu.txt          | 21 -------------------
 .../perlmutter/benchmarks/128_gpu.txt         |  0
 nccl/all-gather/benchmarks/128_gpu.txt        | 13 ++++++++++++
 nccl/all-reduce/benchmarks/32_gpu.txt         | 14 +++++++++++++
 5 files changed, 28 insertions(+), 22 deletions(-)
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
 create mode 100644 nccl/all-gather/benchmarks/128_gpu.txt
 create mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt

diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh
index 813b192..e68834a 100644
--- a/mpi/all-gather/perlmutter/16_gpu_run.sh
+++ b/mpi/all-gather/perlmutter/16_gpu_run.sh
@@ -29,7 +29,7 @@ export FI_CXI_OFLOW_BUF_COUNT=1
 MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024
 MAX_MSG_SIZE=$((1048576 * 128))
 
-SCRIPT="$SCRATCH/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10"
 run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt"
 
 echo $run_cmd
diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
deleted file mode 100644
index ca685cf..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-srun: error: nid003924: tasks 12-15: Exited with exit code 2
-srun: Terminating StepId=27986453.0
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory
-srun: error: nid003732: tasks 4-7: Exited with exit code 2
-srun: error: nid003628: tasks 0-3: Exited with exit code 2
-srun: error: nid003920: tasks 8-11: Exited with exit code 2
diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..e69de29
diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..3ac04bb
--- /dev/null
+++ b/nccl/all-gather/benchmarks/128_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 32
+Global data size: 4096
+Number of GPUs: 128
+Message size range: 262144 - 33554432
+Number of iterations: 10
+262144 0.002077 seconds
+524288 0.002368 seconds
+1048576 0.002832 seconds
+2097152 0.004504 seconds
+4194304 0.007551 seconds
+8388608 0.014982 seconds
+16777216 0.028604 seconds
+33554432 0.056227 seconds
diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..90fc0f0
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.001510 seconds
+16777216 0.001222 seconds
+33554432 0.001317 seconds
+67108864 0.002024 seconds
+134217728 0.003762 seconds
+268435456 0.007554 seconds
+536870912 0.014173 seconds
+1073741824 0.027756 seconds
+2147483648 0.054544 seconds

From 39d17ffe4e5fa350ca23bed8caa46f8f1dbe7a26 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 07:54:38 -0700
Subject: [PATCH 48/52] update results

---
 .../perlmutter/benchmarks/16_gpu.txt          | 21 +++++++++++++++++++
 nccl/reduce-scatter/benchmarks/16_gpu.txt     | 12 +++++++++++
 nccl/reduce-scatter/benchmarks/32_gpu.txt     | 14 +++++++++++++
 3 files changed, 47 insertions(+)
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt

diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..737af38
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,21 @@
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+srun: error: nid008252: tasks 0-3: Exited with exit code 127
+srun: Terminating StepId=27999986.0
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
+srun: error: nid008649: tasks 4-7: Exited with exit code 127
+srun: error: nid008652: tasks 8-11: Exited with exit code 127
+srun: error: nid008653: tasks 12-15: Exited with exit code 127
diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..0bae9e9
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.000552 seconds
+67108864 0.000933 seconds
+134217728 0.001772 seconds
+268435456 0.003462 seconds
+536870912 0.007059 seconds
+1073741824 0.013749 seconds
+2147483648 0.026539 seconds
diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt
new file mode 100644
index 0000000..307b0ce
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt
@@ -0,0 +1,14 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 32
+Message size range: 8388608 - 2147483648
+Number of iterations: 10
+8388608 0.000586 seconds
+16777216 0.000629 seconds
+33554432 0.000712 seconds
+67108864 0.001141 seconds
+134217728 0.002012 seconds
+268435456 0.003715 seconds
+536870912 0.007022 seconds
+1073741824 0.014078 seconds
+2147483648 0.027699 seconds

From 4c714037a0bf628fb3d3ad72a47ae39e5e3ddc0d Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 08:00:00 -0700
Subject: [PATCH 49/52] push results

---
 .../perlmutter/benchmarks/16_gpu.txt          | 21 -------------------
 .../perlmutter/benchmarks/128_gpu.txt         | 11 ++++++++++
 2 files changed, 11 insertions(+), 21 deletions(-)
 delete mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt

diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
deleted file mode 100644
index 737af38..0000000
--- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-srun: error: nid008252: tasks 0-3: Exited with exit code 127
-srun: Terminating StepId=27999986.0
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory
-srun: error: nid008649: tasks 4-7: Exited with exit code 127
-srun: error: nid008652: tasks 8-11: Exited with exit code 127
-srun: error: nid008653: tasks 12-15: Exited with exit code 127
diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
index e69de29..a4485f5 100644
--- a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
+++ b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt
@@ -0,0 +1,11 @@
+Local data size: 1024
+Global data size: 1024
+Number of GPUs: 128
+Message size range: 33554432 - 1073741824
+Number of iterations: 10
+33554432 0.260096 seconds
+67108864 0.535750 seconds
+134217728 1.089220 seconds
+268435456 3.236966 seconds
+536870912 6.499632 seconds
+1073741824 12.975189 seconds

From 6e59c91ee1c0752048738791dc8cf2ee96a13da8 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 08:05:19 -0700
Subject: [PATCH 50/52] push results

---
 nccl/all-reduce/benchmarks/64_gpu.txt | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt

diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..ebd310e
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.001551 seconds
+33554432 0.001949 seconds
+67108864 0.002918 seconds
+134217728 0.004132 seconds
+268435456 0.007447 seconds
+536870912 0.014747 seconds
+1073741824 0.028172 seconds
+2147483648 0.055372 seconds

From cc1b03fae4135bd66f61d50851fe6c2cf8d27c34 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 10:01:56 -0700
Subject: [PATCH 51/52] update results

---
 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt | 12 ++++++++++++
 nccl/reduce-scatter/benchmarks/64_gpu.txt       | 13 +++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt

diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
new file mode 100644
index 0000000..740a003
--- /dev/null
+++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 128
+Global data size: 2048
+Number of GPUs: 16
+Message size range: 2097152 - 134217728
+Number of iterations: 10
+2097152 0.002476 seconds
+4194304 0.003571 seconds
+8388608 0.007188 seconds
+16777216 0.014909 seconds
+33554432 0.030427 seconds
+67108864 0.061974 seconds
+134217728 0.150229 seconds
diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt
new file mode 100644
index 0000000..45bd514
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt
@@ -0,0 +1,13 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 64
+Message size range: 16777216 - 2147483648
+Number of iterations: 10
+16777216 0.001059 seconds
+33554432 0.001147 seconds
+67108864 0.001410 seconds
+134217728 0.002090 seconds
+268435456 0.004116 seconds
+536870912 0.007125 seconds
+1073741824 0.014305 seconds
+2147483648 0.028156 seconds

From 92d5eecf59655c2aead09cb176502105fadf5391 Mon Sep 17 00:00:00 2001
From: RoastSea8 <aditya26042005@gmail.com>
Date: Fri, 12 Jul 2024 14:05:09 -0700
Subject: [PATCH 52/52] push final results

---
 nccl/all-reduce/benchmarks/128_gpu.txt     | 12 ++++++++++++
 nccl/reduce-scatter/benchmarks/128_gpu.txt | 12 ++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 nccl/all-reduce/benchmarks/128_gpu.txt
 create mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt

diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..30388e3
--- /dev/null
+++ b/nccl/all-reduce/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.002305 seconds
+67108864 0.003309 seconds
+134217728 0.005263 seconds
+268435456 0.008851 seconds
+536870912 0.017150 seconds
+1073741824 0.037149 seconds
+2147483648 0.075655 seconds
diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt
new file mode 100644
index 0000000..846d583
--- /dev/null
+++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt
@@ -0,0 +1,12 @@
+Local data size: 2048
+Global data size: 2048
+Number of GPUs: 128
+Message size range: 33554432 - 2147483648
+Number of iterations: 10
+33554432 0.002055 seconds
+67108864 0.002314 seconds
+134217728 0.003003 seconds
+268435456 0.004164 seconds
+536870912 0.007515 seconds
+1073741824 0.014791 seconds
+2147483648 0.027948 seconds