From f7fb83c79b9e24e2ca0836030e148f88c8a44829 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Mon, 25 Mar 2024 20:33:13 -0700 Subject: [PATCH 01/52] add cudaDeviceSynchronize for NCCL --- allgather.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/allgather.cu b/allgather.cu index cf1a882..5953041 100644 --- a/allgather.cu +++ b/allgather.cu @@ -158,6 +158,7 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + cudaDeviceSynchronize(); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); @@ -177,6 +178,7 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + cudaDeviceSynchronize(); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); From 453a397d2c609f41eac92012e9564ef6e1fa8ed6 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 14:15:33 -0700 Subject: [PATCH 02/52] add allreduce code --- Makefile | 9 ++- allreduce.cu | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 allreduce.cu diff --git a/Makefile b/Makefile index df453b4..231e499 100644 --- a/Makefile +++ b/Makefile @@ -5,14 +5,17 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x +all: allgather.x 
allreduce.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu +allreduce.x: allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu + clean: - rm -f allgather.x + rm -f allgather.x allreduce.x diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..062b120 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,219 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + nv_bfloat16* in = (nv_bfloat16*) invec; + nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) + inout[i] = __hadd(in[i], inout[i]); +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int 
min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + nv_bfloat16 *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, 
(void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From f516fa01dc6a1fe40f255216da98f0b235a1ede4 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:40:26 -0700 Subject: [PATCH 03/52] add reduce scatter code --- Makefile | 7 +- reduce_scatter.cu | 226 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 reduce_scatter.cu diff --git a/Makefile b/Makefile index 231e499..973364d 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x +all: allgather.x allreduce.x reduce_scatter.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu @@ -17,5 +17,8 @@ allgather.x: allgather.cu allreduce.x: allreduce.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu +reduce_scatter.x: reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu + clean: - rm -f allgather.x allreduce.x + rm -f allgather.x allreduce.x reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu new file mode 100644 index 0000000..9ed9e53 --- /dev/null +++ b/reduce_scatter.cu @@ -0,0 +1,226 @@ +/* 
\file reduce_scatter.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + nv_bfloat16* in = (nv_bfloat16*) invec; + nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) + inout[i] = __hadd(in[i], inout[i]); +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + 
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + nv_bfloat16 *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // init recvcounts to send an equal portion of data from the reduce operation + int num_elements = local_data_size / sizeof(nv_bfloat16); + int portion = num_elements / num_pes; + int *recvcounts = (int*) malloc(sizeof(int) * num_pes); + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i 
= 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From 23e9f5cea25b1c430ea0cfb3a0b5977a4ed27ff0 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:43:04 -0700 Subject: [PATCH 04/52] remove duplicate commit --- allreduce.cu | 219 --------------------------------------------------- 1 file changed, 219 deletions(-) delete mode 100644 allreduce.cu diff --git a/allreduce.cu b/allreduce.cu deleted file mode 100644 index 062b120..0000000 --- a/allreduce.cu +++ /dev/null @@ -1,219 +0,0 @@ -/* \file allreduce.cu - * Copyright 2024 Parallel Software and Systems Group, University of Maryland. - * See the top-level LICENSE file for details. 
- * - * SPDX-License-Identifier: MIT - */ - -#include -#include -#include - -#ifdef USE_CUDA - #include - #include -#endif - -#ifdef USE_NCCL - #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" -#endif - -#define NUM_WARMUP_ITERATIONS 5 - -#define MPI_CHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if(e != cudaSuccess) { \ - printf("CUDA error %s:%d: %s\n", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define NCCL_CHECK(cmd) do { \ - ncclResult_t e = cmd; \ - if (e != ncclSuccess) { \ - printf("NCCL error %s:%d %s\n", \ - __FILE__, __LINE__, ncclGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { - data[i] = __float2bfloat16((float)i); - } -} - -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) - inout[i] = __hadd(in[i], inout[i]); -} - -int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return EXIT_FAILURE; - } - - int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); - int iterations = atoi(argv[4]); - - if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { - fprintf(stderr, "Invalid input parameters.\n"); - return EXIT_FAILURE; - } - - int my_rank, num_pes; - int num_gpus_per_node; - int msg_count; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - - if (num_pes != num_gpus) { - fprintf(stderr, "Number of processes 
must match number of GPUs.\n"); - MPI_Finalize(); - return EXIT_FAILURE; - } - - // Initialize GPU context - cudaGetDeviceCount(&num_gpus_per_node); - cudaSetDevice((my_rank % num_gpus_per_node)); - - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data - - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); - - // Initialize local data - initializeData(local_data, local_data_size); - - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; - CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); - CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - - // Copy local data to GPU - CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - - #ifdef USE_MPI - // create 2-byte datatype (send raw, un-interpreted bytes) - MPI_Datatype mpi_type_bfloat16; - MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); - MPI_Type_commit(&mpi_type_bfloat16); - - // define custom reduce operation for nv_bfloat16 types - MPI_Op CUSTOM_SUM; - MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - - #elif USE_NCCL - ncclUniqueId nccl_comm_id; - ncclComm_t nccl_comm; - - if (my_rank == 0) { - /* Generates an Id to be used in ncclCommInitRank. 
*/ - ncclGetUniqueId(&nccl_comm_id); - } - - /* distribute nccl_comm_id to all ranks */ - MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, - 0, MPI_COMM_WORLD)); - - /* Create a new NCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); - #endif - - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather - double total_time, start_time; - MPI_Request request; - MPI_Status status; - - // Print benchmark results - if (my_rank == 0) { - printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); - printf("Number of iterations: %d\n", iterations); - } - fflush(NULL); - - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); - // warmup iterations - for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); - #endif - } - - if(msg_size >= 8388608) - iterations = 20; - - MPI_Barrier(MPI_COMM_WORLD); - start_time = MPI_Wtime(); - for (int i = 0; i < iterations; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, 
(void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); - #endif - } - MPI_Barrier(MPI_COMM_WORLD); - total_time = MPI_Wtime() - start_time; - if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); - } - - // Cleanup - free(local_data); - free(global_data); - CUDA_CHECK(cudaFree(d_local_data)); - CUDA_CHECK(cudaFree(d_global_data)); - - #ifdef USE_NCCL - ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); - #endif - - MPI_Finalize(); - return EXIT_SUCCESS; -} - From 7ff3fb503e4549ca170ea9ee08d763a90eb55584 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:47:59 -0700 Subject: [PATCH 05/52] fix Makefile --- Makefile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 973364d..7c01696 100644 --- a/Makefile +++ b/Makefile @@ -9,16 +9,13 @@ CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x reduce_scatter.x +all: allgather.x reduce_scatter.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -allreduce.x: allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu - reduce_scatter.x: reduce_scatter.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu clean: - rm -f allgather.x allreduce.x reduce_scatter.x + rm -f allgather.x reduce_scatter.x From 982ccaf7c6d265bf9ed21962432f8d462621dfbe Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 31 Mar 2024 23:43:59 -0400 Subject: [PATCH 06/52] add code for ROCm and RCCL --- Makefile | 16 +++++--- allgather.cu | 101 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 78 
insertions(+), 39 deletions(-) diff --git a/Makefile b/Makefile index df453b4..9f6d40a 100644 --- a/Makefile +++ b/Makefile @@ -4,15 +4,21 @@ # SPDX-License-Identifier: MIT CC = cc -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl +# perlmutter flags +# INC = -I/global/common/software/nersc9/nccl/2.19.4/include +# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +INC = -L${ROCM_PATH}/lib -lamdhip64 +CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl all: allgather.x -allgather.x: allgather.cu +allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: - rm -f allgather.x + rm -f allgather.x diff --git a/allgather.cu b/allgather.cu index 5953041..b6fd992 100644 --- a/allgather.cu +++ b/allgather.cu @@ -8,16 +8,21 @@ #include #include #include - +#include #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 @@ -40,6 +45,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +64,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - 
for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -86,33 +106,44 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif int local_data_size = max_msg_size; // Size of local data int global_data_size = local_data_size * num_gpus; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); MPI_Type_commit(&mpi_type_bfloat16); - #elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; 
@@ -125,13 +156,8 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather @@ -148,7 +174,7 @@ int main(int argc, char *argv[]) { fflush(NULL); for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -156,12 +182,14 @@ int main(int argc, char *argv[]) { d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } @@ -172,16 +200,18 @@ int main(int argc, char *argv[]) { start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, - d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif 
defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } MPI_Barrier(MPI_COMM_WORLD); @@ -193,13 +223,16 @@ int main(int argc, char *argv[]) { // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #ifdef defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); From f70e65c096433273840f836e067450d1d1a760af Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 31 Mar 2024 23:56:46 -0400 Subject: [PATCH 07/52] add flags for ROCm and RCCL --- Makefile | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index df453b4..9f6d40a 100644 --- a/Makefile +++ b/Makefile @@ -4,15 +4,21 @@ # SPDX-License-Identifier: MIT CC = cc -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl +# perlmutter flags +# INC = -I/global/common/software/nersc9/nccl/2.19.4/include +# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +INC = -L${ROCM_PATH}/lib -lamdhip64 +CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip 
-DUSE_ROCM -DUSE_RCCL +LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl all: allgather.x -allgather.x: allgather.cu +allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: - rm -f allgather.x + rm -f allgather.x From 8ab25d10e8dce0cce796837d5813b47a8425ebd7 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 31 Mar 2024 23:59:33 -0400 Subject: [PATCH 08/52] revert Makefile to original --- Makefile | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 9f6d40a..11f5145 100644 --- a/Makefile +++ b/Makefile @@ -4,20 +4,14 @@ # SPDX-License-Identifier: MIT CC = cc +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -# perlmutter flags -# INC = -I/global/common/software/nersc9/nccl/2.19.4/include -# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - -# frontier flags -INC = -L${ROCM_PATH}/lib -lamdhip64 -CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL -LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl all: allgather.x -allgather.x: allgather.cu +allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: From ef65ccde73efcf113fa3799462638dc3c8fcbf5e Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:12:38 -0400 Subject: [PATCH 09/52] remove unneeded import --- allgather.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/allgather.cu b/allgather.cu index b6fd992..698e425 100644 --- a/allgather.cu +++ b/allgather.cu @@ -8,7 +8,6 @@ #include #include #include -#include #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 From 795a6d3e4f5323b3fa8e706ae4f159b58c0dbfca Mon Sep 17 
00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:28:31 -0400 Subject: [PATCH 10/52] add ROCm and RCCL code for all-reduce --- allreduce.cu | 95 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 63 insertions(+), 32 deletions(-) diff --git a/allreduce.cu b/allreduce.cu index 062b120..4394249 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -8,16 +8,20 @@ #include #include #include - #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 @@ -40,6 +44,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +63,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -93,26 +112,36 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif int local_data_size = max_msg_size; // Size of local data int global_data_size = local_data_size; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); 
- nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; @@ -123,7 +152,7 @@ int main(int argc, char *argv[]) { MPI_Op CUSTOM_SUM; MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - #elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -136,13 +165,8 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather @@ -159,7 +183,7 @@ int main(int argc, char *argv[]) { fflush(NULL); for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < 
NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -167,12 +191,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } @@ -187,12 +213,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } MPI_Barrier(MPI_COMM_WORLD); @@ -204,13 +232,16 @@ int main(int argc, char *argv[]) { // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #ifdef defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); From b9e882437578691448c9748aa67caae643db422a Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:38:59 -0400 Subject: [PATCH 11/52] add ROCm and RCCL code for reduce-scatter 
--- reduce_scatter.cu | 98 +++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 9ed9e53..8f851d4 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,16 +8,20 @@ #include #include #include - #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 @@ -40,6 +44,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +63,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -93,26 +112,36 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif int local_data_size = max_msg_size; // Size of local data int global_data_size = local_data_size; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + bfloat16 *local_data = 
(bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; @@ -123,7 +152,7 @@ int main(int argc, char *argv[]) { MPI_Op CUSTOM_SUM; MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - #elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -136,17 +165,12 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // init recvcounts to send an equal portion of data from the reduce operation - int num_elements = local_data_size / sizeof(nv_bfloat16); + int num_elements = local_data_size / sizeof(bfloat16); int portion = num_elements / num_pes; int *recvcounts = (int*) malloc(sizeof(int) * num_pes); for (int i = 0; i < num_pes; i++) @@ -166,7 +190,7 @@ int main(int argc, char *argv[]) { fflush(NULL); for (int msg_size = min_msg_size; msg_size <= 
max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -174,12 +198,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } @@ -194,12 +220,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } MPI_Barrier(MPI_COMM_WORLD); @@ -211,16 +239,18 @@ int main(int argc, char *argv[]) { // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #ifdef defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); return EXIT_SUCCESS; } - 
From e077503742f70d62e1c26043eb755f25f9b61358 Mon Sep 17 00:00:00 2001 From: Aditya Tomar <59426357+RoastSea8@users.noreply.github.com> Date: Sun, 31 Mar 2024 21:48:27 -0700 Subject: [PATCH 12/52] Update and rename README to README.md --- README => README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) rename README => README.md (52%) diff --git a/README b/README.md similarity index 52% rename from README rename to README.md index eba2046..396231b 100644 --- a/README +++ b/README.md @@ -1,9 +1,13 @@ Before compiling do these: +### Perlmutter +```sh module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 export CRAY_ACCEL_TARGET=nvidia80 - -When running do these: - -module load cudatoolkit export MPICH_GPU_SUPPORT_ENABLED=1 +``` +### Frontier +```sh +module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a +export MPICH_GPU_SUPPORT_ENABLED=1 +``` From 686be82807a62cb66f3ba91fe055c9644b2d4442 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:57:19 -0400 Subject: [PATCH 13/52] revert Makefile to original --- Makefile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 7c01696..11f5145 100644 --- a/Makefile +++ b/Makefile @@ -5,17 +5,14 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x reduce_scatter.x +all: allgather.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -reduce_scatter.x: reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu - clean: - rm -f allgather.x reduce_scatter.x + rm -f allgather.x From ec4fdedfe0e26a4344a778e0b9ec7fd0ed8985ab Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 
00:58:42 -0400 Subject: [PATCH 14/52] revert Makefile to original --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 11f5145..df453b4 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,4 @@ allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: - rm -f allgather.x + rm -f allgather.x From 4a87dfc7a3084f57f1727865ab4889b3441fcdfd Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:59:47 -0400 Subject: [PATCH 15/52] revert Makefile to original --- Makefile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 231e499..df453b4 100644 --- a/Makefile +++ b/Makefile @@ -5,17 +5,14 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x +all: allgather.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -allreduce.x: allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu - clean: - rm -f allgather.x allreduce.x + rm -f allgather.x From b6083d1cea284f619120d27e1a9c62018cf1464a Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:00:41 -0400 Subject: [PATCH 16/52] revert Makefile to original --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 11f5145..df453b4 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,4 @@ allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: - rm -f allgather.x + rm -f allgather.x From 79b2fb96a335ff798f63160ab36d530753665114 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:34:35 -0400 Subject: [PATCH 17/52] update custom 
bf16 sum function --- allreduce.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/allreduce.cu b/allreduce.cu index 4394249..7bf1031 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -75,10 +75,14 @@ void initializeData(bfloat16 *data, int size) { } void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; for (int i = 0; i < *len; i++) + #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif } int main(int argc, char *argv[]) { From 60e6911eb66fed80565a0fba1f59f932163fd194 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:40:18 -0400 Subject: [PATCH 18/52] update custom bf16 sum function --- reduce_scatter.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 8f851d4..5db2b60 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -75,10 +75,15 @@ void initializeData(bfloat16 *data, int size) { } void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } } int main(int argc, char *argv[]) { From ef6fb0d6d22e9e0dac426da032cd678df12875e3 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:41:28 -0400 Subject: [PATCH 19/52] fix custom bf16 sum function --- allreduce.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/allreduce.cu b/allreduce.cu index 7bf1031..9f017db 100644 --- a/allreduce.cu +++ 
b/allreduce.cu @@ -77,12 +77,13 @@ void initializeData(bfloat16 *data, int size) { void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM inout[i] = in[i] + inout[i]; #endif + } } int main(int argc, char *argv[]) { From 8052ca765fc87da0cb8f9de17bcc8d252b4dad56 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:45:14 -0400 Subject: [PATCH 20/52] fix indents --- allreduce.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/allreduce.cu b/allreduce.cu index 9f017db..2ffac86 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -75,11 +75,11 @@ void initializeData(bfloat16 *data, int size) { } void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - bfloat16* in = (bfloat16*) invec; - bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA - inout[i] = __hadd(in[i], inout[i]); + inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM inout[i] = in[i] + inout[i]; #endif From a67570e98862f1e62ac850d295cffa9a3fc79206 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:46:47 -0400 Subject: [PATCH 21/52] fix indents --- reduce_scatter.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 5db2b60..b667c01 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; - bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + bfloat16* inout = (bfloat16*) 
inoutvec; + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA - inout[i] = __hadd(in[i], inout[i]); + inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM inout[i] = in[i] + inout[i]; #endif From fdb324ad8fb90fb6c9ec78c7551c8324e2655fa7 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Tue, 9 Apr 2024 06:05:06 -0700 Subject: [PATCH 22/52] update Makefile --- Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 9f6d40a..52c0235 100644 --- a/Makefile +++ b/Makefile @@ -3,17 +3,17 @@ # # SPDX-License-Identifier: MIT -CC = cc +CC = cc # perlmutter flags -# INC = -I/global/common/software/nersc9/nccl/2.19.4/include -# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl +# INC = -I/global/common/software/nersc9/nccl/2.19.4/include +# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl # frontier flags -INC = -L${ROCM_PATH}/lib -lamdhip64 -CFLAGS = -std=c++11 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL -LDFLAGS = --rocm-path=${ROCM_PATH} -lrccl +INC = -I${ROCM_PATH}/include +CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl all: allgather.x From 405f09084247c87e318804074fdc2e0a0ac0f296 Mon Sep 17 00:00:00 2001 From: Aditya Tomar <59426357+RoastSea8@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:05:03 -0700 Subject: [PATCH 23/52] Create allreduce.cu --- allreduce.cu | 254 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 allreduce.cu diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..51b6248 --- /dev/null +++ 
b/allreduce.cu @@ -0,0 +1,254 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + 
} +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + 
MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + 
MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #ifdef defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} From 63bb696da18e45195885cd465cb54af3930b77b9 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 13:47:23 -0700 Subject: [PATCH 24/52] change to int64_t for global/local data size --- reduce_scatter.cu | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index b667c01..820cf4f 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,6 +8,8 @@ #include #include #include +#include + #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 @@ -27,9 +29,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -63,8 +65,8 @@ } \ } while(0) -void initializeData(bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { +void 
initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); #elif USE_ROCM @@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + for (int64_t i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM @@ -93,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -125,8 +127,13 @@ int main(int argc, char *argv[]) { hipSetDevice((my_rank % num_gpus_per_node)); #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } bfloat16 *local_data = (bfloat16*)malloc(local_data_size); bfloat16 *global_data = (bfloat16*)malloc(global_data_size); @@ -189,12 +196,12 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number 
of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { @@ -238,7 +245,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup From 3082c980f49b0ab7a88f2af24139a069c54adb95 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 13:57:03 -0700 Subject: [PATCH 25/52] change to int64_t for global/local data size --- allgather.cu | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/allgather.cu b/allgather.cu index 698e425..8ae7481 100644 --- a/allgather.cu +++ b/allgather.cu @@ -8,6 +8,8 @@ #include #include #include +#include + #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 @@ -27,9 +29,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -63,8 +65,8 @@ } \ } while(0) -void initializeData(bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); #elif USE_ROCM @@ -81,8 +83,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = 
atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -113,8 +115,13 @@ int main(int argc, char *argv[]) { hipSetDevice((my_rank % num_gpus_per_node)); #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size * num_gpus; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size * num_gpus; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } bfloat16 *local_data = (bfloat16*)malloc(local_data_size); bfloat16 *global_data = (bfloat16*)malloc(global_data_size); @@ -167,12 +174,12 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { @@ -216,7 +223,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup From 3c91d01234babf777f467428306eb78665368639 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 14:02:49 -0700 Subject: [PATCH 26/52] change to int64_t for global/local data size --- allreduce.cu | 33 ++++++++++++++++++++------------- 
reduce_scatter.cu | 33 ++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/allreduce.cu b/allreduce.cu index 51b6248..63e1635 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -8,6 +8,8 @@ #include #include #include +#include + #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 @@ -27,9 +29,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -63,8 +65,8 @@ } \ } while(0) -void initializeData(bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); #elif USE_ROCM @@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + for (int64_t i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM @@ -93,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -125,8 +127,13 @@ int main(int argc, char *argv[]) { hipSetDevice((my_rank % num_gpus_per_node)); #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of 
global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } bfloat16 *local_data = (bfloat16*)malloc(local_data_size); bfloat16 *global_data = (bfloat16*)malloc(global_data_size); @@ -182,12 +189,12 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { @@ -231,7 +238,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup diff --git a/reduce_scatter.cu b/reduce_scatter.cu index b667c01..820cf4f 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,6 +8,8 @@ #include #include #include +#include + #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 @@ -27,9 +29,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -63,8 +65,8 @@ } \ } while(0) -void initializeData(bfloat16 *data, int size) { - for (int i = 0; i 
< (size / sizeof(bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); #elif USE_ROCM @@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + for (int64_t i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM @@ -93,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -125,8 +127,13 @@ int main(int argc, char *argv[]) { hipSetDevice((my_rank % num_gpus_per_node)); #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } bfloat16 *local_data = (bfloat16*)malloc(local_data_size); bfloat16 *global_data = (bfloat16*)malloc(global_data_size); @@ -189,12 +196,12 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", 
min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { @@ -238,7 +245,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup From 0a33166c8ed7059a422d3bdda3c4804604a9f849 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 14:33:25 -0700 Subject: [PATCH 27/52] revert type change for custom sum --- reduce_scatter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 820cf4f..f824072 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int64_t i = 0; i < *len; i++) { + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM From 8be09db3867b60b275bd587c16aaba1a4eb5b40c Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 16:41:03 -0700 Subject: [PATCH 28/52] setup benchmarks rig and add results so far --- allreduce.cu | 4 +-- mpi/Makefile | 30 ++++++++++++++++++++++ mpi/all-gather/128_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-gather/16_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-gather/32_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-gather/64_gpu_run.sh | 
37 ++++++++++++++++++++++++++++ mpi/all-gather/8_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-gather/benchmarks/16_gpu.txt | 12 +++++++++ mpi/all-gather/benchmarks/8_gpu.txt | 13 ++++++++++ mpi/all-reduce/128_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-reduce/16_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-reduce/32_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/all-reduce/64_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/reduce-scatter/128_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/reduce-scatter/16_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/reduce-scatter/32_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/reduce-scatter/64_gpu_run.sh | 37 ++++++++++++++++++++++++++++ mpi/reduce-scatter/8_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/Makefile | 30 ++++++++++++++++++++++ nccl/all-gather/128_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-gather/16_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-gather/32_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-gather/64_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-gather/8_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-reduce/128_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-reduce/16_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-reduce/32_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-reduce/64_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/all-reduce/8_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/reduce-scatter/128_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/reduce-scatter/16_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/reduce-scatter/32_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/reduce-scatter/64_gpu_run.sh | 37 ++++++++++++++++++++++++++++ nccl/reduce-scatter/8_gpu_run.sh | 37 ++++++++++++++++++++++++++++ Makefile => rccl/Makefile | 14 ++++++++--- reduce_scatter.cu | 4 +-- 36 files changed, 1172 insertions(+), 8 deletions(-) create mode 100644 mpi/Makefile create mode 100644 
mpi/all-gather/128_gpu_run.sh create mode 100644 mpi/all-gather/16_gpu_run.sh create mode 100644 mpi/all-gather/32_gpu_run.sh create mode 100644 mpi/all-gather/64_gpu_run.sh create mode 100644 mpi/all-gather/8_gpu_run.sh create mode 100644 mpi/all-gather/benchmarks/16_gpu.txt create mode 100644 mpi/all-gather/benchmarks/8_gpu.txt create mode 100644 mpi/all-reduce/128_gpu_run.sh create mode 100644 mpi/all-reduce/16_gpu_run.sh create mode 100644 mpi/all-reduce/32_gpu_run.sh create mode 100644 mpi/all-reduce/64_gpu_run.sh create mode 100644 mpi/reduce-scatter/128_gpu_run.sh create mode 100644 mpi/reduce-scatter/16_gpu_run.sh create mode 100644 mpi/reduce-scatter/32_gpu_run.sh create mode 100644 mpi/reduce-scatter/64_gpu_run.sh create mode 100644 mpi/reduce-scatter/8_gpu_run.sh create mode 100644 nccl/Makefile create mode 100644 nccl/all-gather/128_gpu_run.sh create mode 100644 nccl/all-gather/16_gpu_run.sh create mode 100644 nccl/all-gather/32_gpu_run.sh create mode 100644 nccl/all-gather/64_gpu_run.sh create mode 100644 nccl/all-gather/8_gpu_run.sh create mode 100644 nccl/all-reduce/128_gpu_run.sh create mode 100644 nccl/all-reduce/16_gpu_run.sh create mode 100644 nccl/all-reduce/32_gpu_run.sh create mode 100644 nccl/all-reduce/64_gpu_run.sh create mode 100644 nccl/all-reduce/8_gpu_run.sh create mode 100644 nccl/reduce-scatter/128_gpu_run.sh create mode 100644 nccl/reduce-scatter/16_gpu_run.sh create mode 100644 nccl/reduce-scatter/32_gpu_run.sh create mode 100644 nccl/reduce-scatter/64_gpu_run.sh create mode 100644 nccl/reduce-scatter/8_gpu_run.sh rename Makefile => rccl/Makefile (57%) diff --git a/allreduce.cu b/allreduce.cu index 63e1635..ddbfb97 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = 
(bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int64_t i = 0; i < *len; i++) { + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM diff --git a/mpi/Makefile b/mpi/Makefile new file mode 100644 index 0000000..782a6bf --- /dev/null +++ b/mpi/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/mpi/all-gather/128_gpu_run.sh b/mpi/all-gather/128_gpu_run.sh new file mode 100644 index 0000000..3af373c --- /dev/null +++ b/mpi/all-gather/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS 
+export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/16_gpu_run.sh b/mpi/all-gather/16_gpu_run.sh new file mode 100644 index 0000000..25d7b92 --- /dev/null +++ b/mpi/all-gather/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/16_gpu.txt" + 
+echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/32_gpu_run.sh b/mpi/all-gather/32_gpu_run.sh new file mode 100644 index 0000000..3a03ef0 --- /dev/null +++ b/mpi/all-gather/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/64_gpu_run.sh b/mpi/all-gather/64_gpu_run.sh new file mode 100644 index 0000000..37ba334 --- /dev/null +++ b/mpi/all-gather/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export 
NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/8_gpu_run.sh b/mpi/all-gather/8_gpu_run.sh new file mode 100644 index 0000000..aa3e3a8 --- /dev/null +++ b/mpi/all-gather/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/benchmarks/16_gpu.txt b/mpi/all-gather/benchmarks/16_gpu.txt new file mode 100644 index 0000000..b69654b --- /dev/null +++ 
b/mpi/all-gather/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 128 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 2097152 - 134217728 +Number of iterations: 10 +2097152 0.002391 seconds +4194304 0.003558 seconds +8388608 0.007162 seconds +16777216 0.014929 seconds +33554432 0.030427 seconds +67108864 0.062092 seconds +134217728 0.151508 seconds diff --git a/mpi/all-gather/benchmarks/8_gpu.txt b/mpi/all-gather/benchmarks/8_gpu.txt new file mode 100644 index 0000000..de3a837 --- /dev/null +++ b/mpi/all-gather/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000838 seconds +4194304 0.001719 seconds +8388608 0.003172 seconds +16777216 0.006797 seconds +33554432 0.013860 seconds +67108864 0.027938 seconds +134217728 0.055353 seconds +268435456 0.104310 seconds diff --git a/mpi/all-reduce/128_gpu_run.sh b/mpi/all-reduce/128_gpu_run.sh new file mode 100644 index 0000000..6a5ccff --- /dev/null +++ b/mpi/all-reduce/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" 
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/16_gpu_run.sh b/mpi/all-reduce/16_gpu_run.sh new file mode 100644 index 0000000..4158fe0 --- /dev/null +++ b/mpi/all-reduce/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/32_gpu_run.sh b/mpi/all-reduce/32_gpu_run.sh new file mode 100644 index 0000000..8990167 --- /dev/null +++ b/mpi/all-reduce/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export 
MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/64_gpu_run.sh b/mpi/all-reduce/64_gpu_run.sh new file mode 100644 index 0000000..314f852 --- /dev/null +++ b/mpi/all-reduce/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/64_gpu.txt" + +echo 
$run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/128_gpu_run.sh b/mpi/reduce-scatter/128_gpu_run.sh new file mode 100644 index 0000000..e0a9db1 --- /dev/null +++ b/mpi/reduce-scatter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/16_gpu_run.sh b/mpi/reduce-scatter/16_gpu_run.sh new file mode 100644 index 0000000..be576de --- /dev/null +++ b/mpi/reduce-scatter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 
+export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/32_gpu_run.sh b/mpi/reduce-scatter/32_gpu_run.sh new file mode 100644 index 0000000..04a7f0a --- /dev/null +++ b/mpi/reduce-scatter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/64_gpu_run.sh 
b/mpi/reduce-scatter/64_gpu_run.sh new file mode 100644 index 0000000..48c7645 --- /dev/null +++ b/mpi/reduce-scatter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/8_gpu_run.sh b/mpi/reduce-scatter/8_gpu_run.sh new file mode 100644 index 0000000..5f8f10e --- /dev/null +++ b/mpi/reduce-scatter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export 
FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/Makefile b/nccl/Makefile new file mode 100644 index 0000000..5652112 --- /dev/null +++ b/nccl/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh new file mode 100644 index 0000000..e9fc3ae --- /dev/null +++ 
b/nccl/all-gather/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh new file mode 100644 index 0000000..a94a523 --- /dev/null +++ b/nccl/all-gather/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + 
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh new file mode 100644 index 0000000..f1ecd9f --- /dev/null +++ b/nccl/all-gather/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh new file mode 100644 index 0000000..357da9e --- /dev/null +++ b/nccl/all-gather/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH 
--ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh new file mode 100644 index 0000000..4bd249d --- /dev/null +++ b/nccl/all-gather/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS 
$MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh new file mode 100644 index 0000000..0e1358b --- /dev/null +++ b/nccl/all-reduce/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 4096)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh new file mode 100644 index 0000000..6553e02 --- /dev/null +++ b/nccl/all-reduce/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) 
+export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 4096)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh new file mode 100644 index 0000000..b672e7c --- /dev/null +++ b/nccl/all-reduce/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh new file mode 100644 index 0000000..fc0416c --- /dev/null +++ b/nccl/all-reduce/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh new file mode 100644 index 0000000..d9c0ef6 --- /dev/null +++ b/nccl/all-reduce/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export 
NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh new file mode 100644 index 0000000..fa2199a --- /dev/null +++ b/nccl/reduce-scatter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 4096)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git 
a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh new file mode 100644 index 0000000..2edffa6 --- /dev/null +++ b/nccl/reduce-scatter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 4096)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh new file mode 100644 index 0000000..3d297ff --- /dev/null +++ b/nccl/reduce-scatter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export 
NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh new file mode 100644 index 0000000..6bbf97a --- /dev/null +++ b/nccl/reduce-scatter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh new file mode 100644 
index 0000000..21c0dc4 --- /dev/null +++ b/nccl/reduce-scatter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/Makefile b/rccl/Makefile similarity index 57% rename from Makefile rename to rccl/Makefile index 52c0235..590dee7 100644 --- a/Makefile +++ b/rccl/Makefile @@ -15,10 +15,16 @@ INC = -I${ROCM_PATH}/include CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl -all: allgather.x +all: allgather.x allreduce.x reduce_scatter.x -allgather.x: allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: 
../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu clean: - rm -f allgather.x + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 820cf4f..f824072 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int64_t i = 0; i < *len; i++) { + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM From c7bb21719e15613e832c4cb57bad340a54eef9e6 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 16:57:43 -0700 Subject: [PATCH 29/52] add results so far --- nccl/all-gather/benchmarks/16_gpu.txt | 13 +++++++++++++ nccl/all-gather/benchmarks/8_gpu.txt | 13 +++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 nccl/all-gather/benchmarks/16_gpu.txt create mode 100644 nccl/all-gather/benchmarks/8_gpu.txt diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt new file mode 100644 index 0000000..73e83d9 --- /dev/null +++ b/nccl/all-gather/benchmarks/16_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 4096 +Number of GPUs: 16 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000532 seconds +4194304 0.000982 seconds +8388608 0.001976 seconds +16777216 0.003447 seconds +33554432 0.006826 seconds +67108864 0.013190 seconds +134217728 0.026196 seconds +268435456 0.052567 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt new file mode 100644 index 0000000..1c654f3 --- /dev/null +++ 
b/nccl/all-gather/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000286 seconds +4194304 0.000523 seconds +8388608 0.000954 seconds +16777216 0.001696 seconds +33554432 0.003150 seconds +67108864 0.006500 seconds +134217728 0.012278 seconds +268435456 0.024449 seconds From cb99cadc5f0674137b3225f923061337c2ab8002 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 17:13:57 -0700 Subject: [PATCH 30/52] add results so far --- mpi/all-gather/benchmarks/128_gpu.txt | 12 ++++++++++++ mpi/all-gather/benchmarks/32_gpu.txt | 14 ++++++++++++++ mpi/all-gather/benchmarks/64_gpu.txt | 13 +++++++++++++ nccl/all-gather/benchmarks/128_gpu.txt | 13 +++++++++++++ nccl/all-gather/benchmarks/32_gpu.txt | 14 ++++++++++++++ nccl/all-gather/benchmarks/64_gpu.txt | 13 +++++++++++++ 6 files changed, 79 insertions(+) create mode 100644 mpi/all-gather/benchmarks/128_gpu.txt create mode 100644 mpi/all-gather/benchmarks/32_gpu.txt create mode 100644 mpi/all-gather/benchmarks/64_gpu.txt create mode 100644 nccl/all-gather/benchmarks/128_gpu.txt create mode 100644 nccl/all-gather/benchmarks/32_gpu.txt create mode 100644 nccl/all-gather/benchmarks/64_gpu.txt diff --git a/mpi/all-gather/benchmarks/128_gpu.txt b/mpi/all-gather/benchmarks/128_gpu.txt new file mode 100644 index 0000000..3787302 --- /dev/null +++ b/mpi/all-gather/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 16 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 262144 - 16777216 +Number of iterations: 10 +262144 0.003218 seconds +524288 0.005101 seconds +1048576 0.008701 seconds +2097152 0.015526 seconds +4194304 0.030239 seconds +8388608 0.060280 seconds +16777216 0.189415 seconds diff --git a/mpi/all-gather/benchmarks/32_gpu.txt b/mpi/all-gather/benchmarks/32_gpu.txt new file mode 100644 index 0000000..0e15475 --- /dev/null +++ 
b/mpi/all-gather/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000730 seconds +524288 0.001367 seconds +1048576 0.002650 seconds +2097152 0.003740 seconds +4194304 0.007503 seconds +8388608 0.014208 seconds +16777216 0.029923 seconds +33554432 0.061970 seconds +67108864 0.168545 seconds diff --git a/mpi/all-gather/benchmarks/64_gpu.txt b/mpi/all-gather/benchmarks/64_gpu.txt new file mode 100644 index 0000000..ed700b9 --- /dev/null +++ b/mpi/all-gather/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001561 seconds +524288 0.002915 seconds +1048576 0.004163 seconds +2097152 0.007885 seconds +4194304 0.014989 seconds +8388608 0.029413 seconds +16777216 0.063034 seconds +33554432 0.183096 seconds diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt new file mode 100644 index 0000000..c84792c --- /dev/null +++ b/nccl/all-gather/benchmarks/128_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 4096 +Number of GPUs: 128 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.002247 seconds +524288 0.002277 seconds +1048576 0.002775 seconds +2097152 0.004497 seconds +4194304 0.007477 seconds +8388608 0.015057 seconds +16777216 0.028550 seconds +33554432 0.056270 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt new file mode 100644 index 0000000..72f0d07 --- /dev/null +++ b/nccl/all-gather/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000622 seconds +524288 0.000577 seconds +1048576 0.000780 seconds +2097152 0.001190 seconds +4194304 0.002041 seconds +8388608 
0.003571 seconds +16777216 0.006995 seconds +33554432 0.013830 seconds +67108864 0.027698 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt new file mode 100644 index 0000000..db7919c --- /dev/null +++ b/nccl/all-gather/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001077 seconds +524288 0.001154 seconds +1048576 0.001399 seconds +2097152 0.002078 seconds +4194304 0.003777 seconds +8388608 0.007711 seconds +16777216 0.014418 seconds +33554432 0.028471 seconds From d2d2bbc6b5b7a079e8af857852904d19196b5230 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 17:56:42 -0700 Subject: [PATCH 31/52] change atoi to strtoll --- allreduce.cu | 4 ++-- reduce_scatter.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/allreduce.cu b/allreduce.cu index ddbfb97..7fdf2b9 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int64_t min_msg_size = atoi(argv[2]); - int64_t max_msg_size = atoi(argv[3]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { diff --git a/reduce_scatter.cu b/reduce_scatter.cu index f824072..1853aed 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int64_t min_msg_size = atoi(argv[2]); - int64_t max_msg_size = atoi(argv[3]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { From 
74cfdd8c32a705f2a74d82a9ae5ec230060aa317 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 17:59:18 -0700 Subject: [PATCH 32/52] change atoi to strtoll --- reduce_scatter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index f824072..1853aed 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int64_t min_msg_size = atoi(argv[2]); - int64_t max_msg_size = atoi(argv[3]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { From ccb73ae591a813dd10cacc35be6cb6b44bd3ef60 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sun, 14 Apr 2024 13:04:35 -0700 Subject: [PATCH 33/52] add all perlmutter code and benchmark data --- Makefile | 18 -- README | 9 - README.md | 15 + allgather.cu | 123 ++++++--- allreduce.cu | 261 ++++++++++++++++++ mpi/Makefile | 30 ++ mpi/all-gather/perlmutter/128_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/16_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/32_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/64_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/8_gpu_run.sh | 37 +++ .../perlmutter/benchmarks/128_gpu.txt | 12 + .../perlmutter/benchmarks/16_gpu.txt | 12 + .../perlmutter/benchmarks/32_gpu.txt | 14 + .../perlmutter/benchmarks/64_gpu.txt | 13 + .../perlmutter/benchmarks/8_gpu.txt | 13 + mpi/all-reduce/perlmutter/128_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/16_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/32_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/64_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/8_gpu_run.sh | 37 +++ .../perlmutter/benchmarks/128_gpu.txt | 11 + .../perlmutter/benchmarks/16_gpu.txt | 11 + .../perlmutter/benchmarks/32_gpu.txt | 13 + .../perlmutter/benchmarks/64_gpu.txt 
| 12 + .../perlmutter/benchmarks/8_gpu.txt | 12 + mpi/reduce-scatter/perlmutter/128_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/16_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/32_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/64_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/8_gpu_run.sh | 37 +++ .../perlmutter/benchmarks/128_gpu.txt | 12 + .../perlmutter/benchmarks/16_gpu.txt | 12 + .../perlmutter/benchmarks/32_gpu.txt | 14 + .../perlmutter/benchmarks/64_gpu.txt | 13 + .../perlmutter/benchmarks/8_gpu.txt | 13 + nccl/Makefile | 30 ++ nccl/all-gather/128_gpu_run.sh | 37 +++ nccl/all-gather/16_gpu_run.sh | 37 +++ nccl/all-gather/32_gpu_run.sh | 37 +++ nccl/all-gather/64_gpu_run.sh | 37 +++ nccl/all-gather/8_gpu_run.sh | 37 +++ nccl/all-gather/benchmarks/128_gpu.txt | 13 + nccl/all-gather/benchmarks/16_gpu.txt | 13 + nccl/all-gather/benchmarks/32_gpu.txt | 14 + nccl/all-gather/benchmarks/64_gpu.txt | 13 + nccl/all-gather/benchmarks/8_gpu.txt | 13 + nccl/all-reduce/128_gpu_run.sh | 37 +++ nccl/all-reduce/16_gpu_run.sh | 37 +++ nccl/all-reduce/32_gpu_run.sh | 37 +++ nccl/all-reduce/64_gpu_run.sh | 37 +++ nccl/all-reduce/8_gpu_run.sh | 37 +++ nccl/all-reduce/benchmarks/128_gpu.txt | 12 + nccl/all-reduce/benchmarks/16_gpu.txt | 12 + nccl/all-reduce/benchmarks/32_gpu.txt | 14 + nccl/all-reduce/benchmarks/64_gpu.txt | 13 + nccl/all-reduce/benchmarks/8_gpu.txt | 13 + nccl/reduce-scatter/128_gpu_run.sh | 37 +++ nccl/reduce-scatter/16_gpu_run.sh | 37 +++ nccl/reduce-scatter/32_gpu_run.sh | 37 +++ nccl/reduce-scatter/64_gpu_run.sh | 37 +++ nccl/reduce-scatter/8_gpu_run.sh | 37 +++ nccl/reduce-scatter/benchmarks/128_gpu.txt | 12 + nccl/reduce-scatter/benchmarks/16_gpu.txt | 12 + nccl/reduce-scatter/benchmarks/32_gpu.txt | 14 + nccl/reduce-scatter/benchmarks/64_gpu.txt | 13 + nccl/reduce-scatter/benchmarks/8_gpu.txt | 13 + 67 files changed, 1908 insertions(+), 69 deletions(-) delete mode 100644 Makefile delete mode 100644 README create mode 100644 README.md 
create mode 100644 allreduce.cu create mode 100644 mpi/Makefile create mode 100644 mpi/all-gather/perlmutter/128_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/16_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/32_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/64_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/8_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/128_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/16_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/32_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/64_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/8_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/128_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/16_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/32_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/64_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/8_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt 
create mode 100644 nccl/Makefile create mode 100644 nccl/all-gather/128_gpu_run.sh create mode 100644 nccl/all-gather/16_gpu_run.sh create mode 100644 nccl/all-gather/32_gpu_run.sh create mode 100644 nccl/all-gather/64_gpu_run.sh create mode 100644 nccl/all-gather/8_gpu_run.sh create mode 100644 nccl/all-gather/benchmarks/128_gpu.txt create mode 100644 nccl/all-gather/benchmarks/16_gpu.txt create mode 100644 nccl/all-gather/benchmarks/32_gpu.txt create mode 100644 nccl/all-gather/benchmarks/64_gpu.txt create mode 100644 nccl/all-gather/benchmarks/8_gpu.txt create mode 100644 nccl/all-reduce/128_gpu_run.sh create mode 100644 nccl/all-reduce/16_gpu_run.sh create mode 100644 nccl/all-reduce/32_gpu_run.sh create mode 100644 nccl/all-reduce/64_gpu_run.sh create mode 100644 nccl/all-reduce/8_gpu_run.sh create mode 100644 nccl/all-reduce/benchmarks/128_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/16_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/8_gpu.txt create mode 100644 nccl/reduce-scatter/128_gpu_run.sh create mode 100644 nccl/reduce-scatter/16_gpu_run.sh create mode 100644 nccl/reduce-scatter/32_gpu_run.sh create mode 100644 nccl/reduce-scatter/64_gpu_run.sh create mode 100644 nccl/reduce-scatter/8_gpu_run.sh create mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt diff --git a/Makefile b/Makefile deleted file mode 100644 index df453b4..0000000 --- a/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: MIT - -CC = cc -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - - -all: allgather.x - -allgather.x: allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu - -clean: - rm -f allgather.x diff --git a/README b/README deleted file mode 100644 index eba2046..0000000 --- a/README +++ /dev/null @@ -1,9 +0,0 @@ -Before compiling do these: - -module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 -export CRAY_ACCEL_TARGET=nvidia80 - -When running do these: - -module load cudatoolkit -export MPICH_GPU_SUPPORT_ENABLED=1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..a1fdcdb --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +Before compiling do these: + +### Perlmutter +```sh +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl/2.19.4 +export CRAY_ACCEL_TARGET=nvidia80 +export MPICH_GPU_SUPPORT_ENABLED=1 +``` +### Frontier +```sh +module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" +``` + diff --git a/allgather.cu b/allgather.cu index 5953041..8ae7481 100644 --- a/allgather.cu +++ b/allgather.cu @@ -8,24 +8,30 @@ #include #include #include +#include #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ 
-40,6 +46,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +65,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -62,8 +83,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -86,33 +107,49 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size * num_gpus; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size * num_gpus; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 
1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); MPI_Type_commit(&mpi_type_bfloat16); - #elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -125,13 +162,8 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather @@ -142,13 +174,13 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number 
of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -156,12 +188,14 @@ int main(int argc, char *argv[]) { d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } @@ -172,34 +206,39 @@ int main(int argc, char *argv[]) { start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, - d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); 
#endif } MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #ifdef defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..7fdf2b9 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,261 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + 
ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + 
+ if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + 
#endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #ifdef defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/mpi/Makefile b/mpi/Makefile new file mode 100644 index 0000000..782a6bf --- /dev/null +++ b/mpi/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..710a399 --- 
/dev/null +++ b/mpi/all-gather/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..d4d984e --- /dev/null +++ b/mpi/all-gather/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export 
FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..d2f1b0d --- /dev/null +++ b/mpi/all-gather/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..515d667 --- /dev/null +++ 
b/mpi/all-gather/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..210ea3d --- /dev/null +++ b/mpi/all-gather/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export 
FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..3787302 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 16 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 262144 - 16777216 +Number of iterations: 10 +262144 0.003218 seconds +524288 0.005101 seconds +1048576 0.008701 seconds +2097152 0.015526 seconds +4194304 0.030239 seconds +8388608 0.060280 seconds +16777216 0.189415 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..b69654b --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 128 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 2097152 - 134217728 +Number of iterations: 10 +2097152 0.002391 seconds +4194304 0.003558 seconds +8388608 0.007162 seconds +16777216 0.014929 seconds +33554432 0.030427 seconds +67108864 0.062092 seconds +134217728 0.151508 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..0e15475 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000730 
seconds +524288 0.001367 seconds +1048576 0.002650 seconds +2097152 0.003740 seconds +4194304 0.007503 seconds +8388608 0.014208 seconds +16777216 0.029923 seconds +33554432 0.061970 seconds +67108864 0.168545 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..ed700b9 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001561 seconds +524288 0.002915 seconds +1048576 0.004163 seconds +2097152 0.007885 seconds +4194304 0.014989 seconds +8388608 0.029413 seconds +16777216 0.063034 seconds +33554432 0.183096 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..de3a837 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000838 seconds +4194304 0.001719 seconds +8388608 0.003172 seconds +16777216 0.006797 seconds +33554432 0.013860 seconds +67108864 0.027938 seconds +134217728 0.055353 seconds +268435456 0.104310 seconds diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..33729eb --- /dev/null +++ b/mpi/all-reduce/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export 
CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..dc30279 --- /dev/null +++ b/mpi/all-reduce/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt" + +echo 
$run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..be73564 --- /dev/null +++ b/mpi/all-reduce/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..cf714da --- /dev/null +++ b/mpi/all-reduce/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export 
NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..49ff135 --- /dev/null +++ b/mpi/all-reduce/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git 
a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..4e3e17d --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 128 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.264543 seconds +67108864 0.527909 seconds +134217728 1.092095 seconds +268435456 3.194094 seconds +536870912 6.415718 seconds +1073741824 12.819154 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..b377ec2 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 16 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.142677 seconds +67108864 0.324897 seconds +134217728 0.673650 seconds +268435456 2.140369 seconds +536870912 4.318430 seconds +1073741824 8.632880 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..cda53bf --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 32 +Message size range: 8388608 - 1073741824 +Number of iterations: 10 +8388608 0.049975 seconds +16777216 0.092395 seconds +33554432 0.181888 seconds +67108864 0.368241 seconds +134217728 0.774021 seconds +268435456 2.362729 seconds +536870912 4.760279 seconds +1073741824 9.524390 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..341fc93 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 64 +Message size 
range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.111867 seconds +33554432 0.230462 seconds +67108864 0.465838 seconds +134217728 0.970915 seconds +268435456 2.875694 seconds +536870912 5.771569 seconds +1073741824 11.522959 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..05fd1e8 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 8 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.058292 seconds +33554432 0.107128 seconds +67108864 0.211506 seconds +134217728 0.491929 seconds +268435456 1.508757 seconds +536870912 3.052047 seconds +1073741824 6.103450 seconds diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..469aeaf --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores 
--gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..e66b9f4 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..07d6020 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 30:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES 
+GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..e51945a --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 
32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..1b51537 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 30:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..d696072 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 3.352414 seconds +67108864 3.323000 seconds +134217728 3.331817 seconds +268435456 
3.327162 seconds +536870912 3.345694 seconds +1073741824 3.326455 seconds +2147483648 3.321790 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..b71477d --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 3.368300 seconds +67108864 3.361940 seconds +134217728 3.367816 seconds +268435456 3.360722 seconds +536870912 3.363088 seconds +1073741824 3.392373 seconds +2147483648 3.375325 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..38e09b1 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 3.368554 seconds +16777216 3.367485 seconds +33554432 3.376475 seconds +67108864 3.381592 seconds +134217728 3.384111 seconds +268435456 3.375780 seconds +536870912 3.371542 seconds +1073741824 3.379895 seconds +2147483648 3.381470 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..d982100 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 2.220629 seconds +33554432 2.201147 seconds +67108864 2.196879 seconds +134217728 2.199449 seconds +268435456 2.194973 seconds +536870912 2.196809 seconds +1073741824 2.196212 seconds +2147483648 2.201029 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt 
b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..d2bdd9a --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 3.558431 seconds +33554432 3.553477 seconds +67108864 3.562137 seconds +134217728 3.556267 seconds +268435456 3.551567 seconds +536870912 3.599067 seconds +1073741824 3.608635 seconds +2147483648 3.624090 seconds diff --git a/nccl/Makefile b/nccl/Makefile new file mode 100644 index 0000000..5652112 --- /dev/null +++ b/nccl/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh new file mode 100644 index 0000000..e9fc3ae --- /dev/null +++ 
b/nccl/all-gather/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh new file mode 100644 index 0000000..a94a523 --- /dev/null +++ b/nccl/all-gather/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + 
+MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh new file mode 100644 index 0000000..f1ecd9f --- /dev/null +++ b/nccl/all-gather/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh new file mode 100644 index 0000000..357da9e --- /dev/null +++ b/nccl/all-gather/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH 
--ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh new file mode 100644 index 0000000..4bd249d --- /dev/null +++ b/nccl/all-gather/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS 
$MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt new file mode 100644 index 0000000..c84792c --- /dev/null +++ b/nccl/all-gather/benchmarks/128_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 4096 +Number of GPUs: 128 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.002247 seconds +524288 0.002277 seconds +1048576 0.002775 seconds +2097152 0.004497 seconds +4194304 0.007477 seconds +8388608 0.015057 seconds +16777216 0.028550 seconds +33554432 0.056270 seconds diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt new file mode 100644 index 0000000..73e83d9 --- /dev/null +++ b/nccl/all-gather/benchmarks/16_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 4096 +Number of GPUs: 16 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000532 seconds +4194304 0.000982 seconds +8388608 0.001976 seconds +16777216 0.003447 seconds +33554432 0.006826 seconds +67108864 0.013190 seconds +134217728 0.026196 seconds +268435456 0.052567 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt new file mode 100644 index 0000000..72f0d07 --- /dev/null +++ b/nccl/all-gather/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000622 seconds +524288 0.000577 seconds +1048576 0.000780 seconds +2097152 0.001190 seconds +4194304 0.002041 seconds +8388608 0.003571 seconds +16777216 0.006995 seconds +33554432 0.013830 seconds +67108864 0.027698 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt 
b/nccl/all-gather/benchmarks/64_gpu.txt new file mode 100644 index 0000000..db7919c --- /dev/null +++ b/nccl/all-gather/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001077 seconds +524288 0.001154 seconds +1048576 0.001399 seconds +2097152 0.002078 seconds +4194304 0.003777 seconds +8388608 0.007711 seconds +16777216 0.014418 seconds +33554432 0.028471 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt new file mode 100644 index 0000000..1c654f3 --- /dev/null +++ b/nccl/all-gather/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000286 seconds +4194304 0.000523 seconds +8388608 0.000954 seconds +16777216 0.001696 seconds +33554432 0.003150 seconds +67108864 0.006500 seconds +134217728 0.012278 seconds +268435456 0.024449 seconds diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh new file mode 100644 index 0000000..623f0c2 --- /dev/null +++ b/nccl/all-reduce/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 
2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh new file mode 100644 index 0000000..af689e9 --- /dev/null +++ b/nccl/all-reduce/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh new file mode 100644 index 0000000..b672e7c --- /dev/null +++ b/nccl/all-reduce/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + 
+export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh new file mode 100644 index 0000000..fc0416c --- /dev/null +++ b/nccl/all-reduce/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 
--cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh new file mode 100644 index 0000000..d9c0ef6 --- /dev/null +++ b/nccl/all-reduce/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt new file mode 100644 index 0000000..c8bc5f3 --- /dev/null +++ b/nccl/all-reduce/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002252 seconds +67108864 0.003958 seconds +134217728 0.005696 seconds +268435456 0.008861 seconds +536870912 0.016701 seconds +1073741824 0.035052 seconds +2147483648 0.069582 seconds diff --git 
a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt new file mode 100644 index 0000000..8199a8f --- /dev/null +++ b/nccl/all-reduce/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.000971 seconds +67108864 0.001813 seconds +134217728 0.003415 seconds +268435456 0.007049 seconds +536870912 0.013323 seconds +1073741824 0.026322 seconds +2147483648 0.052252 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt new file mode 100644 index 0000000..fa6e736 --- /dev/null +++ b/nccl/all-reduce/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.000589 seconds +16777216 0.001015 seconds +33554432 0.001352 seconds +67108864 0.002146 seconds +134217728 0.003621 seconds +268435456 0.006997 seconds +536870912 0.013742 seconds +1073741824 0.027021 seconds +2147483648 0.054364 seconds diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt new file mode 100644 index 0000000..a773bf1 --- /dev/null +++ b/nccl/all-reduce/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001196 seconds +33554432 0.001740 seconds +67108864 0.002970 seconds +134217728 0.004544 seconds +268435456 0.008213 seconds +536870912 0.017505 seconds +1073741824 0.035188 seconds +2147483648 0.069951 seconds diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt new file mode 100644 index 0000000..4d60f0f --- /dev/null +++ b/nccl/all-reduce/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 
2147483648 +Number of iterations: 10 +16777216 0.000511 seconds +33554432 0.000916 seconds +67108864 0.001663 seconds +134217728 0.003137 seconds +268435456 0.006408 seconds +536870912 0.012493 seconds +1073741824 0.024300 seconds +2147483648 0.048155 seconds diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh new file mode 100644 index 0000000..8590821 --- /dev/null +++ b/nccl/reduce-scatter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh new file mode 100644 index 0000000..7a20fa6 --- /dev/null +++ b/nccl/reduce-scatter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export 
CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh new file mode 100644 index 0000000..3d297ff --- /dev/null +++ b/nccl/reduce-scatter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N 
$NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh new file mode 100644 index 0000000..6bbf97a --- /dev/null +++ b/nccl/reduce-scatter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh new file mode 100644 index 0000000..21c0dc4 --- /dev/null +++ b/nccl/reduce-scatter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export 
WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..7c1c8f9 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.028300 seconds +67108864 0.028351 seconds +134217728 0.028351 seconds +268435456 0.028502 seconds +536870912 0.028579 seconds +1073741824 0.028650 seconds +2147483648 0.028506 seconds diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..14acf87 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.033170 seconds +67108864 0.033280 seconds +134217728 0.033220 seconds +268435456 0.033291 seconds +536870912 0.033217 seconds +1073741824 0.033158 seconds +2147483648 0.033275 seconds diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt 
b/nccl/reduce-scatter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..7eecc67 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.027121 seconds +16777216 0.027661 seconds +33554432 0.027766 seconds +67108864 0.027992 seconds +134217728 0.027914 seconds +268435456 0.027912 seconds +536870912 0.027777 seconds +1073741824 0.027861 seconds +2147483648 0.027551 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..8f8ddd0 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.028306 seconds +33554432 0.028511 seconds +67108864 0.028175 seconds +134217728 0.027998 seconds +268435456 0.027883 seconds +536870912 0.027802 seconds +1073741824 0.027954 seconds +2147483648 0.028085 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..26c22b6 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.024231 seconds +33554432 0.024389 seconds +67108864 0.024167 seconds +134217728 0.024047 seconds +268435456 0.024293 seconds +536870912 0.024031 seconds +1073741824 0.024048 seconds +2147483648 0.024241 seconds From fd73957a5b1685a72da8dd5433583f11c1d8e7e2 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 14 Apr 2024 16:13:32 -0400 Subject: [PATCH 34/52] add frontier code and benchmark results so far --- README.md | 6 ++- allgather.cu | 1 + allreduce.cu | 1 + mpi/Makefile | 12 +++--- 
mpi/all-gather/128_gpu_run.sh | 37 ------------------ mpi/all-gather/16_gpu_run.sh | 37 ------------------ mpi/all-gather/32_gpu_run.sh | 37 ------------------ mpi/all-gather/64_gpu_run.sh | 37 ------------------ mpi/all-gather/8_gpu_run.sh | 37 ------------------ mpi/all-gather/allgather.x | Bin 0 -> 25696 bytes mpi/all-gather/benchmarks/128_gpu.txt | 12 ------ mpi/all-gather/benchmarks/16_gpu.txt | 12 ------ mpi/all-gather/benchmarks/32_gpu.txt | 14 ------- mpi/all-gather/benchmarks/64_gpu.txt | 13 ------ mpi/all-gather/benchmarks/8_gpu.txt | 13 ------ mpi/all-gather/frontier/128_gcd_run.sh | 21 ++++++++++ mpi/all-gather/frontier/16_gcd_run.sh | 21 ++++++++++ mpi/all-gather/frontier/32_gcd_run.sh | 21 ++++++++++ mpi/all-gather/frontier/64_gcd_run.sh | 21 ++++++++++ mpi/all-gather/frontier/8_gcd_run.sh | 21 ++++++++++ mpi/all-gather/frontier/benchmarks/16_gcd.txt | 13 ++++++ mpi/all-gather/frontier/benchmarks/32_gcd.txt | 15 +++++++ mpi/all-gather/frontier/benchmarks/8_gcd.txt | 14 +++++++ mpi/all-reduce/128_gpu_run.sh | 37 ------------------ mpi/all-reduce/16_gpu_run.sh | 37 ------------------ mpi/all-reduce/32_gpu_run.sh | 37 ------------------ mpi/all-reduce/64_gpu_run.sh | 37 ------------------ mpi/all-reduce/allreduce.x | Bin 0 -> 25832 bytes mpi/all-reduce/frontier/8_gcd_run.sh | 21 ++++++++++ mpi/all-reduce/frontier/benchmarks/8_gcd.txt | 13 ++++++ mpi/reduce-scatter/128_gpu_run.sh | 37 ------------------ mpi/reduce-scatter/16_gpu_run.sh | 37 ------------------ mpi/reduce-scatter/32_gpu_run.sh | 37 ------------------ mpi/reduce-scatter/64_gpu_run.sh | 37 ------------------ mpi/reduce-scatter/8_gpu_run.sh | 37 ------------------ mpi/reduce-scatter/frontier/8_gcd_run.sh | 21 ++++++++++ .../frontier/benchmarks/8_gcd.txt | 14 +++++++ mpi/reduce-scatter/reduce_scatter.x | Bin 0 -> 25888 bytes nccl/Makefile | 30 -------------- nccl/all-gather/128_gpu_run.sh | 37 ------------------ nccl/all-gather/16_gpu_run.sh | 37 ------------------ nccl/all-gather/32_gpu_run.sh 
| 37 ------------------ nccl/all-gather/64_gpu_run.sh | 37 ------------------ nccl/all-gather/8_gpu_run.sh | 37 ------------------ nccl/all-gather/benchmarks/128_gpu.txt | 13 ------ nccl/all-gather/benchmarks/16_gpu.txt | 13 ------ nccl/all-gather/benchmarks/32_gpu.txt | 14 ------- nccl/all-gather/benchmarks/64_gpu.txt | 13 ------ nccl/all-gather/benchmarks/8_gpu.txt | 13 ------ nccl/all-reduce/128_gpu_run.sh | 37 ------------------ nccl/all-reduce/16_gpu_run.sh | 37 ------------------ nccl/all-reduce/32_gpu_run.sh | 37 ------------------ nccl/all-reduce/64_gpu_run.sh | 37 ------------------ nccl/all-reduce/8_gpu_run.sh | 37 ------------------ nccl/reduce-scatter/128_gpu_run.sh | 37 ------------------ nccl/reduce-scatter/16_gpu_run.sh | 37 ------------------ nccl/reduce-scatter/32_gpu_run.sh | 37 ------------------ nccl/reduce-scatter/64_gpu_run.sh | 37 ------------------ nccl/reduce-scatter/8_gpu_run.sh | 37 ------------------ rccl/all-gather/allgather.x | Bin 0 -> 25736 bytes rccl/all-reduce/allreduce.x | Bin 0 -> 25840 bytes rccl/reduce-scatter/reduce_scatter.x | Bin 0 -> 25848 bytes reduce_scatter.cu | 1 + 63 files changed, 229 insertions(+), 1241 deletions(-) delete mode 100644 mpi/all-gather/128_gpu_run.sh delete mode 100644 mpi/all-gather/16_gpu_run.sh delete mode 100644 mpi/all-gather/32_gpu_run.sh delete mode 100644 mpi/all-gather/64_gpu_run.sh delete mode 100644 mpi/all-gather/8_gpu_run.sh create mode 100755 mpi/all-gather/allgather.x delete mode 100644 mpi/all-gather/benchmarks/128_gpu.txt delete mode 100644 mpi/all-gather/benchmarks/16_gpu.txt delete mode 100644 mpi/all-gather/benchmarks/32_gpu.txt delete mode 100644 mpi/all-gather/benchmarks/64_gpu.txt delete mode 100644 mpi/all-gather/benchmarks/8_gpu.txt create mode 100644 mpi/all-gather/frontier/128_gcd_run.sh create mode 100644 mpi/all-gather/frontier/16_gcd_run.sh create mode 100644 mpi/all-gather/frontier/32_gcd_run.sh create mode 100644 mpi/all-gather/frontier/64_gcd_run.sh create mode 100644 
mpi/all-gather/frontier/8_gcd_run.sh create mode 100644 mpi/all-gather/frontier/benchmarks/16_gcd.txt create mode 100644 mpi/all-gather/frontier/benchmarks/32_gcd.txt create mode 100644 mpi/all-gather/frontier/benchmarks/8_gcd.txt delete mode 100644 mpi/all-reduce/128_gpu_run.sh delete mode 100644 mpi/all-reduce/16_gpu_run.sh delete mode 100644 mpi/all-reduce/32_gpu_run.sh delete mode 100644 mpi/all-reduce/64_gpu_run.sh create mode 100755 mpi/all-reduce/allreduce.x create mode 100644 mpi/all-reduce/frontier/8_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/benchmarks/8_gcd.txt delete mode 100644 mpi/reduce-scatter/128_gpu_run.sh delete mode 100644 mpi/reduce-scatter/16_gpu_run.sh delete mode 100644 mpi/reduce-scatter/32_gpu_run.sh delete mode 100644 mpi/reduce-scatter/64_gpu_run.sh delete mode 100644 mpi/reduce-scatter/8_gpu_run.sh create mode 100644 mpi/reduce-scatter/frontier/8_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt create mode 100755 mpi/reduce-scatter/reduce_scatter.x delete mode 100644 nccl/Makefile delete mode 100644 nccl/all-gather/128_gpu_run.sh delete mode 100644 nccl/all-gather/16_gpu_run.sh delete mode 100644 nccl/all-gather/32_gpu_run.sh delete mode 100644 nccl/all-gather/64_gpu_run.sh delete mode 100644 nccl/all-gather/8_gpu_run.sh delete mode 100644 nccl/all-gather/benchmarks/128_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/16_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/32_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/64_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/8_gpu.txt delete mode 100644 nccl/all-reduce/128_gpu_run.sh delete mode 100644 nccl/all-reduce/16_gpu_run.sh delete mode 100644 nccl/all-reduce/32_gpu_run.sh delete mode 100644 nccl/all-reduce/64_gpu_run.sh delete mode 100644 nccl/all-reduce/8_gpu_run.sh delete mode 100644 nccl/reduce-scatter/128_gpu_run.sh delete mode 100644 nccl/reduce-scatter/16_gpu_run.sh delete mode 100644 
nccl/reduce-scatter/32_gpu_run.sh delete mode 100644 nccl/reduce-scatter/64_gpu_run.sh delete mode 100644 nccl/reduce-scatter/8_gpu_run.sh create mode 100755 rccl/all-gather/allgather.x create mode 100755 rccl/all-reduce/allreduce.x create mode 100755 rccl/reduce-scatter/reduce_scatter.x diff --git a/README.md b/README.md index 396231b..a1fdcdb 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,14 @@ Before compiling do these: ### Perlmutter ```sh -module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl/2.19.4 export CRAY_ACCEL_TARGET=nvidia80 export MPICH_GPU_SUPPORT_ENABLED=1 ``` ### Frontier ```sh -module load PrgEnv-cray amd-mixed craype-accel-amd-gfx90a +module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" ``` + diff --git a/allgather.cu b/allgather.cu index 8ae7481..8c357bb 100644 --- a/allgather.cu +++ b/allgather.cu @@ -14,6 +14,7 @@ #include #define bfloat16 nv_bfloat16 #elif USE_ROCM + #define __HIP_PLATFORM_AMD__ #include #include #include diff --git a/allreduce.cu b/allreduce.cu index 7fdf2b9..111b254 100644 --- a/allreduce.cu +++ b/allreduce.cu @@ -14,6 +14,7 @@ #include #define bfloat16 nv_bfloat16 #elif USE_ROCM + #define __HIP_PLATFORM_AMD__ #include #include #include diff --git a/mpi/Makefile b/mpi/Makefile index 782a6bf..ba9d72b 100644 --- a/mpi/Makefile +++ b/mpi/Makefile @@ -6,14 +6,14 @@ CC = cc # perlmutter flags -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl +# INC = -I/global/common/software/nersc9/nccl/2.19.4/include +# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +# LDFLAGS = 
-L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl # frontier flags -# INC = -I${ROCM_PATH}/include -# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL -# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl +INC = -I${ROCM_PATH}/include +CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl all: allgather.x allreduce.x reduce_scatter.x diff --git a/mpi/all-gather/128_gpu_run.sh b/mpi/all-gather/128_gpu_run.sh deleted file mode 100644 index 3af373c..0000000 --- a/mpi/all-gather/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 16)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/16_gpu_run.sh b/mpi/all-gather/16_gpu_run.sh deleted file mode 100644 index 25d7b92..0000000 --- a/mpi/all-gather/16_gpu_run.sh +++ 
/dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 128)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/32_gpu_run.sh b/mpi/all-gather/32_gpu_run.sh deleted file mode 100644 index 3a03ef0..0000000 --- a/mpi/all-gather/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) 
# 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 64)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/64_gpu_run.sh b/mpi/all-gather/64_gpu_run.sh deleted file mode 100644 index 37ba334..0000000 --- a/mpi/all-gather/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/8_gpu_run.sh b/mpi/all-gather/8_gpu_run.sh deleted file mode 100644 index aa3e3a8..0000000 --- a/mpi/all-gather/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH 
--gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/allgather.x b/mpi/all-gather/allgather.x new file mode 100755 index 0000000000000000000000000000000000000000..03793882f6c87b1c4fbedcb5062d4db7921d7a3f GIT binary patch literal 25696 zcmeHP3v^UPny%ZOmjQB{r{H6}V1THk^MJ&FfaxR+9jBua67Yd`I{io*(+~TB!Bs#b z&a?q#*gcNx%+9dR?yP6lnPGQl9md@OA_|N<55UK8bY^76S2xH*MPPi8{ri_?G+^V`&w{O*YyK{+2QJ8clwwMw3gBco!>bcL@ky}Inq@LBW zV)(OIDJuY<$1y#=o@qQFX{AZWB`wlfW(Fm_h=`AvG#(@AC7feSQkg@dq&MU=5!C5p z6LeNmX3=@l%g41%#BUSjlXSPL%Smd~qcV2P(BnHAG#-$2|Ce;U`!bzMJt`LI^$NXS zp(p7!QEo|Py)1hx?xphOO7Do&8>goy>!nrbJtfNflttqLk~(#9CaJ8$8=*(}{Bs@Y z)3r(DXOp;pl4`=Bq%ysVoN9t1YjcXN`re)!ZIspUSlK`gU%? 
z3|}O{`CK09h+8{MWFk0i7@{*XK(#C>~E0TSJI3Ii8pyPxx-PMsNoIDsT&%#>(6p!5Q}m$1h^_ zZY_gR;2Q$p@79PlI1~D-fbp>K^U)1G~x`RGfAmk4N_4K-3{y@mx z6S&(4*6SNdtUC~C@dZ7RwG5%n~p$@^nYTAxb$GkH*72tfjTt)f@`Md7v>I47#H3&>cfuOqcbB6LIE` zz+BwVf_l=iIPxB4{=1@qxQ}Zrk;NjbZEGVwmnR&G2f7mBM2rgvEEq(B!kFLRlZbUQ zKNX8-2FAPG@ory~vn$;qZw>BfG$6rvAn0ShUPQTkUU%G$dhCqF^r{CG@>20ao?t}J zYb4Hav)n;1YC?Z#)KI0k=HcN%caa_=wQQ-gxuMZjZLPM}u!?XbUg3$l*H%P)6&{bT zqN>(fX|3czJbGO8C<=5gsEFNF;YWqWk+X_e#O=AG+!tCCtLSvcd>Iqvu^wNns;2yI zUuZ5FD6gvFw!H3WtlJ$bkK(SOJmqt(^Q@KS@mgO`bxm*dt}cI9&Fb2ks5jajjm^Dt zb!1+6bu1Rq)6Jc|a!-#t)KxK*Nf9hhgzgB1?+S_MXICh(Ad|z-7V?W*p$d$8JUvv0 zRm6y55noCC-I^1`~ zrn($|(twXQ*x7Etjq{Bi2At-I((N|jlOzbb-+)gx;0Fx&6a#+PfM009j~MW&20UfJ zry1~52ArN}(w#Hl7fX<+TO*&*J(p`I;)~oG5sqKVL%L^iYdk>mVkyJ zHXwzeYj6}%kGkW5a45F8m@VxIce;BtdKmLE@zNACXV^0W>S%5i zrP7N<2^$MFWLYm-i_Pi0q0!aSVrzA^*luzyVghMf*4*N1b1btjciiA?0^YdHb~8jE z-rDB6Vabw3)O?hS&DS_U>E4>qnZ2MDpuM0$(8oZZ1Kk1IhEDo(&>d*(-DofqF2h_9 zbSvlqP}#SYyO%Mg*P>iJwyHF0 zkP*E+8tTG-$>8M0c+{S5cO>6ZpU%Gpw@iI{w`1e(MA_c_FN0GY{rPhs z0VqDjvAO0ZU{FmT9~?~ez}hio%WZpv_B}(|3-{&+h4z{0y0!{S&n`qbxi8f~`RLDo zBNMfG(bvi10%`H*NQ;vfv@RnF%)z8wGb7;WP_xp2xHJsmcY;X{C z+$Tplo&1tcFMSHM>X2GCb1np$)Ur$enIf9h@JthF*wvGV{BTzFGHP9n`5o%I-`Ulb zyFYNMSMK$zkz*$aI-#yS{((cik~6QUE6)&f2GJjJwA@ws&>ZIq@ zNoOIJ{;ZX~8J#9Ik~-mcA{A&JC(RR0>Xg0E{?MjQ`V`@_Hnr@G+IAWwrMA86SJ%*m z0-rnydK&bs9Z`Op8hzOUq4Rz<`fCdnbJ53X6w{`@MrD2NB&5z0JnlJy?dthIf;&G5 zm2(gR2k%odg{m57K~>q+wlw6{9EIGmt$cZeRNel_$gi4GjK_>+@va10I{+p)*kKH6<7Z7Xc; zwzlPtzIS^a$q$nEl+JS`AgZb*l_2^EH{=X`!I`a(8{!+)`Hz)T2vPkGv4m}TO zs)Y7&XfGhmnfzFy{{qyJY%10IJ~pYFXx3}rcwBw>EE>Lj;~Dkg6BL;Lq574#5a_$7 zRAcJHuTrnrOj+8DETz_w353S(LS~v$>jhh>vm3z9UsYPEZcIU~C3(7e;7zT?bDj*$ zNtqDa-&8vGZ_k2D{yOz6rn$-AHmg58iL}PD#5_m;qDL1n=1hL%7&tm<=WL6{s&=QY zp)~pz-GdlM@+gn7=rPj~<4g{wuHf|dj)C-a^?@Tu8KC3f$>xD~OB2+yAEGiO|0sL* z>UYpm(PC%lbn-`UbNh34v9P0m+`>A>c5PKPwv%S}&){juQYlPUcTgz(SMbsJ zE`*Syf8x$xK@Lsw$xQsMJbv8St6{`KMk+E!z5}7u4>FQfBsu?A|H}cy!3PlP#C6oE 
zEY9Tl)Z>_9Z``eJe2j()fKwgG6RC{?{01LCAnE9jm)4M^kyoxifh(H5OFwpH;iop@hlbp#lm4?y3jpOrC*FARofuuI1>L zl#T}AO#oGcspB9m^Utqqa`fL*T9-NjmO_@)OIT1)uCEjsoSJ%8AeBH;2~ywf=zFgB zR@*AuZMNHOuG{uFH+xH0Igr3BQ)IF4gzUM5Ib_E|R-l6)U^a!G#$x^+@$>*4XT5xM#MAv5oYs(6CT9L3b^b8_=f1~az|nb{`HP%BM+=kv{U7|cxM zWS|KdauLG%@VabUnl%x5+^LY3%C*ATLG@)-&TTa=NXZ5E+HFQ#o;Q7^8no8?Q zL&I}s-HpTMN1s!&Ur?@j1tnIAyu1Jib4hpVAR3Bo@K=fd)YE$wc#7nCbEQl<)Q!LW zba3!}brsToU%k>S285k!(M_&;AJn*gpF?ot)Om zDI$)(dk-`9fkOzUe@uM8{jj4Sb9kD@_nm8u2l~#ns2eFqefOSbi7SP`Gvk5m4ALNg z{CH4a+@^8+?5C+3`s|-5$2@)ao??mHq&&Ts*|}cG&(-DY_t{Tn;<_Y>24jC(xuSZb&n=b|pS#jAL})tyM6?fL|0 z>Uq>2&zyxoZ5Ob?)OLVdw)|`lUpNSF*m4LAP2ZZ6zor?SbMwrHvBGS3Y%V9d+?M>b z8RO#>RQE-Wfltk;<*@D8cs#CEy(Y&m44)W8o02DOgOh&a=-Z<>=KnTv6ivS9*4yOB zhY4e5J&~RM2Ycv-!G3@KL3BY{Qke!1kp3PW=OHSn?gnfA70;-_nLmQBURI9~@iaEF z^bsoc%TR49|@fa6b zdIRV0CLRwEmX2^fN<6K=(;b{&O+38P%~# zzm9mU`B>V*`3B-kh-aK%K)g!)DNO1IXI2w$A^r&GuOyzDTKWLzXAqD3iSl#)BI4hWaY_x8+><(C@*O?hIC6AD$@PxWIHe9l?tlo0WSQ2Xahw_qIhhW1 z5OUj18mID|zO+vrin~IM6Y3D;eklxpcz!TU{DA_E2-AJM_iE5X(+Px=<&f{V<##ml z9kz_821l-2=u-nDC*$RB3r&?8$Em}STP65YgL;1IMSjo}dHyyD9zBoqvYvG9QGc`s zMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60N zV6+BCYhbhn{xWKyNc=9iy+Y#!suaJY$^GitrF>AtKPu>V1$|o3y@I|X=-Yyx5wxI6 zPiKmtvjnXbbcvuV1oaELM$iWZeN@oz3i`C5dj)+((6w2 zHew^1#<%2YYp@vuyCpPzE1{;9)wq2%)s=Pgz4Pa~E8W#q^E$8g&Gq}eHGa3dwytts z<-D4@T3=1w75Hu`5W2$`U7&GuPG^%a!j?8RF3`%b>HP|=s?s{wTB%i6R#!o_PAlWG z%^{EV3L&<xk;k!M{<8Ag?r$G7nnDIY>&>^FqgEo6NMe=L1NMA^!& zo;MI_D#|w_%6uC<)8Qr{dGk12NZ)AX>nTHMEX7;$E&?gI0|ImvvZA6pK`(d^Uh$X7 z<00}8wTb(5FHA}SHKQ@~MVeC9LBS_rNV$>+=_>+d4i8!orC2FgZ~z`%l*lWfW5>`3 zgG%M|2#?uA!72*!pXL?UVsYnWOHsaALSHT_jXTAV(*;>_$~T_^u3k%NHd2BW4J7I$CMSYEjB~#ZoX{nV^_UE-sl_G7;_8 zWM$@(YsV5uR>muU$56BdA8nZCA#%Z(YsNHBymp!iUt5?<*_bhl$C%Ol^JcN;iCl$Q z3fc>XEERRoZ>!weS#>|#G zeI}e&pj-|!W=0=6k>zn2%ivi6JMpa^L@{C2JUJiwIZ=z=*W z9Fw513rupvHNVIdQy#F#VcX1Z5-AMJNJSzR6(_EAc|39lc`On0>D$LEuC54#JUt1o zk1=15zkDd)I26F^^IN-RZOBe)-oq60g#G4F+&n?!e%u^dXr6H0(pAiqXMT&keOpas 
z)6B(Io?|{_e&OwZ^I{{V}SQ2jhv5Z&8;o z8m+vG|KCupmBZj7Y`lkFxgWG{J}M5!+4#T)_5GmbJ-$#^yn9h4phW1dK*(DjcXwiw zJ9u|Y9N&P`6tJ6rcqAGj4j;foBi3Dx9jI}ihjwOi!_h8ZIktb6W2+~mLUFPc#ZeI` zTosds%AmHvP_x1WbxFd99#2tZhG)`yxXtw2AwRK(< z<}Sko=*S10ybz(z7!P>Lqdpvtfu3Zz&t%=$SbL~|^zh|@;T(>;K#^(R?C{QxBO!QK zb4R;8Bgaxhq!!|evx@F;&{yH^Oaywo712mg4>YtlJDV!Ha9n|>yxKaqsse40iIXc} zc8weZT&_sem(?ggn;%{=NOD;F)oY8FC=rSUx7>wLSda z=oOBEbjZ;syW}JyM`bxW$fI{lA5zaJc&R5TO}EI^bDy3q?J^k2 zK)1HX+rM zQKYB#FegPE0yVM9Z_BV>UaF2Xz98bU+&`(}hO-(m7QS;`-|4qPVKm)}({ zBM@#iF0!-Dtsz8V-SzVDgv|G}89NYAOjDR#|4V#5aJdF87lwW>a5?Xi zi=+1h&Sq&`lAj%b&>37Bn}UX+<2(}Hg~;c4oYp88s{B z2I;&r>I2!XF9*Ig2mU?aBjtxy$|K<-ai~In1 za5N`-d;#sLe);r(t+ zLY$TWr}iWF!PKLjk-MDZdf4b+l>=v1%mjVW2(yN;NU+)(nz6v>Vpi|kP;70GS+Oj! ziWil`7*|whyRpE~rQ<*_&32Qj2t|702!;8HY*t^lIQOpG3rWc_YuwihDLxyw^0k+h zs|`(c8He?mKeG;-Y4McZ!2l-TU136|_* zY8NXx&~7?;kB4+BC;V9}7pycoaeiQkFJkNt*OFzn7Q4&Yyu8gtXA646`f604_+!iN|zOjTo55C*H3KN!9by~D;RV~kY>cVR6<^`zyfL=xvE_} zr$~ubHLjsWwLUL4!2GWB!xHJv$@BV_I&WyOIqA5b<@PpLo2|iVM-hiFz=aWRQ69Hw zw2^i8B2P#z?Y*(E3kU6av7-Bbekh~sma2J_b^U%ep^jRss_CS{PHNrps@k5v^m7|s zF3|>E{%{mGx`#SX_F;}AtagV%^ftid8s2H_*EgZ##$oh-9H!x! 
zE)4VB$@8fr0+9RCL^PL`VWPB8uMn5i-(=Fj4V{~;a~$HHoS#XA)*WMTNqxCLkDjOG zw&3|dj?$of%6T{5lyY|&e5^skQop`H<9K}mCzYQE&GEB4u7v)Ln z%l(uG#0JJENeC`GeOZ4i5l8wBBm^h-SN76PfumnW$dNu?f`6v|-N5L+RO2Ei=i#k* zPfzY+p^lx>9z1NnX-_fS78~m+Q{%FQ~sAx|AoGzT8h6 z`GWeppiB2o>dW=5JePi(Xn&$RW!kT-hyBo{`jh(deATJqr?oP~8J#S*#D0N*Q9rXj zt=A=FI#NDc|HUkQx&F5ZeW@oYrDb%-z!26az-)b)zHG+X=@&uSn18K9SoTm3{p;w#2)8xMc((q30~vNPh5!Hn literal 0 HcmV?d00001 diff --git a/mpi/all-gather/benchmarks/128_gpu.txt b/mpi/all-gather/benchmarks/128_gpu.txt deleted file mode 100644 index 3787302..0000000 --- a/mpi/all-gather/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 16 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 262144 - 16777216 -Number of iterations: 10 -262144 0.003218 seconds -524288 0.005101 seconds -1048576 0.008701 seconds -2097152 0.015526 seconds -4194304 0.030239 seconds -8388608 0.060280 seconds -16777216 0.189415 seconds diff --git a/mpi/all-gather/benchmarks/16_gpu.txt b/mpi/all-gather/benchmarks/16_gpu.txt deleted file mode 100644 index b69654b..0000000 --- a/mpi/all-gather/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 128 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 2097152 - 134217728 -Number of iterations: 10 -2097152 0.002391 seconds -4194304 0.003558 seconds -8388608 0.007162 seconds -16777216 0.014929 seconds -33554432 0.030427 seconds -67108864 0.062092 seconds -134217728 0.151508 seconds diff --git a/mpi/all-gather/benchmarks/32_gpu.txt b/mpi/all-gather/benchmarks/32_gpu.txt deleted file mode 100644 index 0e15475..0000000 --- a/mpi/all-gather/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 64 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 262144 - 67108864 -Number of iterations: 10 -262144 0.000730 seconds -524288 0.001367 seconds -1048576 0.002650 seconds -2097152 0.003740 seconds -4194304 0.007503 seconds -8388608 0.014208 seconds -16777216 0.029923 seconds -33554432 
0.061970 seconds -67108864 0.168545 seconds diff --git a/mpi/all-gather/benchmarks/64_gpu.txt b/mpi/all-gather/benchmarks/64_gpu.txt deleted file mode 100644 index ed700b9..0000000 --- a/mpi/all-gather/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 32 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 262144 - 33554432 -Number of iterations: 10 -262144 0.001561 seconds -524288 0.002915 seconds -1048576 0.004163 seconds -2097152 0.007885 seconds -4194304 0.014989 seconds -8388608 0.029413 seconds -16777216 0.063034 seconds -33554432 0.183096 seconds diff --git a/mpi/all-gather/benchmarks/8_gpu.txt b/mpi/all-gather/benchmarks/8_gpu.txt deleted file mode 100644 index de3a837..0000000 --- a/mpi/all-gather/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 256 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 2097152 - 268435456 -Number of iterations: 10 -2097152 0.000838 seconds -4194304 0.001719 seconds -8388608 0.003172 seconds -16777216 0.006797 seconds -33554432 0.013860 seconds -67108864 0.027938 seconds -134217728 0.055353 seconds -268435456 0.104310 seconds diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh new file mode 100644 index 0000000..4e8c955 --- /dev/null +++ b/mpi/all-gather/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 
--ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh new file mode 100644 index 0000000..bb2429f --- /dev/null +++ b/mpi/all-gather/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh new file mode 100644 index 0000000..e630b97 --- /dev/null +++ b/mpi/all-gather/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/64_gcd_run.sh 
b/mpi/all-gather/frontier/64_gcd_run.sh new file mode 100644 index 0000000..e7c707f --- /dev/null +++ b/mpi/all-gather/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh new file mode 100644 index 0000000..563f933 --- /dev/null +++ b/mpi/all-gather/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..35a9e26 --- /dev/null +++ 
b/mpi/all-gather/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10 + 0: Local data size: 128 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: Message size range: 2097152 - 134217728 + 0: Number of iterations: 10 + 0: 2097152 0.002249 seconds + 0: 4194304 0.003148 seconds + 0: 8388608 0.006062 seconds + 0: 16777216 0.011871 seconds + 0: 33554432 0.023485 seconds + 0: 67108864 0.046822 seconds + 0: 134217728 0.139763 seconds diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..f758360 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10 + 0: Local data size: 64 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 262144 - 67108864 + 0: Number of iterations: 10 + 0: 262144 0.000783 seconds + 0: 524288 0.001513 seconds + 0: 1048576 0.002953 seconds + 0: 2097152 0.003404 seconds + 0: 4194304 0.006485 seconds + 0: 8388608 0.012489 seconds + 0: 16777216 0.024484 seconds + 0: 33554432 0.048460 seconds + 0: 67108864 0.185884 seconds diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..7856a16 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10 +0: Local data size: 256 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 2097152 - 268435456 +0: Number of iterations: 10 +0: 2097152 0.000505 seconds +0: 4194304 0.000856 seconds +0: 8388608 0.001645 seconds 
+0: 16777216 0.003223 seconds +0: 33554432 0.006379 seconds +0: 67108864 0.012691 seconds +0: 134217728 0.025316 seconds +0: 268435456 0.053944 seconds diff --git a/mpi/all-reduce/128_gpu_run.sh b/mpi/all-reduce/128_gpu_run.sh deleted file mode 100644 index 6a5ccff..0000000 --- a/mpi/all-reduce/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/16_gpu_run.sh b/mpi/all-reduce/16_gpu_run.sh deleted file mode 100644 index 4158fe0..0000000 --- a/mpi/all-reduce/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export 
CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/32_gpu_run.sh b/mpi/all-reduce/32_gpu_run.sh deleted file mode 100644 index 8990167..0000000 --- a/mpi/all-reduce/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git 
a/mpi/all-reduce/64_gpu_run.sh b/mpi/all-reduce/64_gpu_run.sh deleted file mode 100644 index 314f852..0000000 --- a/mpi/all-reduce/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/allreduce.x b/mpi/all-reduce/allreduce.x new file mode 100755 index 0000000000000000000000000000000000000000..283e31cfd4ec10983f8a8159c5c70bb949af53dd GIT binary patch literal 25832 zcmeHP3v^t?d7itg7qX?he&E=~#Fro&@z~W$wv23S(CT5Y(iJF{j3EK9ulFG>-WU5Y zHUVQJOtJ`rLz`DiTbHIOCnrtaoRm_UgXI{E$w`PIVBEZL5+J(>KfuPO33&Vcb7yw1 z*4jc(>oCHTIo<{%`*OpPB#8ojY?^Gxv0RR$3H=$zWmEGUASZO5;#Fj~N^KoJfGQ zvo=!`>-D%vNhQ^-kb)}uDsE;a1fIyIhb;b)*n_5AM! 
z(x>Y|QJ)9J{gYG^1|^l{HHAYPmMm@x`|aUSG`-orxowGk$zn$`=4j#e=r&NBR$ae_ zTd*P)MR2}cL^|TujuV*yP8)|9%%?!;Dh+J_Cc;;h!M(sIDmPRH-&6+QS_c1O8T`-7 z;NLET?<<2JEQ22^ga3OO{C8#W56a*bxM36dZF(7eP8ob*8GKn8{L^Ldo66wcGI+2I zzOf8`XBm9B4E}H#{Oe`#C(7VYm%-n|eVNAQvd47|DaPdZGX?U5kLnu11^lmo+t?h| zrfUQj!ryUxE^F7d0!E2{68J$~Bi0lwc)k@p(+b`J=6@EBQJ#@&q)K^`GX53w51=d! zB^YCe!d??R6`HuLN@XbLRbUj-b3+%{aLkxOwAor??4+jcOugf(9L;JmhXN9J{O zzw-8x+i#lpZ@+!yF6Q-O1oL^5DLs+$M)Xh=$Y3NE6^IvXA}|wh*_BkaN4Vf!!aKrt7LX( zEFDb=q&ty_CHhl|P;`(GL%)AZ)HjrfMXBUuDv^qX1qrj>z8-H+G?e0r&R8VkP3X~$ zMJ{RR_+#l53&vqK6=V^k3dt1OkYK?(5}{OpYpj&bBdY^j;sLKO7EOf))3J1t3kWO{ z5e425_xch69qPh#Fc?lJhggsr%d3W>5&?f2J|K3jE)+ZTL?RSO7+@+C39!IsWO)OA zJ*C6D81DKpt@8AAbb4DH zEsj>!6pN>tdXvD=c#xKVET>e+o^Jv!J_tfWZV)6tF5*d0;v79NbI zmlbMwej`7=6>?z0=L?e$n~4$4BHl^-orRo8JcOICZ{Yye zsNnn|6KM6)`ZV7;U`Tvz2{_j--KTzL87`wF{67f*I>lg=o%4@ zU(8cRWYaaCAbGhIB4DLoH&UEsjk>`Reu)9`*r{ur5^kQK(_CSq zpa?LZ-*%7rlG5^7$=}16`I66yd^X6A`5MnDh53-sKpa8Um%$bB(+F7nbI&``G&M;A$wKdnxvza^+H4+3~2OT*6^wx3@p-@pMXy>ojM< zD3VtYvGcGO&}>mZPfP!FW1(t+qT8ih3i-K`&L3 zSbejZ#cUy=r$Vu4^4eOqDjeIOhc$X_^EUC;)G|-ZHv#JI=@YFonneYh8&$MyG_BX= z@!Zhq?d^5-d3#+qc~>xj46N?y^$xgKclWz*@N@z1T| z`ao$MdkKR<7&H&M4Ri$tf!&~g209AbfQR{I;z6^Zui)WmL&s^E&ls&Oz6-h=RE`D8 zovWF$*`{1HwYp{(a2kW?Ia>$&c2L$)Z6n@^>keSo66UJ6-EZlvtJ(r3u#`{r`!e!Z zgBJ2Fy*KmCUn2QXfoR0e09rapAePc(x(^cQL zs(w@%u70p;XXQf`4_oiISZ7q$x4G(_TmWJawz=wSTo5zrMLzg4D%cLbP%rB>3RSEN zPz~^x;G;pLSy#R7VGE2rWbLlkc2>aXgO%=j=lxag`nKWf-um_gFU&=?5ruWs*Kl9< z!A>m(9QoXmIdj-GE(CWqAM44yxXzvVjeGd@lYIl8os|di8gcKeb8)ibQ49?E3%`RQ z=JXnO<_-0U%DHg8`oySv$7s6oK;;y0ihHE;QAhwv&2sN-oej*rbHydc^ZD%eVP;I( zb?bhiy}YQs{6M8$Xa|M13QPAchj!+<>?2fxk;?u;*3K0_IR=XlNQ*zB;2u6_QSaSi zA)S-%7qj4P$?E3kbC+^xpL_V%?+4XbWoj&+2d%rxpchsc^x|KWP^MQk)wRb?xz#H; zb68z_hL|(R{yj&|scZ9uoFmy6pnitb&vdCXpH*j`g;?%4(q~<2;~90}G)Pt*csr zHfrXIE2+IM_2<;qpPzu#IfCm>=XmSS{Rg;nd8oV(A#jL3BvWuzT{~2})qxx|H@yY9 zv15=s?jpoRh&l$Q@g#}90&_0)#&@Ws4@ythBA8HD0UbC+Rqs*<;KtY9 zqYQ{PW?_De+J}s10ls#Oa?s6pP#zo6)4ihe}VGw!7? 
zsykZG<@4^P2UDGNCvju;1=X!T$Q(X<`0N3CJEiM_J6=l7+Wtd&y^YnXPyNQW{Uv%W z2G!^fNOa#@wqR!F4etEDu_yPN`^WGywl&l|I(DTxe;f4|^{HRmw!bJ;sg!;Da!{SR z(w!f5`)W~#yO*i|GIiCyU!&&H=aF1#B=?{G0BI8X5+HY`tHJ3S{?MX6cmfYt?~d2ihu-9gy!y~D zDY5he^$RZ}F??5plc^6qkEDAiRcI$Fl#P%Lq~`5KRl2fK!L}Q03~blZa6`Mg(ZcXvR@Ju>6z z>mY~T_;?|IBhRl}OGdV-Fw#^o^0yGmey1SmBFR5`fcg_YK+*H3c{1m+4`VjIV^rPo z01XKM8wtEqfIIm(0ZDh}SDwu8|CJh)wTsLzQfB6s2Df|XEe*5x;sIe#*D1_{$1QsS zD(%j^mYqg)=Rm{Ez3mIpG_GF<%y?K^_8v!ryE4b&kZPcN_ki_e-b7JA-a|Qi_tTR) zl4@`QcpX4VRYeq(182{tOL? zpL5;fy4AJL<-K*kXJ@z}>_!wHt5fG=;?=9X4*U1)_GEtResPT3rYba!cp7GRVc41J z$z0Wt`I+lqVW%s*6=tc_>3q;Q;m#b-eqKN`0A<&q4j*M>nNWBE4)SEUjjILp6F}Ly zAH&8O(bflwHcknMx?uJ&rhgw*kM9bFl|>u>D4JElUz`N=#DDhqO3QvRcU3J~Q zy>@g$bjv+TR)=`*7awAx;5)eYC-k&ljJ&6QgW8WR8;`wC*uwGHj|iJS9(w^8^?z92 z3;(ce1S!mQXQ8%x{{hyGBEC+l>wup_r!wL6BiGnU;PlEh;eSE61NaN$arm5#%>w?! zc>F`)c){gv&*$?rFLw{W?abW$wmXwP>dve=@?1A5vgf+rH^^y&oFw8NzWWGM?|TW! z+`kY%(tX4|f&~Gs35MVAOofKu?^So~Le}uzr&;<6A@I~RAW#1iO{BT~{b(=lTHSNq zr^yY&-5)4pzTvx1vh=M|{u3mBLdf4}$hSY&eX=0GX!!2;S=uS(x&7*ChW&+x{0uaq zAYZj>tYR0IBJSbtBQ?W!e_*-CCUmiGQFknYFSx}kdA-%AQ9j%I0nls~-s6?C5vc72 zme2ka;I3WI@8_$KVl-}f1%{?)J(*W&hUVG1;5%3yuW|3R6YY0pKJ3Yy@mx;suW|4D z(3A zxkH?vPCT5!a-*EDB_94~xjme>5RY!oa=SVIdk1(t>nyj6^Y0Rm0g&akasDmh@q%Ev zIOl&ud=2sIIsY@_=@Ue*kMl1Pj~N}ywR8T5#M7tKoRjlEAf7&%<}}VfMLgD!ENA2V zcZshjo^k$L#H+-g#Q2b3@EGy*nILzR^M6h}^|st0&Obst?kC#M`3H!nPoKFxoWGZN z8uD_xIe#bd^rS&iM_*(G#e z97N$g>In#fF#7G$%xlyW5M;i+gkMp@uPWhhF5x33{N@s#Miz5>%#DR=khTQhCtjs-99@&wZ&dJjGC5!H z6PKEdCOt6efk_WcdSKE6lOCA#z@!HzJuvBkNe@hVVA2DV9+>pNqz5KFFzJCw4@`Ps z(gTwonDoG;2PQo*u?K3z?`-Yj_oz`p<$m>2ty#?)hZYLDM9`Ij-Y95L&`pBgFX&eU z{idK#2zo%!!-Bpk=ovw)T8wgM3A#|wC4#OL^hQC0f^HJ@enGz?=r;v@LeK+(9v1XX zLC*+Uwa6&H6I;qOzPV1@gv}Dz&7c{Z0yV9%RS&ebINO%`moCzsdQ0=-4Oa&i1%v+9 zpsp`zb1rr+Zf#o*ez&x+ zZp8AJOLHz`%X$8Ng?#yYCjCJG1^Ig>{Q<;8!KlwSk!9GI{q2#7gn=35twR$G`8P$s zl)vwuEa6MdxB!UuE0ibOFMsE!zZCN0`e)=S+bj$7D25=6=;mLk5uI$E!RVk=%nCAP}4iWXw%j}0vR(AK@xCDLoa`yeqxD&m>AYTK3__NI^60mLtJhJP 
zom6EC>Q^{R_4W|^GFK(`Yq)H5$CMkdL;O0fY1@U#1z;%9ZuaRRrmQ8$RfC!FmqKEu zvc_^Zo2d*~wj%zFTyUXs z*&KpvD|xX~%r#q%zlWH5A-fEu&cUvmW7PSwIaGT4?5}nJuXiXD;ABv(EY!z>CfkaJ ziJdkDE}33Ug42;xV>p9yW@PSFkcUlF58R7=f3v^T0smYGkWEvXAwG+0vV2ao-l)xl zp1KjJh3~LZlt0whG6jJIvDGcPCFu{ulVXQ7!=`a;@ptqaf68-#MUEWSrOIim?aJ%4 zYd(F+C5fcfc6$JS4yq`Z!i<&C9~zP6x`Ji!EQB5Q4j;0x!3&p#(J=Ut74Riep5_hRTdnfps))pa>TX%lqr@9VDwj$R(6vpVO&L;(#eE4fo0I=lY7CF>14pz z2;OvcQz+^Sr~LuO0^y*&SZ{3h_XmQGA?XeIkXrXM#X9|?vINB~pf9sV36Q?s^FE^dF5W-0S z)C4b13(&XllK{LuH{f6ZKQ<_$(SGI|(i4o!Fn-bj-nd$4bF=Xp)$EZ}IvQ9p7>EWE zAs<*h;Tu|E99g5;2l@XSS~NQhuD}+0=-GpyZA;-e9B&f<8!`54+QWh9U}|WE6Hq#O zM=0vIr}Pcjt`1&LisKn@N&@!fkIzIW#4!SxXe5X1*ddw<_-N-OH=Gy@*s(p+j;)xG zil)d`0!Klhaf7I8Y6!fcE)-)4)D-6vpyI(eZ7-!7P<^;foK=uYY+2E~TYnU2HFJGP*&*2ygG@16b zj(^y3xC4LG^u(ZV;#}&8)I+=})-)801e){>=}_3;l!!-+M8}#QPgm0*4j%B?TO5m; zo6rZDID-OaH_0);>y0M@C7ts8`f$gf$Z`GG@D^`TI+_d(Mgz2K*YMwDr~P=HcH&;i zdviCuJ^tPp4#z-R4D=_n6eJ=?WjQ*@(O8a5QchAsyQnYq#>D(X>b)m;sV6B-wUb3Dcn~nOskHGa7;%g#6JaG zVN+Rrnub&{@EJO$zH&`*t;iy(;N+opUE>JVgTnQ=2w$&jNEPq*%}8UuQ}{bnsvm{* zxGbQt^ZK7(&mTfyI__8LD(GhdmupbF zF!XDI%Xybv9K9p(JqtB1$@lLgbq1HlW}#ykIL}1%3Dk2MPH&V8Rq4-#z^6+g6Kw-- zo4`&taLptqlM&$b43vi9GWa89@NWU1s6Mn(o(SJp27kT`{?jt}8)fjbWpG+#(|z1? 
zsm7apA9xB)0;jEk6!;%+gL*j5p>a`v8T@l)a9UwbinI&U^bp8F~5;GLd~+ zc29IZ++UWF|3(@7pUU9Rmcd^sgVQIEiR$@Q8T_3x_?a?zC7$1j?AMpU=a#`6%HWp+ zpXgjYC&!EL1=(4a3;dw2Nr+Pyw_nHPewB9gR|>R~Y@+(FD}yr!W|VKQt@yKsTe<_%@G(9=i?3e zAt^cLNCh@SiqGmDd=2N|YQ-rtex-I?vAm?_xy=Iz%V*@svU3!l7gZgcZvMoHH2Zi=l3}cjanVZ@1Ue z(?8&)GYI`LV^wP`Ah7zv+Qp9p?!4aEhTE`+bTlK$oy=v$Vi`n8%5@kUcK}c{=8Z)3 zILeHhmt3d|7HU8p6T8~WYl;GCb>uBB&W-uC3FZ~zhbq#YlPCDC^4!qj^3bt7{oMoJ z0au5o8%-R)kQYX{MR_cv*+$73jYi8wtNqEC7f0{;u`>K$et4tzv(1aC>c;)-f{*%| zTj=b<4b;0+%}c@zBCS3gAQg)MY+m9`rXziu`n`Rjey3NQ`Y6c%m%|^uUeRs6!B_%) zBTUa*>4}gNEVRc#^dZ9Q9sitlU*Cmitz4hy8eKPE@9OR8G{)jTCZ^$>F1)6AkmqE} z^RwlCGm*_@Wtu2$Ag#tF^&hlo;ELxr8ytstB*M$|Ly_;WHrXik|8}7-Zg=5#yrW`r^Dh5Ysc1lc z+mOf?O7!J-o0m01g8H{Pl-7SIGO7KteEFT{ZOzc4=gS;Q_5T!^W_{MI8RC~1#3mhU zu;=-o*3ERQHK7vV5&muPyE{%+BJ@i-OgBk$uu=#u}WzC54xp!i9z5qahy>n*V#BVpDrtc%+X z30aPmFV+8PiN5^qU=#XMPf{w&9LB&9HY~tWeObQj#--)cd3omgYwI;`NozL@&ZbOL zDAlK*%?OkJYu7a{-R9zi&b=NNv~e7MTZ29o;Vi+W{$G{gBuhck`9j}7i}6vHNJ~M% zN`FX6M>+itLt>lgKT^KL99vdKe|@*1L1!?ULkY{=W%N(3G_>Ttcxj_lpS~LrR_ec> zxDAbgk^w{NOX?N+(!Qk5UL&(3gFW0XPcR{BMF0Q* literal 0 HcmV?d00001 diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh new file mode 100644 index 0000000..81ffbc4 --- /dev/null +++ b/mpi/all-reduce/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd 
+eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..a9b69c1 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10 +0: Local data size: 1024 +0: Global data size: 1024 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 1073741824 +0: Number of iterations: 10 +0: 16777216 0.049728 seconds +0: 33554432 0.099497 seconds +0: 67108864 0.202129 seconds +0: 134217728 0.500335 seconds +0: 268435456 1.560791 seconds +0: 536870912 3.265382 seconds +0: 1073741824 6.500534 seconds diff --git a/mpi/reduce-scatter/128_gpu_run.sh b/mpi/reduce-scatter/128_gpu_run.sh deleted file mode 100644 index e0a9db1..0000000 --- a/mpi/reduce-scatter/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/16_gpu_run.sh b/mpi/reduce-scatter/16_gpu_run.sh deleted file mode 100644 index be576de..0000000 --- a/mpi/reduce-scatter/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/32_gpu_run.sh b/mpi/reduce-scatter/32_gpu_run.sh deleted file mode 100644 index 04a7f0a..0000000 --- a/mpi/reduce-scatter/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 
-export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/64_gpu_run.sh b/mpi/reduce-scatter/64_gpu_run.sh deleted file mode 100644 index 48c7645..0000000 --- a/mpi/reduce-scatter/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/64_gpu.txt" - -echo $run_cmd 
-eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/8_gpu_run.sh b/mpi/reduce-scatter/8_gpu_run.sh deleted file mode 100644 index 5f8f10e..0000000 --- a/mpi/reduce-scatter/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh new file mode 100644 index 0000000..9d4191c --- /dev/null +++ b/mpi/reduce-scatter/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 
+MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..493d5ee --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10 +0: Local data size: 2048 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 2147483648 +0: Number of iterations: 10 +0: 16777216 5.130130 seconds +0: 33554432 5.120491 seconds +0: 67108864 5.115654 seconds +0: 134217728 5.128319 seconds +0: 268435456 5.111989 seconds +0: 536870912 5.115996 seconds +0: 1073741824 5.127237 seconds +0: 2147483648 5.116940 seconds diff --git a/mpi/reduce-scatter/reduce_scatter.x b/mpi/reduce-scatter/reduce_scatter.x new file mode 100755 index 0000000000000000000000000000000000000000..d50ad5ac990357f4067a380d5a59a5e6b24a3805 GIT binary patch literal 25888 zcmeHP3v^V~x!z|c2{Ay<@DRaAIcmVDWbz=vfPk4~0uwut2tn{U4D(1v=EXb^yvoBE z>omkt+huL_TCUexdT-Y%z3Wyjz2zZ_=xQxmAGBJv*2l!~QV^&jbH9I|y(cF_rh4yP z>#nYQv*GN2|KI-qfA9UDefHV=O!i*a?3`y(6efd-&1J;Ry;$Q=1CJToe4|K!G_ZPB zf_N$`V@2Q#IHu<}FpVc9tupDjq{TYR%%G$f75Pz<#&aY+pL2{!Dr-oT^ag?^k~*y# zr?ZkWi_Vi?AufqmU99m8NuxqeQllQVar4DooNaE>^|JH=9WVcyPNg0-i}aondQS;G zNjH-Kf~3+f3*S$8seSp%dq?Ok8^zW6y`of`(2I`I@#q+hCnRmxVJ4~c;kD4Cdj3}f z>C?4Q)Mul(f0Am#pro?A%3xsmjM~bew;~t_Cs$XjuAfmcqt+UaSZlaFx(#%%=3ldz zTQDOQMQ}b}L^@-K$fWL|4S`<-jIP{J4`3*KejeNfe5i5*dGJ+v@U?mHd-LEw$b34}NtXd}$thMIJns2Vav1 zUz-QNKM(#;9{k67@So?wU(AEQmIwb7_iGG0mu+!tNHHeIrIC;)e5YF@IEz04+`=ZY z2De6VHvF37=bnK-FYvu?Ei0=hOnB}UJnIVH4aR>aj!~YTtEXhFYw39;IUr5rh{E1N 
zc~p`FzohC2KcbFXp55VNWQkwS1n#$bkFG}HiOM~wlp@mYOFQZI#wBpCMv1kl~G@%$K$K4o?)%BR`DcWd@gzs z2A0pNjNex2hx-$#S!F!x_N=Jzg;&KZm%HP>tci+v&=;?+tGL}4o=yfTs_VEduR9j+ zbcZWqxW{Nu#dK?}wW=a9!xyZnTOGTt!{1SN>x{aXH`W=8PhWX!v^H{UJRa4{&0oC= zPtYCis2r%INLD1nE5ec6!s6}S5l+s^*6_?meswG4z?jDqBp+52Bbr5g74f%cb3)Mo z?+YB@o+qRu5p;FN-C21rH^sA8M?=0)He+Sd?G3pTomn6h@h0I~ZjMUw2eL`JbyAks z%Ad_jW>cPcGO9Be64=st8WBGC6T=GUml<%w@WMI`IL&orh#GJ?ccQ%22At-zGRX0o zFdCa=kmEJsQzQslzf9xY2*k#68w|LyT!#TSmfL2)jq}K*2As-}VVMD!?-t7IG~gJ7 zvmt80OR^~ZV8BNj@bv~!1o*Qa}4-927ICcPaE(_2K zCk^;{5+v%@C}woe<=Tz-Vz)+w<6_Cgd5c@)l;q_Sihzl3jR=>Y%u{-#xiy|3c|FBh zR_@k0!Y|Yz9#^?FP6;>8>+22p#au#sr8d^3`oZh}xo71_V$<^9wYjd^Qmb&IJfwV7dX?3+b z7B(+(EO6R^H!ZYX4-tsBwYwI~n>PnHP&v7;-vwmH5H<}8{!P^fTT(CG z;z+&g=z8x+Tf4Kj@FZR@j^5ELIhpe?28PVphcNUUTkJ@^uRc-u0=j|v#7;-|&Sd$X z!e4_)30p0dAHC$FrYnqPvM`0_TxfZg{7L= z&`v#RoG0q;tgaQX0G^KBD^hzKc1u%hC1c zlYTW)nCQ=BKu1s0>DdK3z2MD{GMVa^)$%DG0Cu(ff;Ry8?P_F74M{YshhO$1Sk=kY zI~emj)U|Ilt4nqsb*h)`@vG7PLj)aCmmECmP%q=mYwD8Y#2iQVXB;`LF3Avbnq*&u z`f*Y}Zdb=YtByYbvA#2{?8)lb)oA*V--%M7d5|;@+0_Yqp#7;$9sdQ=Cv0l@akc#z zNLp<_;8$1Cg$5r!40;UoL^HDdHZ}ID1wyC&YV36jHFM78)LxtV2DSB#!;m^n@aSWG zy!EI53hs0UDkmWX4$&863a%R60M%x-y$_nJK7w5TLC77l5n>}m?FUnSghXG3Ih%Uj z$5hf65Pka)$sb1caSq!FCe)Ti+mBM!?P@#R`1Yri0nzd_%=c6Kknt?Qw+~WI8tr-C zg6IQ8D-Io1*G*gE=z6*sd9S=l(JK`FhN5R2Ghb4>>rQ7fj+uKCO>`%5V|Mw~wNIyB zJMr3yJ@j@;j`nxIoS3lT1$w>pm#Ev`wQP8qUWjbGr%?CO7Nx3Pb8 z&0=c<)jRtyS0}Hh{-SRCvt`3eLX}F{wW|-Rqvkm>I~|@9)Zxxq>XW19@A@-p9)2Fl zd3ti!QFSSox25*lPVBRt*vTvQGk^ERCV*37r7sNbLqRQ9|!=sXc%+ z3GD)8>-xf^Zv2#v z{wFF0sq=TB8us*~g00ore+Fxx87!++yWbFc_Vm{P_SnnL|Ner2*rvx81 z)X5mo|IdvVpTCdxojG1SLvS;V5c$Uo3taOR$BQ4q-2dKqf%dGM*8VpKj9LsB{Simc z=&xVF*p7whSKDbhv5j8i>2okSc#2Ye`w*k^O@&ZP&pF%IK@MH=bPry;JSpjX{CHassyq za1Vh!T+&5fVZ2`C%5&X z#{IJfWBr6p9gO{+u(5-&SAn5NkAmeLaHsB1Op`~Vo;wa^i}*gN-U9q6x{v{<-`o1< z0jHO%0e^^aEAZWeaWl&3p8)({2jld+7v7J3D>Io4P3aw72dYwc9&n_R?>JJ6_dnNs z3I^G8%_ntoOeaT(IJ)lK&(ynLMzZfo;(MC+J9@B|pyfo@$)-f0>tw6iy$M-ecOGNO z%Y?wTF+jH0z=5AeUC7_4%QrmNd?YJB 
zz3a}CELkPxx&51j{N=j*I5Z(EU$m*eU=vm`j;`kY#a(xvGOe=+T`YOj?&HPpVZF+t;tzQN$$H35Zt|j$4&FGxHQ?_Fzz1Y!P zL3EKV^+ikSxbsr-d9h>H7v}UL*mQIsOlZ|_$n}Xc^%;hAd+M+)GyY9S*KWlz^H0f- z99_>8-*~eez%XIWu4i(~Kf0T46zumDDp8x)n=kFpOy`1%J=KSr%ZgM9zz`KThIAhiN`yF^+h?qoOryDS>H0w-#|QluIOvy{6gX}3uJu_oWGiQ`UKlo z#ra0!>CES#?)zLa>z`OAo>Ppo}MFeYTCTtvKu_;)yeF7edG`u1^t zEb+LXXg}vmh#yD%R?eGiLqCZ6UleGQy{f%x-?uj2gE#M38~K8^F+h`)e%3+Mlhczjtv`#JwF z#7`#v2nIadf8sUb-vPhkVoEah6G88&)Oq<$dNlO}1nM9Z==V+vlCP(pVc?r{_{BNA zD~Dg1!*}QKn{xO^a`>O+@H=w&7jpPFa`^Xi_`^9o{hmvKZm=;>ho;arQsb05G=(iX z;osjVNjI7TbpQ${g}iVpn<&Z?glWq(E=L`Vf-Hv~C<}ZN0i@72LgSPk zAPT31;ZILz_-8Xst2ADt2Jt(P z+^?Qn%?CvOjd=+x=PS{1bslz#|3>t z&^>~_Cg=x(9v8G|x+q`Jse;ZBbe^Es3F;Sgm7w2{#GXtUg3+) z(zrRN(=Lp#`Ato;v~p~Qzf`NPvQD>FX*E?f)ljY1%DHSy*kip^h%H*Ys9A%hz^{@8C+{Lk0Ss(8QV%%U=oyOU>r_cW3kE?}uZBnEd^a{s3X9 zpx5VdWa;*0f4hGuVPJZBx1b5SykGPq`FmIw?66_yp$5JNxq zoA}8K#mWW(5T*(Xdb$6O(hHa$c(KxecohBZ3{@-FJ%2%_skqRLEc4BXO#AIX3Tin# zg8quQP%jxmqbT1}a4tyE3JB0Of)y991U>5>L?vIRh^Ht*)g^=<6|PF`gSaeo$CT@?Li`@C zF^VW_3K$BHKzeL|DND$4BfyNiKuC;N7Mt#5{`IMvxpl2+flKB9;J*?EJb6Kv5L9$ywWM9$t40w>R>mlRM^d&0e}rJF zMdqxLSB`8Mchw{l{(8V%#zu~uJJO8qUoe%mjN>ZIQna{ez@pIz%#PrOgkC93Ovu{I z8YvtwTcjv0;}jSkaUqNst0<3(IqPiYl1T)Y6!Kz68f!Kie}6FQY<3CAXe+yFl3wRa zCQ<1P6TjC8yws`;fs;YCFk2rJnrs;`OzfDEaLL#aBsdm1#kw;nXI$zo1$o%)^v%1l zPjBM48{wa`0n!bI_ynrS>`CELy*5L7>N=n%z7tAOJ})g{3IYjY6Igsr-0O?R#ZG62 z&EMFl zfF1N!53;a%3zvxx%U@anPb?7_C|{1W=3Pu_iOrg3!a)TJJIf?TT=Va7a6thu`cp+S zTZ;IMic}`!F>wM*hsPuLbjOo%pT42H@`}np*b_{8eT?~n{)&NmV>7?k=eKrBZ^(z# zyqhWJv3t$ogn6vSvKFM5Snb~*Z+FBh0NeF1y2=`4i+cJxydT-f!G!vknXzpG^dHu!t7!4Qr1GEb*F#<&dQ zCmG;Pt2I?u>#t9(B9usmeRDc|VP7oZ0qc%=I_Ky|)@T(S{QrU)tpWz;U@JWID*T}J zGvPQKZ{q_S()U+Z1byL-MCY6;K*{iJfv~qC;a-ky=iuFOar^>KLBRg{!I|iUI1T_4 zjCf}Sc68Fg2QYKn6PP;q~h zwu4d)s6N~#&KyX@*37A%F|E3m?886>wtR*oV8Rh-Qbn?s0JgO>ALIQKtL%(~e3kCy z$w1Is8H@_JF?BTOsBjQZFfZsVAvLBpL)&*kYNCW}%y9GAW<}c2rsXL&0RpnYP*+L}pA- zh20{PMqBM)!Pd3XRw8Q)504&4w zh9&_8?BncxmBO^@7z)R9WJvrX;0hbX_Kwkz8VG!bj;XI)Q_K}vL=~JY)Zo@QLiM0< 
z4KBi$xizHpnI+Ndabdob{ku=9ABAPOOt3HiPIw`K2)E)QJDc4aQk2%iYMvg;wlrxR zkam0!P)rk;{2n0jb&wj$58uv%KLlK1&l;#eM|~1 z%QU10LJ>94G%5Rcx-zE=c9?0vZxVk%;L;Bg{{zRzv)u0=*TcTaRGR&}St;-0xSIX@ zS&9Fe*Z<6V{tg0Taldj`L2nCOu0ip~;5>XFa5?Xii=*QLXP0PPj_T_Hgn}Cj? z<2)15`Kad@oZcuGs?wjA10O4e40H}~%Mf_wwLB13pxJ zXe-@N_?|p?zVpxa=aK&?4}K~SJ_f5jx{s_(<4wLBJcU|;(;hww{EwwUujM$021QHr z;LGyhfjszadGK!FL!IkJ=Qj@J&+q2J>C4AZ=fKg&fuZdDBo9vCU51i>ArJn?Ja{?} z{&^mp_9YHg?pb;8(meP@d2rhPG?e`-^5C{S_&nf4ovYW%@qzb+>_pcKe6L%R5U00r z`=goMH`0K9Od-y3J#CEtArH>1m}&ZAQDzNe5n{D9wqOC%!K~gj;rN;mvtrp}6(2wc zF|L@-c47geODBL}8t*1mk%|TrNJaP=Y*t^VI2W(e3rWc_Yr?l0Qhc^==fGjmSeIA-2?7pX?{ImM@R_R>>S%9m+J;WHLEeozC&I!n;^0 zf_Bo$d_1L7xuD2d8DS-}9Ony$`68*=;hMM5*4pfHwk&FQ(byoq1$VIN}P0+))%9HLk}{A*|wnT8H+xi&qz= z($dK_uyWTY-v*f1il3-Rw@)6^H{ZFS(dMM%brvKZzra5;7HMD>gygGikRr$I$R z0IO%X;>l3kszt80z@jRbIPy`D|L-S1x?G|=yZn(D`b>}>#@yo|hgh8tg7^m~*Wib+ z`5HSO#C*>J*RWrIjjgq%$v7U_ahB6sQcUr(7rC%PtR-5o669k@|8!+Py`3 z{z9GLgQp&(oTT)76YX7-yxhO`rqFK?da{kV^`{>ajvMr|`}0Tvp-Ly&@@4%6%zsIv zy2Xg+N`xD#9sR*=gq9EHZ%a{IJf;`f1Bq0R3zwxBd zH|}r5NBcAU?*>M7s=-A;t{W_>&eJb*#vs2bNGyN^=}R$r9=SZHyj=KQo{&$dR)L&n%aq;y+Uz3g%Msn-_AaH7;>>rz(G%hL6m(TT=D_7|NS=s!|_D>A4~QeW<8-TD>vcR-hXC-voeT%N1HS+rj~PTBfM zKkS7r`A_Q0^I4Zp1&C0tlWZ*OErH)6VbssAryFz$S&o#?)&G5tzWiQc5&BY3QYyvJmi*N3}s{ewPhNY(;7Ho8f6+nu0H+TMws+pgInX$4K_~bJnTV1 z8^qx^Ip|XnRXMoSeubL7!KGEdikJ4YdhW$i*g*MGlr==$wB1BTR>l)g*Jw63AYRrF#( zSkL2uDA#|XJo;OO{@fhnIV@WZAjieLup{(=6QN!_xKofm50~tJ^8K|Q`m*nlybR^y blQ5m1Zwxss`(_^fZS-P9csIv*uKs@lq3fIJ literal 0 HcmV?d00001 diff --git a/nccl/Makefile b/nccl/Makefile deleted file mode 100644 index 5652112..0000000 --- a/nccl/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: MIT - -CC = cc - -# perlmutter flags -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - -# frontier flags -# INC = -I${ROCM_PATH}/include -# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL -# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl - -all: allgather.x allreduce.x reduce_scatter.x - -allgather.x: ../allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu - -allreduce.x: ../allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu - -reduce_scatter.x: ../reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu - -clean: - rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh deleted file mode 100644 index e9fc3ae..0000000 --- a/nccl/all-gather/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) - 
-SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh deleted file mode 100644 index a94a523..0000000 --- a/nccl/all-gather/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh deleted file mode 100644 index f1ecd9f..0000000 --- a/nccl/all-gather/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - 
-export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 64)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh deleted file mode 100644 index 357da9e..0000000 --- a/nccl/all-gather/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 
--cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh deleted file mode 100644 index 4bd249d..0000000 --- a/nccl/all-gather/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt deleted file mode 100644 index c84792c..0000000 --- a/nccl/all-gather/benchmarks/128_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 32 -Global data size: 4096 -Number of GPUs: 128 -Message size range: 262144 - 33554432 -Number of iterations: 10 -262144 0.002247 seconds -524288 0.002277 seconds -1048576 0.002775 seconds -2097152 0.004497 seconds -4194304 0.007477 seconds -8388608 0.015057 seconds -16777216 0.028550 seconds -33554432 0.056270 seconds diff 
--git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt deleted file mode 100644 index 73e83d9..0000000 --- a/nccl/all-gather/benchmarks/16_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 256 -Global data size: 4096 -Number of GPUs: 16 -Message size range: 2097152 - 268435456 -Number of iterations: 10 -2097152 0.000532 seconds -4194304 0.000982 seconds -8388608 0.001976 seconds -16777216 0.003447 seconds -33554432 0.006826 seconds -67108864 0.013190 seconds -134217728 0.026196 seconds -268435456 0.052567 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt deleted file mode 100644 index 72f0d07..0000000 --- a/nccl/all-gather/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 64 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 262144 - 67108864 -Number of iterations: 10 -262144 0.000622 seconds -524288 0.000577 seconds -1048576 0.000780 seconds -2097152 0.001190 seconds -4194304 0.002041 seconds -8388608 0.003571 seconds -16777216 0.006995 seconds -33554432 0.013830 seconds -67108864 0.027698 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt deleted file mode 100644 index db7919c..0000000 --- a/nccl/all-gather/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 32 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 262144 - 33554432 -Number of iterations: 10 -262144 0.001077 seconds -524288 0.001154 seconds -1048576 0.001399 seconds -2097152 0.002078 seconds -4194304 0.003777 seconds -8388608 0.007711 seconds -16777216 0.014418 seconds -33554432 0.028471 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt deleted file mode 100644 index 1c654f3..0000000 --- a/nccl/all-gather/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 256 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 2097152 - 
268435456 -Number of iterations: 10 -2097152 0.000286 seconds -4194304 0.000523 seconds -8388608 0.000954 seconds -16777216 0.001696 seconds -33554432 0.003150 seconds -67108864 0.006500 seconds -134217728 0.012278 seconds -268435456 0.024449 seconds diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh deleted file mode 100644 index 0e1358b..0000000 --- a/nccl/all-reduce/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 4096)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh deleted file mode 100644 index 6553e02..0000000 --- a/nccl/all-reduce/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES 
-GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 4096)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh deleted file mode 100644 index b672e7c..0000000 --- a/nccl/all-reduce/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh deleted file mode 100644 index fc0416c..0000000 --- a/nccl/all-reduce/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh deleted file mode 100644 index d9c0ef6..0000000 --- a/nccl/all-reduce/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 
-export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh deleted file mode 100644 index fa2199a..0000000 --- a/nccl/reduce-scatter/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 4096)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git 
a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh deleted file mode 100644 index 2edffa6..0000000 --- a/nccl/reduce-scatter/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 4096)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh deleted file mode 100644 index 3d297ff..0000000 --- a/nccl/reduce-scatter/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn 
-export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh deleted file mode 100644 index 6bbf97a..0000000 --- a/nccl/reduce-scatter/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh deleted file 
mode 100644 index 21c0dc4..0000000 --- a/nccl/reduce-scatter/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/rccl/all-gather/allgather.x b/rccl/all-gather/allgather.x new file mode 100755 index 0000000000000000000000000000000000000000..fc85917cfaeee3d0d9962cb061dab34a2359729d GIT binary patch literal 25736 zcmeHP3v`s#oxk5q5@I0vCcFe6^#cP$C6fms2IP^+119bSBP2pq947NfMkh1FJP_RS zGGIFel-eHGcHPRlZp(4)S?addQVT>B(Czl%qsUfQsTMmSf)o_ku9E%z?|1Lyn<3M( z-E;Pw?a77j{_p>H|NsAe-|yaUa_{;k#{!e0FzHNe9wTngWQ{|0+-EHRE)f8!W3{XR z{!~`N^1$bCOpmW)8V^WXVbXC)^L3V)K}jzx;=?A5$4Gh&=NOYz=8!1q4LVH(b-Hu3 z&PvKGI!}7Jcy@{SU7~!F)(JUDje1nZ)01^Or|UHykhJbgI(}48sYk^kz5MI-`1}Tq z2PEAg$}Op^m&M0p9x7k9^o|KVOQCL8)=R6*gKNw$Kwz{^atftx;4Oy$WJ!%Fj)51mV+=3at zNP_d(Jkk+On_oR)^@%w!OHGF>fzq40%Sa3}Dg()DM-S7pK1Wx>Ce1^-SK{Mjsc zZx;OZEckD;;AgVnW;DQ1^-z!nACm>2lm)NJg3rr>JF?(QvfwMS;P+<1S7yOiXTjHI 
z!4p~VZ)U-FWx-#_f|mm?W>>MDE)6Y*`H1_BH3@v5OXG;d9|vw>6Iq48c_h3iIerzZ z%fSCh;D<7B8np`kk#A1>N2rX+4C(UZ%YV8A%?S$IhqN^0MV})S_V0oR#3Ix3REBh3 z2SzS_mH!IL!-s+=LgSj~Rd_J2Hi9W2tn}Kg*VLZba!q~HuRcDya@jRM`|#1PFsBnU zmD?GOxgs%Vx62;{(%Bsf3d9LE;_dWDW8R3<7jboao&KON1k}CS<@EW3u7LjmFIbOv zD6uYoxW(J;4zFPdE%C-0y{r6gFEI-uUQSzFfk4Pj$U+(25Q+z50%?jwLXjo0h(Fj# zh_1h6P0-yH2?Z(TpxYgQW_!@TGVX2mNZzPNBDO$ap)1zqjc|d6Pvgu3f{+} zu}CZwU@fi9PQrMA1e_6yC75$jN-Y}mgyJ#g3nN1@AM4gL57ZlpFyH+Vf6U7@7RZVr z^?J7}8e>Am=L^K6UCc+t=aH^R#E)!IcxlYv?PcE8$b{4Dam8Gy*N$jZZwf#`4`nv! z?hflIhhq$l=j!&LX7rcKOsXVjHC{5*s`R?4VGAA2^$pG{Yn62-D-VTZ1NMfnLFSLc9sujQUuH5!TW-t`-9>=+Zl|{PUY~@LSDWVs=x?3 z4%J~LF``(+R}lX|DyBQ^cT$~lfY&@Bow0zkE9y$gd$=hcy*k|O?M_9kjJrJD=$k2^ zJLHL@YPmT|$>&c6DH$otbLC4##Zw`7G#=I&#&$Mo82S1dO-vD--(|o}9MQcO3^=Vx zq}yk}b0i3Q$bi$jQ#v^h6J}gz$a$FX>!mPg?Fx-^Bj6j;tux@pbnOP*m~N{9H?H@V z8E{HNx)laoz8fg6%Yb8oO}VfEFG!(`tv28z4fqBFKFWZ9&47y~E|1-Az>5s>j~j5j z08(y;0jKwcbUO{WTq;xCE(2~{$Gl*`X)P(;J_9~hf}n>C_&5W8*np2W;71Mkl?MEn z0iR&NlLmaE0Y7cP>HQ|%1p|Jy1c|yd@)@kb7yFxXBxgq^512l;n)hU3wB+MI5G(AgdBbW+ZVb*(nCc;QD}F@Go+ zomapX20|UKfJSd_UQk}d0_F(0hd}Mkt)fVJSt(&-p$0AMMQgD+9Je(%TUu=pN9m)T!X)T%bI#m?^6*%GIMr%nVuGQE%}qtx zwj3DVnrknrcqq?aRNFJ6rKm2#6SPr|M1u82`MmDP{*9Pm zk3I2$z2~>5TiYDla+7#%*tZqBIGH;gWf&N@2d~;s+wFhrm`qN>&B_t`h^i}i9A#ijt~(XAZSMOgVeuJh@fYOmJr_;tgKJErbK3rT61*ikqVmp9eG@A=II zpBl=I^$!ey7EagcxE!5c`|gQ>fyy`3(kX5LjcVz&?*Q;Ms-Y=WB+;aP@`ex2s$NIE zf-#?6UH4v-x^&-JhkD}ypBnBzMbIgA>B+Np^+wJdQJ0=0<{YB`z>$mU(g8v)lI$;` zevZ`7HL7D@QOBN#Sl^{q4y1G%)o}8Z&w*5+d6G0wHLBweK>IVBI`#{M&)d|}b86eC zAW60DW1qT;9u)YKPe4BfJ>P^VpG}SY$^xN_J~i^Tg^D@%CMvH@eTT~W&L@z%NO0k& zeZ2G+{}bHB0jOMn5IA^WkSSDEVI5SP)V4lot~vp^{*#b9Wh2B!h}sXP^fZb73g&F; z9lxiPzJTcar%3)2M4#huBf*5)QfS**%6g;PhH8BOGm3y{X%go9seFid1>pN9DJF^X z9Jj#x2;P0C&Z_IDFSYl)l#jT#-X-rX@_s?y%l28XtD9$D92l_AIuvW5mPEt!`qXtV zC61gwa{d6lo#KVQ&2PlUZ+wkjZ~X=8?hh;*-=No`PYu3AqP-_9zQowu>;t|1FYGt= zk9ONq+a0!cTiX)*#*gh7jP}IY#ET^_qU+cbKPuS=+B1-=ZvH;Tm3_-!=U1NFP0PLH zx3IWtPP`1rBB6~OItXZjgn}G807#S2U4U#opPSUJN@Bn} 
z_#DvOd$J@>bHF#!=Ya6<(5KP2OLaPq>Gay8ABj1@34rE+w*jDkhNf6ag3keex#OS8 zI>h)t&P)Es$3Ijrcl^WL|K0dUdH&P{P>l(oKV;uh_+K}pga1y|*dAK;@21ytvJ)?U z`XI>qW>F8?GWQ6C>|4g{Dbc&&v{d{=9$)x88EGXWuU{Hz$*j{~xJK@>Ls+@5ZDD3wla*9kjRnh))VzzQu>MYS~m$ixkx2#*#2G zrGi;O%ml%-5|b~OIv;VAECO3gjv{bY4SlO5sRy1YxxfSK?=Gpp)ZSL2`P5=Z38wnu zoD$084n!yGMR{nO?{GiMXQouS%J;sB)@e+9gsJ7H5Zt=~tRry>wE^V&gzQAyCf)=0 zZE%$X$)`YCW_`A9nIkcf{6{c$QHjZA*bGtGf5;pE#S*IVWP<>I0x(%aCidAkzD#d` zyKHycmfP;JIhXJEZEsPq;5yYj>*V^KG;_V4w9%xmw~ieZNwyrAIjKmF#0Nj6Zv;)1 zuO{1s`6pm7c_o<3QMyhjb&}EtSQ1{2(h0~>>8eQS2bZbzW1;i~&-Q1@_+{#?6?(5g zFIfksc_U10U@y=VIdiRb-*Fb04r6ei8eYFt60`-Tm zws#K=dK2Pzzl|LCj|cwkbi5b1J<*GfMdjOzT4l+H5Q5}^Xye~e$GkyqN)R62+in>#e`aycsDT8O1R zwhgp%=(*4k^Y>h6Q8!Zode(l*;x`I`-Niumyo4H|9Y7dGz%HW@wyD;7nm$+h-92kh zv-ol;e~7S}Gf)@Y;!Ql?>K>%e_I?gD`39Z<9 zw*3M_%f;rz+q7bIY@703?3CN>+scS8u_eA}PMmYxKoy^F@BPA@Tmr-P%_n18b9p`^dJl3+TuY&V$5I>4|jq|S&k3A;qvvB?;;){r9oZn5nO8jZe z0|Qf@Bi=&%G0s0jJoU4_!<>JLc(fDt5oa19LCz;F!= z*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7 za1H!$MbnpdfDXuP0P1+5WufuMH?>JxO8pbrW9n4sSi^m#!K z2zo@&j|4p@XkL{_U(l(7)(E;l&^rY63A#$qhXj30(C-QQyr2gJJtF8wf;QlwnZ}RY zX{&I40tXy4{ivX(mCkf|XI53z&hpHf;i_;|RaST0?49BBd1m@tuA174>Wb=_wKd+E zwKw2Tqx`}9yph=&H|KCP3L|V`L&I#X6o=n$&?+meGprR_RYg@LRBN?TF54V*TW=6z zOWKz-X|NPueS>zh6^9JbBGd8czai?sOv?KWZsDzUsYkzKnphKj`AcTH%p4xSDHSh& z-=seXASZv{q(6Wd%IW#}9-?&nvVT1~lrS(oy?amuUH*vZhw}H{V;Ov=85;mmeud&> z`Q`8a&m+=<=f{XuHkckkP?2|CNEt?zmBWuGZDBaobG4v+W{NUokTgho-kCFd%Ty!#+P&j^;EzY_F{hu{_b6?r^F9-_8# zpW4EtZf4gZt1jb2q z$_E=i6CORv&7x)30Y=NhuLInOj1BfErC71#6)U3^bJ5jBQ;No*-m>8e3nKF_)Xf{t#n#U+GJYo`z=c`DMlDT4>GHoKk1-U%gk;a_O!QV%W8po!A z6k6H*iF%%=O{CQ8Cj3o3@FJ@+1WpFk+*E!{D6(bHFtNoWQ6-~Ckl<*<$eOwXsEMDrQk4HHDqsp63F6dS zbWPOb4M)X!X@(=@IO=a*qW@{n6(%`zm}e=Unk_fY*KWCW(xgb#Y+30w;Wh&0dYCaY z`a>hKT$r*9p80WJ-s(maWB1@O@oD)_X22bZ`3KXNGp)IoDb11D(@nTaL19;z6 zlPRVgVBbLsX0{CeB^fD?McRsd+zB%%cyPgE8}Hjr%ckaE^KOtqbpFrX2He@(w;~DxGL9u<{)9Df6<+ z9~>;docYbu6g#hQlwHpL=9>g2ZfU^DZn|^9k9z^A2u|D`;9A4)1#mXshHC;mIID<4 
zdzib+6=7V4@p}vKrqvoMEA`i>R@NPh2fcGUy+Lop?*{9NxVz@+SJY@_o&5i8RazMg z&c%Ux=#}|EYiFV2aGi}8Y`1=%vn=2ZcE-BqRsf0z@An5iWieL=4zGiEMa4A@p?DOh z^wT5J32~_a78ud4GMpfddEIn~k{gb6ddqN9vka#)Ar*{~tq87!K;b%()mRsJU0o=K zBFHIjD?rA5VLEr{ECFn7Zh)VG z52g?cM9XkO-y4`qP2hJ0%Dk(+ZnOwBjA#)kM`Cohk7iqmc&FGYu z=7(1dl1%HrdTsF%#e-3QXV6Orc=h@l?zETJX@~BWyf-(Yw$s~2uW(GHgO2`WmYhW7 ztSo0IIUCEFNyG}S&#=s1srcO~*!jN2OJMpf45RN$kfkb%<4H_H%q=)N$`ASZ*} zz-bI*x}Gffqgn80fDe@)+9?l(_h!Lg&4Ry~1wWnzKc5AsP4-avxh4x&J1g8z%Kv(Kebwo~)K?kw`X zS@2(G!H;Ia-_L@7lm+k4f}hWVD;U>9)mt9$q3+8o5-#Jfty(5_B~v6qs;balF?&x+c`4iB`8 z?#<&Noyv_s#*PL%kPh4*804FqCcAUNVp~g-)6u-7%}I9)dP4dxRo@U`H-){32iMm* zouQ6lC%LQ?LlSlZn_!H~1N+Z9HdVdEAGdBMgDsCDS7cJiDeCED#c z2RGaLdf5Q;R_B)`Qp?Hv`4&2EtG7Am#+@ZiZO%4Zy`u?5Oy8IbBiy3AZqaBX!M>p+Hyw&0;WNqu?F z@q##5xRZq7Wcg+Kvi_DLj`ZtE2u_}R+(C^3XFT_akL8!tzY7?(OBEh+a=pGC-}A`@ zWyW+CYmRj12W#wqOzuaEbvZ}6IM1sQx<5l-?q^$t{xULdaI*Xo+K2#&h~`TD?k4YB zA1%ruC@0v4f=9_dSNm^I4P%dgS^CdA%f~urZ4y5?0Q{8 z%4O5viwJ5vsW11z^8WfAqWoewrSc=&@ep*W{-nOVUv;PW39b}zMwgkNe?z2EKedmo z)3sze(nhBKn;H7@JAp;$OFc;`Eu-rPLs*XhGxcTqg(5yPeYy(9n15}B#wE2nUBN<; z2A8Q%KXVZ#>#xqGap?*hCv;zRn$yxa{1yg%N}?hIm-^q$z)6;zq}K_39Ub%!+eBCj z3Rc#Kl(c5kuh%6ui2ftxGt9BsS@c_*bPc*g(daT*MiW~``?Q#tX*gbuNA^GY{@MV2+4qd;9B$V+`fg)%87%vH7X1!-F~U8SVLVg+zX7OP BLl^)6 literal 0 HcmV?d00001 diff --git a/rccl/all-reduce/allreduce.x b/rccl/all-reduce/allreduce.x new file mode 100755 index 0000000000000000000000000000000000000000..a21c76bfb1f099fdcb0b0fde0a1a598eac47e83d GIT binary patch literal 25840 zcmeHP3v^V~x!z|c2{DkI2`@pk93&tJbY=Wt9Z`psZo#W*gaXd(_O3ah@>ko)A9VT=v3-awMg%v&^svf zB;6+JEvfX&;*Vk;s$aJJ4hg;L5qf^oFU>;FR;=T;VvR>6ZPj5Wsr2Cu(4%sG*+BYq zZ4%|#B%Yt7nlLD-%&#oqZ<{%zEZ`{(_=EA4r7Npumd>1EjfSih+#Wp!s?)+nOSlCy z?jj4$XS%-QMk2Z6MAhJpfdf{l3F{n3~=;`BvaonEIu=nDaLuXH(m z{-7)1zuODe;~h+_!yj()cDlo>7-B8nSeer0+E8a_ zL(m^v>NHEvfl#)OK`7l=nYxKNEN67hQ@JiavM@ANVsImf#ag>ZU3u9yq{ zZHq?rM*=A5q0$E3onbxaaE#$0x;j1ZkpA+iBxgBi;3Y&)mtIFR*}}$#np$UtwZdA- 
z%0l5-nLFZIRTlP^x!vBf@|jkf)yAWEg*oY!=5L!_7QL&?2S>+Hys~K6EEWO(soK6Nx%PYApk1G=Ga0N>vc$%nG>2&K1tF1IP(;KL$Tp78m z-Pc}u$IQxzC(;p#PQUYx@QlzM(P&uDH+%6)-2qpyy=`E)Rz83X1n|doVsb zRl@TtdEr*bfe|zr@?kkKqFTh;h`&3P(i!$U$)_CPo+qR|7I1b%T`74FH^q}zhC98T zsf0V@E>EW`){z1_L!LNX%gs?vK7T4o*+^MlDqkuoo{G7n@vzP?wsL`n5kQ~AiJ^k? zYYn(@UUI(yH*-n-w$*^kIS{2hY`|$=DuW!438OJchHjU}qlB06n0~8Trt!!y+>QBG z8*pR34g+q?x7mOj=YuyHaLPl5Wd>ZnJ1DKgfMcLdg|GoHNTKkv0UvI_*BWp!)#a)8 z8}LGd{8j^AWWXOb;CKV{q=M6Z`HD%~F;G-o7y3c@* zG2jOb_*ese(12fTzz-SlaRxkTz{eZ#V+Newb26MU;1eWB)TPm#(KDB8FyixF8WE0* zr4Q#VE{#)?mrEf6#t={L=#@d(N5G0w7SE}bKMk`D2^&82ZlxN&|?Gl{{1 zBEbIswQG!*l$L#^e+^^COZFAnH^`3hO6Qc+{7Ciz*>|X%dRU7b2Cn{oN{l;nf<7Xj7uIr+Bn0ka|S{TU1-yn1)Y@0!+&T=3u8GxB?RukLC}C z<1sDlqKOPiTdA-$fJ}5Ylc~fx8Vu_n@5?j#VJ~c(- zYKpX)qG>fnHhWRk!lJe2u3?+Z7~PcXD6-v`=P0V`8rD=) z9pRbTsWc+9nxcH}%VyYl9CmWZjv;JK3%;%8M;a2Z-|k4f@96sQSaWOR=G+dvJ{+41 z-^1I*v3c%eD0lyuMUxpjxx|t9NPRB104`Ub>vnADj!)Z@`#Bng;@Frw2NHl{V;!3- z^C1D%E06Z~CvSzdK4t4|yM^{O1KM-;NF`q+S{fByWY4?dn z^{PESHQaZUprh*2BPSf{Rh)TCU3!X`Q%L>`N6x8B`w2NmvcH7-DN;XGr;dJE9eo;N zz2{rmlhUbE!^xw*M&ts`Bcyq>P93`k+JCaEqt7CK+OAGJrM8{~Nvf@%_|z42p~9bj z3VIUsbUl)Mb~W;A3xv-3)X3Wws^;7YRA0OL4%PLYPa$=V;KC1kdF{{r8Qi&ksGNZi zINZ*XDY&Z82Gx4CwHKNzK89T15y&026JjSs?E^FI7>T|Nb9VK{&nTy}5PkP3$$yIE zQyi`%m{5BPZ9PFnuTxv$#&`cj2@su@g!w+I9}->$`0f!(NuoXt3PPU0;E^<9*AzH|RC#Q-d#(XwSzMUt;w2j{cs$=XV?1N7rn&+F#z?ceOfk zEwvYQ$9tA_uM1VmrKhJCsv{OS`nw(O0+ivN+3HV6EbMs?B@gaLbb%i2IicRf!&vf9&~n`+32h*}Zj%eX0FM`x1L=i(}m<4s`m4#OcHfCC?&jN8;HM zD6y{oTy?`Y(2pB7uE{SywS%Ug$tN(`Y)HHe$ReSC1LR2T1~g7WTe#F7K$?V>1G0CW zHL06am~?D-N8OT#NYjQ>>Xx(UbB(i3sNXn=NJC<8{f2&ZOOiU`W>KWgdXbX#loyyy zD9bKfb;(+RJd;A~KacM=*Z$t=7fJ|Ayq&ZG?RtTuEI!?_ zaqiw(C|KfTL(eCpcRpj$SYzV-^6uoHA%?`Kk+}0aDf9!NjfvCAKG255OAUNn`H(8a z#})AUxU!kbkw}dzBmRhfg8uT-VNS<9rqjz}pnP2U91$8+n-0gF8PGW^(WBY_$=@9 z8^6uFvT9jhQiUAU;<}PBG1COIjF@qPX(omyPb60DBaX5~WK#*vK~JcmZSndC`%Dk(og%IiMI+mJm^tA;wY=}cYXto|0+jBJt*<91vvUSC2@ z3-6%SUmKXeeHm{p6WM+jW|EV^^#3BhOV4Kt?u^ 
zy=PYqngbfhb|v{bEcAYHw!gnI@yBEibY)qR^m3;9{Sg@vNNpe)5#SMG;?OPccC34e zUh%isZ?)fMzuoSpo}Z3CO865F_r6`PrAqN*!oCIB5$UCP4tQ_8zrPtSoAsn8)056p(tIS*BEr%2i7j!@Cyqq?kR!3=;NE&tWP9tWPlKG)$uT01 zu6qtLb^RNN_WqRkjr9i|8?kz!1x(kO+L*uVOq05SM!v3lPO|t_LSRQRke$c*liQnI zg(_gNQwZDSwXXUvls$Vn@bKPbgDzjaxBgg4etOqEXIR`O<+=S! zh5UG3eiW*ZlF!@Pm$Mb?CP!EO!Thd!zA&w^2wkj&)D6?&3vTghUT$?K@@Kog0GfOU zuH%KX5UA|})}P!1aO>9nyI*}ZZTSa=rjHGYw`m61xOvKtuySAG*j!4q#hy6ZkT}(N zC3!sG(R0?EY=K3`h9fbp{2jR-YE1kEFNeCsr}qBQzjJi$Rvfc_AOG0V^-}&Vx5;LT z336)vl$rmD-Sn7Xe`D@v=*_g!GxZ-J{oOjwV^lpo^jAi#=!jEdxT_PZ5hK2X5_=C( zU8meZJUO!W0O#9?NBvoEH|K9A9xi9S+d03Ocyt8TyN&Z-B_6$#^={>S4e=PlS?^lT z&n6y2KkE&1zJmCC;+Jv$D&lFh?rrA$<-}uB$$G0fe+luJ+Ol37=SLDhf_RPd1;o>* z-ChglO~e-w&p7`Vd-s6Xd_OZ+I} zw{!jt;%Ol5-NyM>h^J4py<0i|BJuP|ws$S(cMwkvzBkPI|0aGM@yj^>H1YK5wYQn` zj}uRGq26lFKSKNj;%%INka+rJ)2ng*o5bS_3F^=Jdx`%F@r?6#6F-soV;D=Xo~$9t zm|xHzmFYY=h{9=V2?&C)eHr}a)DjHoWf}bZ44yi-QT~<;K9s@Vox$Iq!GAA<|IZBm z*$n>W4F0tY{x=ys{VkY+@mo9b)PO0B8?JFm4Vc1HI^o~}Yi!%?se(>SFLL*cM6{HJsM`e!+juJKgH4%F(g zNrIR4paw`m{!WP+90i$94U)n>p-&Bvf=s6lNnw4t#%UqH<6EHMkM-;23FqnQXqvoy z-GWEYo)qJTT;eArB0BV8{bQ9vJe#kOzi5Fyw(D4-9!= z$OA(j81lf72ZlT_kKl+ zDE%GK#OiUEe+e|EV*dh9U!O{se;*tv#N^)x=?4Xa1-(2^B1x1-w6_Nb69%T|cRQ+} z%ReOAm;8I_whTVgj2(cezd~uU{_^kU&mqx+>nBK6)|wtbRFP+2NEt@X%Hi9Q@|8b9 zV#K3D>t?b(2{+|p{F;_sWbQ7~K(Q4sCl?!CSxyBp^v8P>KY1ZvSw{fERAE8S_gUng z!+gN=m1^8a&@W-ASh71x_k4gvQ+}=)N#@&dGaal0k~4$D!#)6#tLF@%5tMGpp&!cS zEr$SI!&rX)ouC)phg-qd=*DAoL((RGr_4-B9yO!k^Y9>*Y0D`3I1DLQ@#q1HUc;jn zBq>&k<{iL|F3RLB&=JEQM>Q1N%ZLx(P0@0S@;@aj&ei1_n1LM{%40=IlBqo>nnXKK z2J+Z5z}5MbXDt<(g8C3=soagkzR6XIeHfR8o|tmueB3|8Ra`FengWKx_aQyf&y=O) zxM5&MH3^B)$`aE(Y_!s9T8;Y+rZo^4Bh{%KY-}ZNbSc-0C%YOjo-FRwfa_4Offl6{ zE0(-sWu#&*nou;QXcXEle$Zkrnm>X-vQn%79!|*?{K&vG1BtVTUpKsA)co-#{5rr~ z!iEo@H{6WopEH#;jN&THlD8ypz@pIz%nsv*gdTo5V;-BbnKF_)U^Y)t8b&EFJZut- z=c~w%vbkuCa>aOp3vzk3!;K}IgWng77{jgrDYUZrE(A0E&kRP{iib*ndHb} zo~4{LTdtn3UH8>VlOj>GONVjj8A9E_PqYWyBE z2j`eaes$rk%#>sPh;DlyG))_CF0k?v^C9zs>mMAbza06^S11ndaMWFn{^qL%CQfI- 
zPHsAD!H;tQs0vOT7~opP&jD~YT#sV`JlKnfN_&{Q!xdp%hVgR>@W$0@(cSrLR4eU_ z#e?3t?cSg_;&+2}Mcf^8^&@Mv(sus;`U=|1>33b*DMWq zgYB`7xi&!Y;9dTpr!?kj!)A5xuBbSkAry~dSAKdT8X*o5z(gb3QHmX+F|V68Npi!H zc5f+mVwPgpC8UBevK7Hm5U5-`iW=(xud55iPy{8#=>#aaFHGA@sRUFWZWAXJ#3HNa zme0JVd^RzicQsd}-930JHAHG5&KN7}2z7ePTy1fGz*80pcj}RvB@K;r zW$ie2z+GBlonBssHps*Y6fnC&jsZ?*IO5G{l;@X+I|f;%wO`#^yhib0)ZZTT(gt1K ze?yJ-!W!-1t&+FqdU!ki+2{_(KspfUC#V!8B1dI8I?2&kj!aTcQeAsMU+O(7<{wh; zKLjuJB(;c0wV(=HBBS9HbeoLE1(d^1C`*1Im=rnRR%^9Lj0mc*+hx>ftMvkG-5_nn zQnt`}^kB4g!+EysLMo6_q_=ijkRlF&nwaeeGVT+Y!dlXF{whP0fE@N&>bXi`T6GMD zV>&V@emQW3jbQf|Ylsa5K10XUSFS1Mi6o*5P9CauX&j+)P*{YE@MSIyF@0u9^y|1V z-%0(hlgdY787>p-%ijSnCJ^BcTx4gPOGAv}dRWQhBU8WsOxb~eVj9cj_W+5nf&5^8 zpcTVl_(RKwu=E zSLQ0{w*r@IQ2f}Nhxa6&u{ion;N4R-F30!jBX$ax#>S#y=r~Wr?P8R(7^gSNg{t%? zZ7mupg$%R`xMdJK^}scQoQ!q?r#_Gwy0YL8X2G8ZK3I8Zr92qklLgUHd})zA7=CFMyetboHw(Uq?KKTpsI)VyXH_t|s*_o+nEQa?Bd@u7nhy)m!-*&dSvWrp%1P`U-+s(`IZuXIG~mllb-! ztmrv~I>XKo&W{VZXlZ1nb&lT~rP-cb1Ucm*2x~+=k@WS9agZFdx^b=@l3>x>7H>!1 zbn*^xo#iL|$)vWnhFRWzpB3K4iV(De&e!8Hoyvtn#tH~4mNuL- z803qgdWUntVtZ4)v$3J2)k$X%dP4fDR$oA1^@X*I2glnvouRfnu!zLjYn$xHI4PCV){Z}7~vM>v5ZC=8D})|jN~T*Pc-Dj!FwL84FBN| zZ*<;VK7%T+w}?7;xw*W8&OU6T{t+vm8DJ2pbmIuBP$$6hna*guvw1~}v)SKbbJ97A z|NjnubUH=Db^1aPw2lDvIMgKVM94vw+G!Adif}s9ds_XXI`p<|t)6SteSML=siD?5 z_Wmt#8rJE;Ym5hZUbZ|(Tkhi$$y`>(iPFZ*VYsCJCX)tk;2dY2;}E~e`JqH;-7_4Q z)R+6&whN#e?-dG^2bEL4zwxD(hl#jj4IGmCa{pL3Pfu^w2@_^g`AB*Ca7ug4B;TE< zagA=FU(LCJI%bxi{xERLpwEVBx&{>(!KRZ`{<8c6=6@NZvdR4Aem$Gem;3dkym$>u zmNg+pWs>@G|KT`tD#BAbIWK?dzokeg{Td-B_ak--eW^%g#7Fq^{O#QF*ivMez2r3#cX1oE6*KoO&8~7H9~h~=*xBT{l&Tz-??m1$n^g@p)VeH z>hF2-T=xe>J*0`u@_!>kUw*$?KS|f1_H7KA?PoI*ss1v5`Mu{cp-=tG7&7(0g+!zN z_HvDjZx{L&=~!VTv;4GvrYg$*(Os)?$?nOVq@K6V4LpD8$0v}W;*$Du-MjAs^|wQp z$|UvW{?h{&sJ{!kdX8KMS5ob`A{~NUt6Ye zNv&E}7)ObQkf~39J|j%}uiB+?=_)%XbpCZ((9$^kjSc#gg)IY@`rpsMNtS}76NSEx z4%`pfMO+FBR{BFqTC?fb=n`v1`;qb)=Gg2k`pfEd4LWnt7&2Jq$fAF2fv)v#hC&9* z==%|2nf`mtq3gG13>Z>hQm4>2mXYE%da)p^<@bSGrvK>sjI=c%g?;HlpU%@ah76Xi z1d!okE`E$YSR!bmotm9 
literal 0 HcmV?d00001 diff --git a/rccl/reduce-scatter/reduce_scatter.x b/rccl/reduce-scatter/reduce_scatter.x new file mode 100755 index 0000000000000000000000000000000000000000..d2657f4967ef5b24773d2d3a26a0addf76226bc0 GIT binary patch literal 25848 zcmeHP3wT_`b)LJc7qX?hvW;W&h%ZY1_G?F_8~23AMC@} zw0%$=Ip@sGxifQTu6AZ!i)XP#QJ4%Cb~z)ie5S^s27YI3)#pV7q=D75 zQrs_Km8=MS0mqE=2Bz_dqz;RLOIl*E%nC|+F_9j#Xgo#Ib2!JCq_TuWNpB=*A}Z)) z!Afc~c+xARsKjS##4#r5ULhwbGwD$s_s-NRFw9|l@q|%4iK#$7#X#?rg zwON#Bvv_`zYQmtTGQaBHP*>gD>Rx|UZzz&lRkf`>S{ zf~QR57Z#}u`TQCfh16N0GiXFPBzPh-aS=TW7y8=;sqNIbcYhjK-?RQ>*0Vm6bVLw`c~=Q zU?`&ZhVBS}^#{fh>j}l$0%2clHAAc;kZcaD4EX}YERF{_ZPR;uqdr2G$mFJIDv}gP zOFSNpcO>JXNH-yde#h#FuO}XjP|gvbuNRujBB9$-fmXld&3Yu#6b*-4BcbF{J#s6T zSsL)Ce1Q(1o5qyfyY3 zdo8Pu#*)>(xW2kN7O3|50@cnsyTk6_QM|^y^jZsb&96?}ULAz9lPFnrBBuLptqMd| zCaSyiL?COTD$yHAIBTo!2t?+PfhuP$x8>L4i5@*t6~_}reX8cz=h_`r$+|#qP3@}q z?cKrd+FR;sh?qsjGC!uHM{oE8!UKI-m z!r6q|Qo28^CwsC$IOoMUNaI+z1!b`I#))yvxf(c)5!o`%A zr`~PC%S`fHO?bHpzu$!8g^&#ons71Y=V_0ca51Ik_%;)6p6@(q!sQZ!6b4NA6mG$| z?J?nJn()0Qe5wiGXTm>Y!uOl-X(l{l!l#??!zP^GcQTwX;b%*bsIJkS(KDB8G2%;f zjR?o((ua7Pt`Xt*XL-!HP17|VAv-g8%(!W~#v_CqG0w8ty1@~Co&oW@L)SPZe5`po z%`C(L_F({F2Udk5hq27RhzJ@oXhSzK~uFXo!pXJvsMCyg5?5d_F&nSFW0^q1TU=DU#BP%g+ z@oS+-ES1z^I!$aa519%D9x-?bUHU zWAP@8DU02gGSzG(JEMMWE|2HxCU0AttKHk?y3V_Z38ZsrYn!*zy|ksneYK|x0d#Cs6#fulA1)ObN(2j>ouJIrmV6lXHP6H*$SM-DC&pT!hRJfYb>@A@4yuWb~$0L za@$>&rm~{dPy$QoRK6`pUkaK{x3qBqvKhnm9MU(64nf#PTn#8l6qJGo*ZsI20CtEB zBcI0dX^rJtWBKgHa)+zDeo6UyYhUr^qAi8@6nx2gm&H0%SYGcccW?oSLD=RhFL6Q4 zC>Qx)0S1r;)Gb>s>jH%e)(ofw_$}~JH{z_h-1a34jND^wDc80X!06^ece&%PB6oRx zUvXP`L!4*kqSA=W8p}(#FWX_~A=r`qnaokxIw}OWIuEp_U%uI${;j+3?ZfSzo-Ktv zc#XKXl)Z(wjC;$X2T<k6?ZMq&xDXP6 zl2hGVYEJ-z>O}{Khch?A+K{sKrkz6j96T_gy>M4y8934|651-_T??U|-kqtTa`YD- z!Kg=uRakue04!cCExts--FM2O-nkl2(Vaf*emMi)mMC_g-t^NN84PpZjA zE$Z@tBOdkQT|qTAbdaEf>hc3e-0H=gc}-n@jF@9c{u4(|smq56IYqKRh59j4Kh~^H zd0w4z9Abl~TiKP>X;x#IgFz2+f#w0yJlL#G-39FrUFwvRh#z;UvyZ8rM?o@b=lel* zC0(fSp+lfYL65f}DdqcJis>(P)Y{%dB=v^ySUwY 
z@QAwZ(&g^HXG@Uw>aXbbD&1b9+jH)DFRPnsPYn;d=j};0(UZi3*%4INKAV2+_-n^^ z(c3Cj7Tokoa_WW`>2)_$sy_8w+lE)@H5pVR&ywhl_iVxRl&jstJBFUzX>K2V^V!;v zb71Hab;f#XFX~fo**3f^R4JDoI|iXTaj|=N!0jtV8P?2KpP0C0$6F|QH05910gPUGRKDprqUa-e@Ze8YD?z+~s%+=Z9-tfK~oxU}FJpFX# z50JGx{ewyE85)dQTA|ZJUm%_nbtZ^UOP9z-3mV4*eUs_BW0{T}fE_^^5~( z-_smrsX6ZcMNiB_!O}-tcf3F4na6Az^Q3?49LW3;Vo3ZZ5})~M7X2MiPx^Rf2(&f* zTq_?}-lhuiaRt0Fu56)lq_gA7#NVTzpuar#E~jTcV9;}upnP2U2oV}r5|lvW%8L}C zaYdcc{C{vf`SClb-s$7XeFV4A2$Fw1*$KR&YJVmeh%->=1@Fc|s8>#j5FRFu3YyW4S>4Y3wWOp|Ge4bu5m5gjv zVWc{1qzgisa8~jJ?9m9;{1=R1uJjZS4Pj4E|J;h4Jn3VZQcT@94XB&mrtS@#W_WqV;6S=a*$nIlM+%ronOJXNMJkMjlBlZZEG9u~Pi z4( zGe+cr$`d@Y?#4<7UKO2{T2QUwlzu*2KgagT zc!bUAgLsO6fWr0xJT!OuH}DFOHwihw1x|t83)VTD`413`l50QbN&i0cD==>1&CEKi zy2!=*c`GnnpSeYVGyrD8WMaU*;W--WKkvHHb(8C6m-nWf9^ZkXiRz37=2qT-?R5`J zn{GIhKf(UZyjt$v1(m1*YV_WZX#U=Ua@{R**aMrH1zw z4ty{*%nLq%7H!JbiXsi(U?lzR=%fNlx&leGig5S6??|tC-okM(Y?<{>ShX*6Z>b{M;Yy!u zO&{}IL=G=;?>K4Abikl{(}ASsd_%5>Jn28-<c#XaxWsrTG{&y{@s zCOJG|f}CAH<>r56Cp{(D?=SoVx*o0cEW>+Af2V=-7}d@Q!?j&@be>s$+-bZUo<)2Q zB@XVVs?NHBc=F@mUd}Hi9@S@q1DwB-czB!*ZsU9-@#qI^@IlVcCm!9M4Q}Op4e=Ou z+2DH4UraoG-WrT?{(Ry~h+o0^vxujmb+DcDlZeNpk_|R+zLa=OZP}oM^A_UiQ}3Y0 z`9I+^M-LX?=--_GfcSFa8Ry?4o<8Xg9!CEkp7mSeZN%^A{4a^8_BFVd^REz(=ZX4r z{zc*^6Tgk~&k|3g-r$3re~NheL_4^Z^Z!CTeVQFy&-s5Kp4$6hjPnl>KaKbmoc|{A z^a*ybo%8n+PjjKc2F`zx__K+3aQ;r>>66W%#`!ylKbLqL=i|hGmUzbbTZo@Q{Nas!;oEX}Zw{Zx;n(Kyn{)VY z=kWiS!#|nBzmUWKJcs{v4o`m@reOZ&PCPYW3fcsXQ)<8zXzD|O8ZZTVuoUQlQ`jTq zsYCG~@)LwM#MgQ6hwp$W4IsgS}pE?``N3q5!br=fo3BwC_-8>=XLb04d0H>W~!fb84KH@jGLW3I6b~QJz?l zk&dRx%Qql+^i0lY{Ur3q$+!o`JuvQpaSx1pVB7=a9vJt)xCh2PFz$hI4~%shIeC&U!)Re)Zg3UlZx?3VKY?q8g2h zP8IY5LF)uvEa);Q|t`RC2i9eRy( zZr5djIl-X6HmK`$^^UoYxwZ9mf!g|u@Ux>(urh|hkJ5bIdh(W1dpYSl&BWp->NG(D_K5jXzkCjOE))9}v% zlZ-h^fA_Pn7To1u{>-V2E#&DNv+45hfs=%o{Cgn%fMBd(l;;s73Hzem+&h*qFeAU4 zQ3XT(5g{-CKDsT3&o$!$AnLF1G+BQ^A4j4e*AJ1Ztham#QAM78A^(=n$XNw^8&Zk# zAtWY#OK9Cd*3ZLD`3!zN%g(UwE!RM?1+SnGn_O8z4Keh`dJ8{!p+wn00Kx^rf|2hJ 
zkb40O0xwY-aGyv&e4%2=?kL^!J3y9_LMxK2H{oX4*9@d!E{BVM2c*!*8A1~&-BxfG zNYSkjpsSdbl-ve-#$C9TevWQDMmHpF=6A}>q7+dxns7NDq%wO2MIVA8kM2|Fq3q$vN>vEseQseu{Uk)b?Lo*|j0a*@!@ zlYu<&7;yCp%Cm`zOhNq`XQ|vCVqfK|#NLm~Mo&z+_6pqJ##P}GCTD@6@Eu4`3Nd9l zIgXAln0%9vn4&DRtYK4>PRm-{ud%Fyz?o8=%E6}A;zpNpnRv2m0prQy-T=561siEm zN`+!8s!%2=*7CEt(X9pOe!Y9Nk}O%oIxp*(|0OJ!{(?j-idvF)4tpY|C|YsRVYq~ zPo!C0%a!5SQ-6Lhb)(6ErVwv?2Whkki^){xGao@ z!T+)VzIZY;lD`~jtvi^~8lQiu1xFw#>ajidRilqP;{VdVSt`j+oDoAxI5f>-0 zbo+dA7kDC-2pC(yt1qh#MSQ&}e}J(-Z?I~l+}Nh?4+QN!(i^gqT6Z$VI%$tJlC)0J z_&sTjEVNF#a>@iqalVPk)0 zRc|2Do$OiU0F;W{9*X#@l6n_5s)N@P;&_H=DuEsOqZ82xaex3O8i}4N>0BciU)A`1$vU{Ig@pBW9_91 z(#w}eMsqlT0#&BXp`$xHj&0S9iw#qEJKrk0``aObCtrh z>KF^hbYx8YeBcV3$nLMu5E}`6hK{MPTvJ>wl87oed8k3xI6~#1a1}1XSLhmI#>|rF z^|&zK$^MR$%12=ZE(`3--~BEn5aAYFWamL$LyY1^SjFR$vcLC~b^;JkEK`~M9w6~` zkRQtrU(SPni`$=^{XMF*^X)wHkLJO@YNhgw_*1Sc|C#J#QfOPDAvO|1O!g9ZVTo3yeOL_Kpuu_T653^X*?C)VE{ykp))64lY z2u#BB%3TG$DR8+4#gDsrcvs?1jq`GGbWGrTFEDVvFCW-PxHL8u4a2~}@^B8yS%K3V zza&0&W|_4xO*2ndD?N4xIWxZn!%S{`EZg_koX99@<+s7QQPF{?k18 zzC8GQdGL?(;1yWyja8oW^5Apx;4OLZuem;~3|2Pj$d$Y%~ zQ(dj!|j(bx!Y!4PHhIA4Lrt3PX zidd{SiCC1M&Snqvi1YD!{E(C!vnKnAF>n;=st2LnZvHk7k#YJ|sMDmWT(2Ys93FeQUpR7oa zPafd6#B+6{%R?vbbhLDOJ6(;Q7Swa}YF-%O7UkKDW*a#NHS&z)cLIMR>cz=>ek>3F zU!36Ry}>z`9AGq!W_Y^YSwjaPc2P%3I_r8FL~4CFdny_R=&bW5QsMTM9p3g(hr>&U zDgN&}0n+Ogjn^BD#?eB0sq3L8qmG3fW6eDZqHhvj@8~Yqa#b_BUA~sjHJY!#%GK7| zWFCb7f;bKDbm29}gFH7|o~JGMZ;50sE8|3IKU*;_slVBxfg3r`+2AdSqH8aWk#)?ss7y7u zD9H7|UfNlUkdOs4Sj>(Sed=H4 zkgNX{B%1XHoEjG&5c)RhSYaf${PzQ=D$4dD_jSs1<8%Ebc}X9Ju4>Yk>)(Ad0U}5_ zgB#%`z6}XfCYk^KCXI{k|AhM6Ax+Ot>dSSuJTL!2QGd~$vgMI}*aKbipVXJn$Z;{x>@a<}M9@UXrN6kJjZ3ya`TklDecAR%UIv@^)O&AEhB++zVjlg6 O=*5VzH^+Fc{(k~WGn2>w literal 0 HcmV?d00001 diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 1853aed..99fc950 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -14,6 +14,7 @@ #include #define bfloat16 nv_bfloat16 #elif USE_ROCM + #define __HIP_PLATFORM_AMD__ #include #include #include From 
79e7570c80271c3938c313c6aca1da20cb96b99f Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 14 Apr 2024 13:38:57 -0700 Subject: [PATCH 35/52] fix Makefiles --- mpi/Makefile | 13 +------------ nccl/Makefile | 5 ----- rccl/Makefile | 5 ----- 3 files changed, 1 insertion(+), 22 deletions(-) diff --git a/mpi/Makefile b/mpi/Makefile index 3efbe3f..12ed3bf 100644 --- a/mpi/Makefile +++ b/mpi/Makefile @@ -6,25 +6,14 @@ CC = cc # perlmutter flags -<<<<<<< HEAD INC = -I/global/common/software/nersc9/nccl/2.19.4/include CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl # frontier flags # INC = -I${ROCM_PATH}/include -# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI # LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl -======= -# INC = -I/global/common/software/nersc9/nccl/2.19.4/include -# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI -# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - -# frontier flags -INC = -I${ROCM_PATH}/include -CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI -LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl ->>>>>>> origin/frontier all: allgather.x allreduce.x reduce_scatter.x diff --git a/nccl/Makefile b/nccl/Makefile index 5652112..d4423b4 100644 --- a/nccl/Makefile +++ b/nccl/Makefile @@ -10,11 +10,6 @@ INC = -I/global/common/software/nersc9/nccl/2.19.4/include CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -# frontier flags -# INC = 
-I${ROCM_PATH}/include -# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL -# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl - all: allgather.x allreduce.x reduce_scatter.x allgather.x: ../allgather.cu diff --git a/rccl/Makefile b/rccl/Makefile index 590dee7..aa0a7b9 100644 --- a/rccl/Makefile +++ b/rccl/Makefile @@ -5,11 +5,6 @@ CC = cc -# perlmutter flags -# INC = -I/global/common/software/nersc9/nccl/2.19.4/include -# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - # frontier flags INC = -I${ROCM_PATH}/include CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL From 0cd86f0b6518f895bfb0d2aca42905d900a038c0 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 14 Apr 2024 17:28:06 -0400 Subject: [PATCH 36/52] add benchmark code for all-reduce and reduce-scatter --- mpi/Makefile | 12 ++++++------ mpi/all-reduce/frontier/128_gcd_run.sh | 21 +++++++++++++++++++++ mpi/all-reduce/frontier/16_gcd_run.sh | 21 +++++++++++++++++++++ mpi/all-reduce/frontier/32_gcd_run.sh | 21 +++++++++++++++++++++ mpi/all-reduce/frontier/64_gcd_run.sh | 21 +++++++++++++++++++++ mpi/reduce-scatter/frontier/128_gcd_run.sh | 21 +++++++++++++++++++++ mpi/reduce-scatter/frontier/16_gcd_run.sh | 21 +++++++++++++++++++++ mpi/reduce-scatter/frontier/32_gcd_run.sh | 21 +++++++++++++++++++++ mpi/reduce-scatter/frontier/64_gcd_run.sh | 21 +++++++++++++++++++++ 9 files changed, 174 insertions(+), 6 deletions(-) create mode 100644 mpi/all-reduce/frontier/128_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/16_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/32_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/64_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/128_gcd_run.sh create mode 
100644 mpi/reduce-scatter/frontier/16_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/32_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/64_gcd_run.sh diff --git a/mpi/Makefile b/mpi/Makefile index 12ed3bf..28861d4 100644 --- a/mpi/Makefile +++ b/mpi/Makefile @@ -6,14 +6,14 @@ CC = cc # perlmutter flags -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl +# INC = -I/global/common/software/nersc9/nccl/2.19.4/include +# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl # frontier flags -# INC = -I${ROCM_PATH}/include -# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI -# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl +INC = -I${ROCM_PATH}/include +CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl all: allgather.x allreduce.x reduce_scatter.x diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh new file mode 100644 index 0000000..5c6baf5 --- /dev/null +++ b/mpi/all-reduce/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) 
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh new file mode 100644 index 0000000..e1ad604 --- /dev/null +++ b/mpi/all-reduce/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh new file mode 100644 index 0000000..be7bdd9 --- /dev/null +++ b/mpi/all-reduce/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 
--ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a8e13d2 --- /dev/null +++ b/mpi/all-reduce/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh new file mode 100644 index 0000000..b6505f8 --- /dev/null +++ b/mpi/reduce-scatter/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git 
a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh new file mode 100644 index 0000000..eb6b2ba --- /dev/null +++ b/mpi/reduce-scatter/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh new file mode 100644 index 0000000..4ed3437 --- /dev/null +++ b/mpi/reduce-scatter/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh new file mode 
100644 index 0000000..a5a9957 --- /dev/null +++ b/mpi/reduce-scatter/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x From 032011775eaf727fa38d07fcfebf944484e76c8b Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 15 Apr 2024 03:23:10 -0400 Subject: [PATCH 37/52] add results of MPI on Frontier so far --- mpi/all-reduce/frontier/benchmarks/16_gcd.txt | 12 ++++++++++++ mpi/all-reduce/frontier/benchmarks/32_gcd.txt | 14 ++++++++++++++ mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt | 13 +++++++++++++ mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt | 15 +++++++++++++++ 4 files changed, 54 insertions(+) create mode 100644 mpi/all-reduce/frontier/benchmarks/16_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/32_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..609afbd --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10 + 0: Local 
data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 16 + 0: Message size range: 33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.133082 seconds + 0: 67108864 0.267616 seconds + 0: 134217728 0.634895 seconds + 0: 268435456 1.928400 seconds + 0: 536870912 3.973167 seconds + 0: 1073741824 7.913018 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..b92c437 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 1073741824 + 0: Number of iterations: 10 + 0: 8388608 0.043066 seconds + 0: 16777216 0.084259 seconds + 0: 33554432 0.167705 seconds + 0: 67108864 0.336696 seconds + 0: 134217728 0.773389 seconds + 0: 268435456 2.284815 seconds + 0: 536870912 4.693147 seconds + 0: 1073741824 9.356859 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..fa9c67a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.091016 seconds + 0: 67108864 5.092117 seconds + 0: 134217728 5.082377 seconds + 0: 268435456 5.103443 seconds + 0: 536870912 5.102289 seconds + 0: 1073741824 5.116191 seconds + 0: 2147483648 5.115768 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt 
b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..23a0ace --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 2147483648 + 0: Number of iterations: 10 + 0: 8388608 5.006776 seconds + 0: 16777216 4.981770 seconds + 0: 33554432 5.014587 seconds + 0: 67108864 4.994224 seconds + 0: 134217728 4.977063 seconds + 0: 268435456 4.980235 seconds + 0: 536870912 5.007770 seconds + 0: 1073741824 5.013561 seconds + 0: 2147483648 5.015718 seconds From 7752cedddba582d6f4ddbbd68f764d7e4d035995 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 15 Apr 2024 12:39:44 -0400 Subject: [PATCH 38/52] add 64 gcd data for MPI --- mpi/all-gather/frontier/benchmarks/64_gcd.txt | 14 ++++++++++++++ mpi/all-reduce/frontier/benchmarks/64_gcd.txt | 13 +++++++++++++ .../frontier/benchmarks/64_gcd.txt | 17 +++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 mpi/all-gather/frontier/benchmarks/64_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/64_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..3eed822 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10 + 0: Local data size: 32 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 262144 - 33554432 + 0: Number of iterations: 10 + 0: 262144 0.001685 seconds + 0: 524288 0.003350 seconds + 0: 1048576 0.003938 
seconds + 0: 2097152 0.006864 seconds + 0: 4194304 0.013037 seconds + 0: 8388608 0.025167 seconds + 0: 16777216 0.049414 seconds + 0: 33554432 0.211224 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..122c83e --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 1073741824 + 0: Number of iterations: 10 + 0: 16777216 0.101777 seconds + 0: 33554432 0.203258 seconds + 0: 67108864 0.406569 seconds + 0: 134217728 0.913391 seconds + 0: 268435456 2.633732 seconds + 0: 536870912 5.375804 seconds + 0: 1073741824 10.708706 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..560c383 --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,17 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 2147483648 + 0: Number of iterations: 10 + 0: 16777216 5.006610 seconds + 0: 33554432 4.998351 seconds + 0: 67108864 5.003749 seconds + 0: 134217728 5.066133 seconds + 0: 268435456 4.980950 seconds + 0: 536870912 4.982830 seconds + 0: 1073741824 5.023178 seconds + 0: 2147483648 4.988750 seconds + 0: + 0: MPICH Slingshot Network Summary: 4 network timeouts + 0: From 4d5a82721db14ffe7d5bac7be4b5750a3e1779b4 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Tue, 16 Apr 2024 02:39:12 -0400 Subject: [PATCH 39/52] add 128 gcd numbers for MPI on 
Frontier --- mpi/all-gather/frontier/benchmarks/128_gcd.txt | 13 +++++++++++++ mpi/all-reduce/frontier/benchmarks/128_gcd.txt | 12 ++++++++++++ mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt | 13 +++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 mpi/all-gather/frontier/benchmarks/128_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/128_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..824b380 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10 + 0: Local data size: 16 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 262144 - 16777216 + 0: Number of iterations: 10 + 0: 262144 0.003748 seconds + 0: 524288 0.005048 seconds + 0: 1048576 0.008068 seconds + 0: 2097152 0.014084 seconds + 0: 4194304 0.026981 seconds + 0: 8388608 0.051879 seconds + 0: 16777216 0.255600 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..56c18aa --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.240206 seconds + 0: 67108864 0.476990 seconds + 0: 134217728 1.041500 seconds + 0: 268435456 2.951969 seconds + 0: 536870912 5.990606 seconds + 0: 1073741824 12.004613 seconds diff --git 
a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..af5e98a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.046207 seconds + 0: 67108864 5.031027 seconds + 0: 134217728 5.063647 seconds + 0: 268435456 5.054240 seconds + 0: 536870912 5.047598 seconds + 0: 1073741824 5.051536 seconds + 0: 2147483648 5.057082 seconds From dffbac0f00eb291a78a6e5086e87ea8eb233049c Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Thu, 11 Jul 2024 17:13:17 -0700 Subject: [PATCH 40/52] use latest nccl --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a1fdcdb..526fb95 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Before compiling do these: ### Perlmutter ```sh -module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl/2.19.4 +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl export CRAY_ACCEL_TARGET=nvidia80 export MPICH_GPU_SUPPORT_ENABLED=1 ``` From 7ca3d66301fb92049dd797e3406ae27c0b8169cc Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Thu, 11 Jul 2024 17:17:30 -0700 Subject: [PATCH 41/52] update .gitignore to ignore .x and .out files --- LICENSE | 20 -- README.md | 15 - allgather.cu | 248 ---------------- allreduce.cu | 262 ----------------- mpi/Makefile | 30 -- mpi/all-gather/allgather.x | Bin 25696 -> 0 bytes mpi/all-gather/frontier/128_gcd_run.sh | 21 -- mpi/all-gather/frontier/16_gcd_run.sh | 21 -- mpi/all-gather/frontier/32_gcd_run.sh | 21 -- mpi/all-gather/frontier/64_gcd_run.sh | 21 -- mpi/all-gather/frontier/8_gcd_run.sh | 21 -- 
.../frontier/benchmarks/128_gcd.txt | 13 - mpi/all-gather/frontier/benchmarks/16_gcd.txt | 13 - mpi/all-gather/frontier/benchmarks/32_gcd.txt | 15 - mpi/all-gather/frontier/benchmarks/64_gcd.txt | 14 - mpi/all-gather/frontier/benchmarks/8_gcd.txt | 14 - mpi/all-gather/perlmutter/128_gpu_run.sh | 37 --- mpi/all-gather/perlmutter/16_gpu_run.sh | 37 --- mpi/all-gather/perlmutter/32_gpu_run.sh | 37 --- mpi/all-gather/perlmutter/64_gpu_run.sh | 37 --- mpi/all-gather/perlmutter/8_gpu_run.sh | 37 --- .../perlmutter/benchmarks/128_gpu.txt | 12 - .../perlmutter/benchmarks/16_gpu.txt | 12 - .../perlmutter/benchmarks/32_gpu.txt | 14 - .../perlmutter/benchmarks/64_gpu.txt | 13 - .../perlmutter/benchmarks/8_gpu.txt | 13 - mpi/all-reduce/allreduce.x | Bin 25832 -> 0 bytes mpi/all-reduce/frontier/128_gcd_run.sh | 21 -- mpi/all-reduce/frontier/16_gcd_run.sh | 21 -- mpi/all-reduce/frontier/32_gcd_run.sh | 21 -- mpi/all-reduce/frontier/64_gcd_run.sh | 21 -- mpi/all-reduce/frontier/8_gcd_run.sh | 21 -- .../frontier/benchmarks/128_gcd.txt | 12 - mpi/all-reduce/frontier/benchmarks/16_gcd.txt | 12 - mpi/all-reduce/frontier/benchmarks/32_gcd.txt | 14 - mpi/all-reduce/frontier/benchmarks/64_gcd.txt | 13 - mpi/all-reduce/frontier/benchmarks/8_gcd.txt | 13 - mpi/all-reduce/perlmutter/128_gpu_run.sh | 37 --- mpi/all-reduce/perlmutter/16_gpu_run.sh | 37 --- mpi/all-reduce/perlmutter/32_gpu_run.sh | 37 --- mpi/all-reduce/perlmutter/64_gpu_run.sh | 37 --- mpi/all-reduce/perlmutter/8_gpu_run.sh | 37 --- .../perlmutter/benchmarks/128_gpu.txt | 11 - .../perlmutter/benchmarks/16_gpu.txt | 11 - .../perlmutter/benchmarks/32_gpu.txt | 13 - .../perlmutter/benchmarks/64_gpu.txt | 12 - .../perlmutter/benchmarks/8_gpu.txt | 12 - mpi/reduce-scatter/frontier/128_gcd_run.sh | 21 -- mpi/reduce-scatter/frontier/16_gcd_run.sh | 21 -- mpi/reduce-scatter/frontier/32_gcd_run.sh | 21 -- mpi/reduce-scatter/frontier/64_gcd_run.sh | 21 -- mpi/reduce-scatter/frontier/8_gcd_run.sh | 21 -- 
.../frontier/benchmarks/128_gcd.txt | 13 - .../frontier/benchmarks/16_gcd.txt | 13 - .../frontier/benchmarks/32_gcd.txt | 15 - .../frontier/benchmarks/64_gcd.txt | 17 -- .../frontier/benchmarks/8_gcd.txt | 14 - mpi/reduce-scatter/perlmutter/128_gpu_run.sh | 37 --- mpi/reduce-scatter/perlmutter/16_gpu_run.sh | 37 --- mpi/reduce-scatter/perlmutter/32_gpu_run.sh | 37 --- mpi/reduce-scatter/perlmutter/64_gpu_run.sh | 37 --- mpi/reduce-scatter/perlmutter/8_gpu_run.sh | 37 --- .../perlmutter/benchmarks/128_gpu.txt | 12 - .../perlmutter/benchmarks/16_gpu.txt | 12 - .../perlmutter/benchmarks/32_gpu.txt | 14 - .../perlmutter/benchmarks/64_gpu.txt | 13 - .../perlmutter/benchmarks/8_gpu.txt | 13 - mpi/reduce-scatter/reduce_scatter.x | Bin 25888 -> 0 bytes nccl/Makefile | 25 -- nccl/all-gather/128_gpu_run.sh | 37 --- nccl/all-gather/16_gpu_run.sh | 37 --- nccl/all-gather/32_gpu_run.sh | 37 --- nccl/all-gather/64_gpu_run.sh | 37 --- nccl/all-gather/8_gpu_run.sh | 37 --- nccl/all-gather/benchmarks/128_gpu.txt | 13 - nccl/all-gather/benchmarks/16_gpu.txt | 13 - nccl/all-gather/benchmarks/32_gpu.txt | 14 - nccl/all-gather/benchmarks/64_gpu.txt | 13 - nccl/all-gather/benchmarks/8_gpu.txt | 13 - nccl/all-reduce/128_gpu_run.sh | 37 --- nccl/all-reduce/16_gpu_run.sh | 37 --- nccl/all-reduce/32_gpu_run.sh | 37 --- nccl/all-reduce/64_gpu_run.sh | 37 --- nccl/all-reduce/8_gpu_run.sh | 37 --- nccl/all-reduce/benchmarks/128_gpu.txt | 12 - nccl/all-reduce/benchmarks/16_gpu.txt | 12 - nccl/all-reduce/benchmarks/32_gpu.txt | 14 - nccl/all-reduce/benchmarks/64_gpu.txt | 13 - nccl/all-reduce/benchmarks/8_gpu.txt | 13 - nccl/reduce-scatter/128_gpu_run.sh | 37 --- nccl/reduce-scatter/16_gpu_run.sh | 37 --- nccl/reduce-scatter/32_gpu_run.sh | 37 --- nccl/reduce-scatter/64_gpu_run.sh | 37 --- nccl/reduce-scatter/8_gpu_run.sh | 37 --- nccl/reduce-scatter/benchmarks/128_gpu.txt | 12 - nccl/reduce-scatter/benchmarks/16_gpu.txt | 12 - nccl/reduce-scatter/benchmarks/32_gpu.txt | 14 - 
nccl/reduce-scatter/benchmarks/64_gpu.txt | 13 - nccl/reduce-scatter/benchmarks/8_gpu.txt | 13 - rccl/Makefile | 25 -- rccl/all-gather/allgather.x | Bin 25736 -> 0 bytes rccl/all-reduce/allreduce.x | Bin 25840 -> 0 bytes rccl/reduce-scatter/reduce_scatter.x | Bin 25848 -> 0 bytes reduce_scatter.cu | 269 ------------------ 104 files changed, 2905 deletions(-) delete mode 100644 LICENSE delete mode 100644 README.md delete mode 100644 allgather.cu delete mode 100644 allreduce.cu delete mode 100644 mpi/Makefile delete mode 100755 mpi/all-gather/allgather.x delete mode 100644 mpi/all-gather/frontier/128_gcd_run.sh delete mode 100644 mpi/all-gather/frontier/16_gcd_run.sh delete mode 100644 mpi/all-gather/frontier/32_gcd_run.sh delete mode 100644 mpi/all-gather/frontier/64_gcd_run.sh delete mode 100644 mpi/all-gather/frontier/8_gcd_run.sh delete mode 100644 mpi/all-gather/frontier/benchmarks/128_gcd.txt delete mode 100644 mpi/all-gather/frontier/benchmarks/16_gcd.txt delete mode 100644 mpi/all-gather/frontier/benchmarks/32_gcd.txt delete mode 100644 mpi/all-gather/frontier/benchmarks/64_gcd.txt delete mode 100644 mpi/all-gather/frontier/benchmarks/8_gcd.txt delete mode 100644 mpi/all-gather/perlmutter/128_gpu_run.sh delete mode 100644 mpi/all-gather/perlmutter/16_gpu_run.sh delete mode 100644 mpi/all-gather/perlmutter/32_gpu_run.sh delete mode 100644 mpi/all-gather/perlmutter/64_gpu_run.sh delete mode 100644 mpi/all-gather/perlmutter/8_gpu_run.sh delete mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt delete mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt delete mode 100644 mpi/all-gather/perlmutter/benchmarks/32_gpu.txt delete mode 100644 mpi/all-gather/perlmutter/benchmarks/64_gpu.txt delete mode 100644 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt delete mode 100755 mpi/all-reduce/allreduce.x delete mode 100644 mpi/all-reduce/frontier/128_gcd_run.sh delete mode 100644 mpi/all-reduce/frontier/16_gcd_run.sh delete mode 100644 
mpi/all-reduce/frontier/32_gcd_run.sh delete mode 100644 mpi/all-reduce/frontier/64_gcd_run.sh delete mode 100644 mpi/all-reduce/frontier/8_gcd_run.sh delete mode 100644 mpi/all-reduce/frontier/benchmarks/128_gcd.txt delete mode 100644 mpi/all-reduce/frontier/benchmarks/16_gcd.txt delete mode 100644 mpi/all-reduce/frontier/benchmarks/32_gcd.txt delete mode 100644 mpi/all-reduce/frontier/benchmarks/64_gcd.txt delete mode 100644 mpi/all-reduce/frontier/benchmarks/8_gcd.txt delete mode 100644 mpi/all-reduce/perlmutter/128_gpu_run.sh delete mode 100644 mpi/all-reduce/perlmutter/16_gpu_run.sh delete mode 100644 mpi/all-reduce/perlmutter/32_gpu_run.sh delete mode 100644 mpi/all-reduce/perlmutter/64_gpu_run.sh delete mode 100644 mpi/all-reduce/perlmutter/8_gpu_run.sh delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt delete mode 100644 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt delete mode 100644 mpi/reduce-scatter/frontier/128_gcd_run.sh delete mode 100644 mpi/reduce-scatter/frontier/16_gcd_run.sh delete mode 100644 mpi/reduce-scatter/frontier/32_gcd_run.sh delete mode 100644 mpi/reduce-scatter/frontier/64_gcd_run.sh delete mode 100644 mpi/reduce-scatter/frontier/8_gcd_run.sh delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt delete mode 100644 mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt delete mode 100644 mpi/reduce-scatter/perlmutter/128_gpu_run.sh delete mode 100644 mpi/reduce-scatter/perlmutter/16_gpu_run.sh delete mode 100644 mpi/reduce-scatter/perlmutter/32_gpu_run.sh delete mode 100644 
mpi/reduce-scatter/perlmutter/64_gpu_run.sh delete mode 100644 mpi/reduce-scatter/perlmutter/8_gpu_run.sh delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt delete mode 100755 mpi/reduce-scatter/reduce_scatter.x delete mode 100644 nccl/Makefile delete mode 100644 nccl/all-gather/128_gpu_run.sh delete mode 100644 nccl/all-gather/16_gpu_run.sh delete mode 100644 nccl/all-gather/32_gpu_run.sh delete mode 100644 nccl/all-gather/64_gpu_run.sh delete mode 100644 nccl/all-gather/8_gpu_run.sh delete mode 100644 nccl/all-gather/benchmarks/128_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/16_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/32_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/64_gpu.txt delete mode 100644 nccl/all-gather/benchmarks/8_gpu.txt delete mode 100644 nccl/all-reduce/128_gpu_run.sh delete mode 100644 nccl/all-reduce/16_gpu_run.sh delete mode 100644 nccl/all-reduce/32_gpu_run.sh delete mode 100644 nccl/all-reduce/64_gpu_run.sh delete mode 100644 nccl/all-reduce/8_gpu_run.sh delete mode 100644 nccl/all-reduce/benchmarks/128_gpu.txt delete mode 100644 nccl/all-reduce/benchmarks/16_gpu.txt delete mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt delete mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt delete mode 100644 nccl/all-reduce/benchmarks/8_gpu.txt delete mode 100644 nccl/reduce-scatter/128_gpu_run.sh delete mode 100644 nccl/reduce-scatter/16_gpu_run.sh delete mode 100644 nccl/reduce-scatter/32_gpu_run.sh delete mode 100644 nccl/reduce-scatter/64_gpu_run.sh delete mode 100644 nccl/reduce-scatter/8_gpu_run.sh delete mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt 
delete mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt delete mode 100644 rccl/Makefile delete mode 100755 rccl/all-gather/allgather.x delete mode 100755 rccl/all-reduce/allreduce.x delete mode 100755 rccl/reduce-scatter/reduce_scatter.x delete mode 100644 reduce_scatter.cu diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 9943369..0000000 --- a/LICENSE +++ /dev/null @@ -1,20 +0,0 @@ -Copyright (c) 2024, Parallel Software and Systems Group, University of -Maryland. - -Permission is hereby granted, free of charge, to any person obtaining a -copy of this software and associated documentation files (the "Software"), -to deal in the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
diff --git a/README.md b/README.md deleted file mode 100644 index 526fb95..0000000 --- a/README.md +++ /dev/null @@ -1,15 +0,0 @@ -Before compiling do these: - -### Perlmutter -```sh -module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl -export CRAY_ACCEL_TARGET=nvidia80 -export MPICH_GPU_SUPPORT_ENABLED=1 -``` -### Frontier -```sh -module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 -export MPICH_GPU_SUPPORT_ENABLED=1 -export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" -``` - diff --git a/allgather.cu b/allgather.cu deleted file mode 100644 index 8c357bb..0000000 --- a/allgather.cu +++ /dev/null @@ -1,248 +0,0 @@ -/* \file allgather.cu - * Copyright 2024 Parallel Software and Systems Group, University of Maryland. - * See the top-level LICENSE file for details. - * - * SPDX-License-Identifier: MIT - */ - -#include -#include -#include -#include - -#ifdef USE_CUDA - #include - #define bfloat16 nv_bfloat16 -#elif USE_ROCM - #define __HIP_PLATFORM_AMD__ - #include - #include - #include - #define bfloat16 hip_bfloat16 -#endif - -#ifdef USE_NCCL - #include "nccl.h" -#elif USE_RCCL - #include -#endif - -#define NUM_WARMUP_ITERATIONS 5 - -#define MPI_CHECK(cmd) do { \ - int64_t e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%ld'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if(e != cudaSuccess) { \ - printf("CUDA error %s:%d: %s\n", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define HIP_CHECK(cmd) do { \ - hipError_t e = cmd; \ - if(e != hipSuccess) { \ - printf("HIP error %s:%d: %s\n", \ - __FILE__, __LINE__, hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -// NCCL_CHECK is used to validate RCCL functions as well -#define NCCL_CHECK(cmd) do { \ - ncclResult_t e = cmd; \ - if (e != ncclSuccess) { \ - printf("NCCL error 
%s:%d %s\n", \ - __FILE__, __LINE__, ncclGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -void initializeData(bfloat16 *data, int64_t size) { - for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { - #ifdef USE_CUDA - data[i] = __float2bfloat16((float)i); - #elif USE_ROCM - // ROCm doesn't have a float2bfloat16 method - data[i] = (bfloat16) ((float) i); - #endif - } -} - -int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return EXIT_FAILURE; - } - - int num_gpus = atoi(argv[1]); - int64_t min_msg_size = atoi(argv[2]); - int64_t max_msg_size = atoi(argv[3]); - int iterations = atoi(argv[4]); - - if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { - fprintf(stderr, "Invalid input parameters.\n"); - return EXIT_FAILURE; - } - - int my_rank, num_pes; - int num_gpus_per_node; - int msg_count; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - - if (num_pes != num_gpus) { - fprintf(stderr, "Number of processes must match number of GPUs.\n"); - MPI_Finalize(); - return EXIT_FAILURE; - } - - // Initialize GPU context - #if USE_CUDA - cudaGetDeviceCount(&num_gpus_per_node); - cudaSetDevice((my_rank % num_gpus_per_node)); - #elif USE_ROCM - hipGetDeviceCount(&num_gpus_per_node); - hipSetDevice((my_rank % num_gpus_per_node)); - #endif - - int64_t local_data_size = max_msg_size; // Size of local data - int64_t global_data_size = local_data_size * num_gpus; // Size of global data - - if (my_rank == 0) { - fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); - fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); - } - - bfloat16 *local_data = (bfloat16*)malloc(local_data_size); - bfloat16 *global_data = (bfloat16*)malloc(global_data_size); - - // Initialize local data - initializeData(local_data, local_data_size); - - // Allocate memory 
on GPU - bfloat16 *d_local_data, *d_global_data; - #ifdef USE_CUDA - CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); - CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU - CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - - #elif USE_ROCM - HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); - HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); - HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); - #endif - - #ifdef USE_MPI - // create 2-byte datatype (send raw, un-interpreted bytes) - MPI_Datatype mpi_type_bfloat16; - MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); - MPI_Type_commit(&mpi_type_bfloat16); - - #elif defined(USE_NCCL) || defined(USE_RCCL) - ncclUniqueId nccl_comm_id; - ncclComm_t nccl_comm; - - if (my_rank == 0) { - /* Generates an Id to be used in ncclCommInitRank. */ - ncclGetUniqueId(&nccl_comm_id); - } - - /* distribute nccl_comm_id to all ranks */ - MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, - 0, MPI_COMM_WORLD)); - - /* Create a new NCCL/RCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - #endif - - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather - double total_time, start_time; - MPI_Request request; - MPI_Status status; - - // Print benchmark results - if (my_rank == 0) { - printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); - printf("Number of iterations: %d\n", iterations); - } - fflush(NULL); - - for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(bfloat16); - // warmup iterations - for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, - d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - - 
MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) || defined(USE_RCCL) - NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - #endif - - #ifdef USE_CUDA - cudaDeviceSynchronize(); - #elif USE_ROCM - hipDeviceSynchronize(); - #endif - } - - if(msg_size >= 8388608) - iterations = 20; - - MPI_Barrier(MPI_COMM_WORLD); - start_time = MPI_Wtime(); - for (int i = 0; i < iterations; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, - d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) || defined(USE_RCCL) - NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - #endif - - #ifdef USE_CUDA - cudaDeviceSynchronize(); - #elif USE_ROCM - hipDeviceSynchronize(); - #endif - } - MPI_Barrier(MPI_COMM_WORLD); - total_time = MPI_Wtime() - start_time; - if (my_rank == 0) - printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); - } - - // Cleanup - free(local_data); - free(global_data); - #ifdef USE_CUDA - CUDA_CHECK(cudaFree(d_local_data)); - CUDA_CHECK(cudaFree(d_global_data)); - #elif USE_ROCM - HIP_CHECK(hipFree(d_local_data)); - HIP_CHECK(hipFree(d_global_data)); - #endif - - #ifdef defined(USE_NCCL) || defined(USE_RCCL) - ncclCommDestroy(nccl_comm); - #endif - - MPI_Finalize(); - return EXIT_SUCCESS; -} - diff --git a/allreduce.cu b/allreduce.cu deleted file mode 100644 index 111b254..0000000 --- a/allreduce.cu +++ /dev/null @@ -1,262 +0,0 @@ -/* \file allreduce.cu - * Copyright 2024 Parallel Software and Systems Group, University of Maryland. - * See the top-level LICENSE file for details. 
- * - * SPDX-License-Identifier: MIT - */ - -#include -#include -#include -#include - -#ifdef USE_CUDA - #include - #define bfloat16 nv_bfloat16 -#elif USE_ROCM - #define __HIP_PLATFORM_AMD__ - #include - #include - #include - #define bfloat16 hip_bfloat16 -#endif - -#ifdef USE_NCCL - #include "nccl.h" -#elif USE_RCCL - #include -#endif - -#define NUM_WARMUP_ITERATIONS 5 - -#define MPI_CHECK(cmd) do { \ - int64_t e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%ld'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if(e != cudaSuccess) { \ - printf("CUDA error %s:%d: %s\n", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define HIP_CHECK(cmd) do { \ - hipError_t e = cmd; \ - if(e != hipSuccess) { \ - printf("HIP error %s:%d: %s\n", \ - __FILE__, __LINE__, hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -// NCCL_CHECK is used to validate RCCL functions as well -#define NCCL_CHECK(cmd) do { \ - ncclResult_t e = cmd; \ - if (e != ncclSuccess) { \ - printf("NCCL error %s:%d %s\n", \ - __FILE__, __LINE__, ncclGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -void initializeData(bfloat16 *data, int64_t size) { - for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { - #ifdef USE_CUDA - data[i] = __float2bfloat16((float)i); - #elif USE_ROCM - // ROCm doesn't have a float2bfloat16 method - data[i] = (bfloat16) ((float) i); - #endif - } -} - -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - bfloat16* in = (bfloat16*) invec; - bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { - #ifdef USE_CUDA - inout[i] = __hadd(in[i], inout[i]); - #elif USE_ROCM - inout[i] = in[i] + inout[i]; - #endif - } -} - -int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return EXIT_FAILURE; 
- } - - int num_gpus = atoi(argv[1]); - int64_t min_msg_size = strtoll(argv[2], NULL, 10); - int64_t max_msg_size = strtoll(argv[3], NULL, 10); - int iterations = atoi(argv[4]); - - if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { - fprintf(stderr, "Invalid input parameters.\n"); - return EXIT_FAILURE; - } - - int my_rank, num_pes; - int num_gpus_per_node; - int msg_count; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - - if (num_pes != num_gpus) { - fprintf(stderr, "Number of processes must match number of GPUs.\n"); - MPI_Finalize(); - return EXIT_FAILURE; - } - - // Initialize GPU context - #if USE_CUDA - cudaGetDeviceCount(&num_gpus_per_node); - cudaSetDevice((my_rank % num_gpus_per_node)); - #elif USE_ROCM - hipGetDeviceCount(&num_gpus_per_node); - hipSetDevice((my_rank % num_gpus_per_node)); - #endif - - int64_t local_data_size = max_msg_size; // Size of local data - int64_t global_data_size = local_data_size; // Size of global data - - if (my_rank == 0) { - fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); - fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); - } - - bfloat16 *local_data = (bfloat16*)malloc(local_data_size); - bfloat16 *global_data = (bfloat16*)malloc(global_data_size); - - // Initialize local data - initializeData(local_data, local_data_size); - - bfloat16 *d_local_data, *d_global_data; - #ifdef USE_CUDA - CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); - CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU - CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - - #elif USE_ROCM - HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); - HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); - HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); - 
#endif - - #ifdef USE_MPI - // create 2-byte datatype (send raw, un-interpreted bytes) - MPI_Datatype mpi_type_bfloat16; - MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); - MPI_Type_commit(&mpi_type_bfloat16); - - // define custom reduce operation for nv_bfloat16 types - MPI_Op CUSTOM_SUM; - MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - - #elif defined(USE_NCCL) || defined(USE_RCCL) - ncclUniqueId nccl_comm_id; - ncclComm_t nccl_comm; - - if (my_rank == 0) { - /* Generates an Id to be used in ncclCommInitRank. */ - ncclGetUniqueId(&nccl_comm_id); - } - - /* distribute nccl_comm_id to all ranks */ - MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, - 0, MPI_COMM_WORLD)); - - /* Create a new NCCL/RCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - #endif - - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather - double total_time, start_time; - MPI_Request request; - MPI_Status status; - - // Print benchmark results - if (my_rank == 0) { - printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); - printf("Number of iterations: %d\n", iterations); - } - fflush(NULL); - - for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(bfloat16); - // warmup iterations - for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) || defined(USE_RCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - #endif - - #ifdef USE_CUDA - cudaDeviceSynchronize(); - #elif USE_ROCM - hipDeviceSynchronize(); - #endif - } - - if(msg_size >= 8388608) - iterations = 20; - - MPI_Barrier(MPI_COMM_WORLD); - 
start_time = MPI_Wtime(); - for (int i = 0; i < iterations; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) || defined(USE_RCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - #endif - - #ifdef USE_CUDA - cudaDeviceSynchronize(); - #elif USE_ROCM - hipDeviceSynchronize(); - #endif - } - MPI_Barrier(MPI_COMM_WORLD); - total_time = MPI_Wtime() - start_time; - if (my_rank == 0) - printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); - } - - // Cleanup - free(local_data); - free(global_data); - #ifdef USE_CUDA - CUDA_CHECK(cudaFree(d_local_data)); - CUDA_CHECK(cudaFree(d_global_data)); - #elif USE_ROCM - HIP_CHECK(hipFree(d_local_data)); - HIP_CHECK(hipFree(d_global_data)); - #endif - - #ifdef defined(USE_NCCL) || defined(USE_RCCL) - ncclCommDestroy(nccl_comm); - #endif - - MPI_Finalize(); - return EXIT_SUCCESS; -} diff --git a/mpi/Makefile b/mpi/Makefile deleted file mode 100644 index 28861d4..0000000 --- a/mpi/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: MIT - -CC = cc - -# perlmutter flags -# INC = -I/global/common/software/nersc9/nccl/2.19.4/include -# CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI -# LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - -# frontier flags -INC = -I${ROCM_PATH}/include -CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI -LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl - -all: allgather.x allreduce.x reduce_scatter.x - -allgather.x: ../allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu - -allreduce.x: ../allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu - -reduce_scatter.x: ../reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu - -clean: - rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/mpi/all-gather/allgather.x b/mpi/all-gather/allgather.x deleted file mode 100755 index 03793882f6c87b1c4fbedcb5062d4db7921d7a3f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25696 zcmeHP3v^UPny%ZOmjQB{r{H6}V1THk^MJ&FfaxR+9jBua67Yd`I{io*(+~TB!Bs#b z&a?q#*gcNx%+9dR?yP6lnPGQl9md@OA_|N<55UK8bY^76S2xH*MPPi8{ri_?G+^V`&w{O*YyK{+2QJ8clwwMw3gBco!>bcL@ky}Inq@LBW zV)(OIDJuY<$1y#=o@qQFX{AZWB`wlfW(Fm_h=`AvG#(@AC7feSQkg@dq&MU=5!C5p z6LeNmX3=@l%g41%#BUSjlXSPL%Smd~qcV2P(BnHAG#-$2|Ce;U`!bzMJt`LI^$NXS zp(p7!QEo|Py)1hx?xphOO7Do&8>goy>!nrbJtfNflttqLk~(#9CaJ8$8=*(}{Bs@Y z)3r(DXOp;pl4`=Bq%ysVoN9t1YjcXN`re)!ZIspUSlK`gU%? 
z3|}O{`CK09h+8{MWFk0i7@{*XK(#C>~E0TSJI3Ii8pyPxx-PMsNoIDsT&%#>(6p!5Q}m$1h^_ zZY_gR;2Q$p@79PlI1~D-fbp>K^U)1G~x`RGfAmk4N_4K-3{y@mx z6S&(4*6SNdtUC~C@dZ7RwG5%n~p$@^nYTAxb$GkH*72tfjTt)f@`Md7v>I47#H3&>cfuOqcbB6LIE` zz+BwVf_l=iIPxB4{=1@qxQ}Zrk;NjbZEGVwmnR&G2f7mBM2rgvEEq(B!kFLRlZbUQ zKNX8-2FAPG@ory~vn$;qZw>BfG$6rvAn0ShUPQTkUU%G$dhCqF^r{CG@>20ao?t}J zYb4Hav)n;1YC?Z#)KI0k=HcN%caa_=wQQ-gxuMZjZLPM}u!?XbUg3$l*H%P)6&{bT zqN>(fX|3czJbGO8C<=5gsEFNF;YWqWk+X_e#O=AG+!tCCtLSvcd>Iqvu^wNns;2yI zUuZ5FD6gvFw!H3WtlJ$bkK(SOJmqt(^Q@KS@mgO`bxm*dt}cI9&Fb2ks5jajjm^Dt zb!1+6bu1Rq)6Jc|a!-#t)KxK*Nf9hhgzgB1?+S_MXICh(Ad|z-7V?W*p$d$8JUvv0 zRm6y55noCC-I^1`~ zrn($|(twXQ*x7Etjq{Bi2At-I((N|jlOzbb-+)gx;0Fx&6a#+PfM009j~MW&20UfJ zry1~52ArN}(w#Hl7fX<+TO*&*J(p`I;)~oG5sqKVL%L^iYdk>mVkyJ zHXwzeYj6}%kGkW5a45F8m@VxIce;BtdKmLE@zNACXV^0W>S%5i zrP7N<2^$MFWLYm-i_Pi0q0!aSVrzA^*luzyVghMf*4*N1b1btjciiA?0^YdHb~8jE z-rDB6Vabw3)O?hS&DS_U>E4>qnZ2MDpuM0$(8oZZ1Kk1IhEDo(&>d*(-DofqF2h_9 zbSvlqP}#SYyO%Mg*P>iJwyHF0 zkP*E+8tTG-$>8M0c+{S5cO>6ZpU%Gpw@iI{w`1e(MA_c_FN0GY{rPhs z0VqDjvAO0ZU{FmT9~?~ez}hio%WZpv_B}(|3-{&+h4z{0y0!{S&n`qbxi8f~`RLDo zBNMfG(bvi10%`H*NQ;vfv@RnF%)z8wGb7;WP_xp2xHJsmcY;X{C z+$Tplo&1tcFMSHM>X2GCb1np$)Ur$enIf9h@JthF*wvGV{BTzFGHP9n`5o%I-`Ulb zyFYNMSMK$zkz*$aI-#yS{((cik~6QUE6)&f2GJjJwA@ws&>ZIq@ zNoOIJ{;ZX~8J#9Ik~-mcA{A&JC(RR0>Xg0E{?MjQ`V`@_Hnr@G+IAWwrMA86SJ%*m z0-rnydK&bs9Z`Op8hzOUq4Rz<`fCdnbJ53X6w{`@MrD2NB&5z0JnlJy?dthIf;&G5 zm2(gR2k%odg{m57K~>q+wlw6{9EIGmt$cZeRNel_$gi4GjK_>+@va10I{+p)*kKH6<7Z7Xc; zwzlPtzIS^a$q$nEl+JS`AgZb*l_2^EH{=X`!I`a(8{!+)`Hz)T2vPkGv4m}TO zs)Y7&XfGhmnfzFy{{qyJY%10IJ~pYFXx3}rcwBw>EE>Lj;~Dkg6BL;Lq574#5a_$7 zRAcJHuTrnrOj+8DETz_w353S(LS~v$>jhh>vm3z9UsYPEZcIU~C3(7e;7zT?bDj*$ zNtqDa-&8vGZ_k2D{yOz6rn$-AHmg58iL}PD#5_m;qDL1n=1hL%7&tm<=WL6{s&=QY zp)~pz-GdlM@+gn7=rPj~<4g{wuHf|dj)C-a^?@Tu8KC3f$>xD~OB2+yAEGiO|0sL* z>UYpm(PC%lbn-`UbNh34v9P0m+`>A>c5PKPwv%S}&){juQYlPUcTgz(SMbsJ zE`*Syf8x$xK@Lsw$xQsMJbv8St6{`KMk+E!z5}7u4>FQfBsu?A|H}cy!3PlP#C6oE 
zEY9Tl)Z>_9Z``eJe2j()fKwgG6RC{?{01LCAnE9jm)4M^kyoxifh(H5OFwpH;iop@hlbp#lm4?y3jpOrC*FARofuuI1>L zl#T}AO#oGcspB9m^Utqqa`fL*T9-NjmO_@)OIT1)uCEjsoSJ%8AeBH;2~ywf=zFgB zR@*AuZMNHOuG{uFH+xH0Igr3BQ)IF4gzUM5Ib_E|R-l6)U^a!G#$x^+@$>*4XT5xM#MAv5oYs(6CT9L3b^b8_=f1~az|nb{`HP%BM+=kv{U7|cxM zWS|KdauLG%@VabUnl%x5+^LY3%C*ATLG@)-&TTa=NXZ5E+HFQ#o;Q7^8no8?Q zL&I}s-HpTMN1s!&Ur?@j1tnIAyu1Jib4hpVAR3Bo@K=fd)YE$wc#7nCbEQl<)Q!LW zba3!}brsToU%k>S285k!(M_&;AJn*gpF?ot)Om zDI$)(dk-`9fkOzUe@uM8{jj4Sb9kD@_nm8u2l~#ns2eFqefOSbi7SP`Gvk5m4ALNg z{CH4a+@^8+?5C+3`s|-5$2@)ao??mHq&&Ts*|}cG&(-DY_t{Tn;<_Y>24jC(xuSZb&n=b|pS#jAL})tyM6?fL|0 z>Uq>2&zyxoZ5Ob?)OLVdw)|`lUpNSF*m4LAP2ZZ6zor?SbMwrHvBGS3Y%V9d+?M>b z8RO#>RQE-Wfltk;<*@D8cs#CEy(Y&m44)W8o02DOgOh&a=-Z<>=KnTv6ivS9*4yOB zhY4e5J&~RM2Ycv-!G3@KL3BY{Qke!1kp3PW=OHSn?gnfA70;-_nLmQBURI9~@iaEF z^bsoc%TR49|@fa6b zdIRV0CLRwEmX2^fN<6K=(;b{&O+38P%~# zzm9mU`B>V*`3B-kh-aK%K)g!)DNO1IXI2w$A^r&GuOyzDTKWLzXAqD3iSl#)BI4hWaY_x8+><(C@*O?hIC6AD$@PxWIHe9l?tlo0WSQ2Xahw_qIhhW1 z5OUj18mID|zO+vrin~IM6Y3D;eklxpcz!TU{DA_E2-AJM_iE5X(+Px=<&f{V<##ml z9kz_821l-2=u-nDC*$RB3r&?8$Em}STP65YgL;1IMSjo}dHyyD9zBoqvYvG9QGc`s zMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60NV6+BCYhbhnMr&ZS21aXOv<60N zV6+BCYhbhn{xWKyNc=9iy+Y#!suaJY$^GitrF>AtKPu>V1$|o3y@I|X=-Yyx5wxI6 zPiKmtvjnXbbcvuV1oaELM$iWZeN@oz3i`C5dj)+((6w2 zHew^1#<%2YYp@vuyCpPzE1{;9)wq2%)s=Pgz4Pa~E8W#q^E$8g&Gq}eHGa3dwytts z<-D4@T3=1w75Hu`5W2$`U7&GuPG^%a!j?8RF3`%b>HP|=s?s{wTB%i6R#!o_PAlWG z%^{EV3L&<xk;k!M{<8Ag?r$G7nnDIY>&>^FqgEo6NMe=L1NMA^!& zo;MI_D#|w_%6uC<)8Qr{dGk12NZ)AX>nTHMEX7;$E&?gI0|ImvvZA6pK`(d^Uh$X7 z<00}8wTb(5FHA}SHKQ@~MVeC9LBS_rNV$>+=_>+d4i8!orC2FgZ~z`%l*lWfW5>`3 zgG%M|2#?uA!72*!pXL?UVsYnWOHsaALSHT_jXTAV(*;>_$~T_^u3k%NHd2BW4J7I$CMSYEjB~#ZoX{nV^_UE-sl_G7;_8 zWM$@(YsV5uR>muU$56BdA8nZCA#%Z(YsNHBymp!iUt5?<*_bhl$C%Ol^JcN;iCl$Q z3fc>XEERRoZ>!weS#>|#G zeI}e&pj-|!W=0=6k>zn2%ivi6JMpa^L@{C2JUJiwIZ=z=*W z9Fw513rupvHNVIdQy#F#VcX1Z5-AMJNJSzR6(_EAc|39lc`On0>D$LEuC54#JUt1o zk1=15zkDd)I26F^^IN-RZOBe)-oq60g#G4F+&n?!e%u^dXr6H0(pAiqXMT&keOpas 
z)6B(Io?|{_e&OwZ^I{{V}SQ2jhv5Z&8;o z8m+vG|KCupmBZj7Y`lkFxgWG{J}M5!+4#T)_5GmbJ-$#^yn9h4phW1dK*(DjcXwiw zJ9u|Y9N&P`6tJ6rcqAGj4j;foBi3Dx9jI}ihjwOi!_h8ZIktb6W2+~mLUFPc#ZeI` zTosds%AmHvP_x1WbxFd99#2tZhG)`yxXtw2AwRK(< z<}Sko=*S10ybz(z7!P>Lqdpvtfu3Zz&t%=$SbL~|^zh|@;T(>;K#^(R?C{QxBO!QK zb4R;8Bgaxhq!!|evx@F;&{yH^Oaywo712mg4>YtlJDV!Ha9n|>yxKaqsse40iIXc} zc8weZT&_sem(?ggn;%{=NOD;F)oY8FC=rSUx7>wLSda z=oOBEbjZ;syW}JyM`bxW$fI{lA5zaJc&R5TO}EI^bDy3q?J^k2 zK)1HX+rM zQKYB#FegPE0yVM9Z_BV>UaF2Xz98bU+&`(}hO-(m7QS;`-|4qPVKm)}({ zBM@#iF0!-Dtsz8V-SzVDgv|G}89NYAOjDR#|4V#5aJdF87lwW>a5?Xi zi=+1h&Sq&`lAj%b&>37Bn}UX+<2(}Hg~;c4oYp88s{B z2I;&r>I2!XF9*Ig2mU?aBjtxy$|K<-ai~In1 za5N`-d;#sLe);r(t+ zLY$TWr}iWF!PKLjk-MDZdf4b+l>=v1%mjVW2(yN;NU+)(nz6v>Vpi|kP;70GS+Oj! ziWil`7*|whyRpE~rQ<*_&32Qj2t|702!;8HY*t^lIQOpG3rWc_YuwihDLxyw^0k+h zs|`(c8He?mKeG;-Y4McZ!2l-TU136|_* zY8NXx&~7?;kB4+BC;V9}7pycoaeiQkFJkNt*OFzn7Q4&Yyu8gtXA646`f604_+!iN|zOjTo55C*H3KN!9by~D;RV~kY>cVR6<^`zyfL=xvE_} zr$~ubHLjsWwLUL4!2GWB!xHJv$@BV_I&WyOIqA5b<@PpLo2|iVM-hiFz=aWRQ69Hw zw2^i8B2P#z?Y*(E3kU6av7-Bbekh~sma2J_b^U%ep^jRss_CS{PHNrps@k5v^m7|s zF3|>E{%{mGx`#SX_F;}AtagV%^ftid8s2H_*EgZ##$oh-9H!x! 
zE)4VB$@8fr0+9RCL^PL`VWPB8uMn5i-(=Fj4V{~;a~$HHoS#XA)*WMTNqxCLkDjOG zw&3|dj?$of%6T{5lyY|&e5^skQop`H<9K}mCzYQE&GEB4u7v)Ln z%l(uG#0JJENeC`GeOZ4i5l8wBBm^h-SN76PfumnW$dNu?f`6v|-N5L+RO2Ei=i#k* zPfzY+p^lx>9z1NnX-_fS78~m+Q{%FQ~sAx|AoGzT8h6 z`GWeppiB2o>dW=5JePi(Xn&$RW!kT-hyBo{`jh(deATJqr?oP~8J#S*#D0N*Q9rXj zt=A=FI#NDc|HUkQx&F5ZeW@oYrDb%-z!26az-)b)zHG+X=@&uSn18K9SoTm3{p;w#2)8xMc((q30~vNPh5!Hn diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh deleted file mode 100644 index 4e8c955..0000000 --- a/mpi/all-gather/frontier/128_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 15:00 -#SBATCH -N 16 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 16)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh deleted file mode 100644 index bb2429f..0000000 --- a/mpi/all-gather/frontier/16_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 128)) 
-SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh deleted file mode 100644 index e630b97..0000000 --- a/mpi/all-gather/frontier/32_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 15:00 -#SBATCH -N 4 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 64)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh deleted file mode 100644 index e7c707f..0000000 --- a/mpi/all-gather/frontier/64_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 15:00 -#SBATCH -N 8 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 
--ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh deleted file mode 100644 index 563f933..0000000 --- a/mpi/all-gather/frontier/8_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 10:00 -#SBATCH -N 1 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt deleted file mode 100644 index 824b380..0000000 --- a/mpi/all-gather/frontier/benchmarks/128_gcd.txt +++ /dev/null @@ -1,13 +0,0 @@ -srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10 - 0: Local data size: 16 - 0: Global data size: 2048 - 0: Number of GPUs: 128 - 0: Message size range: 262144 - 16777216 - 0: Number of iterations: 10 - 0: 262144 0.003748 seconds - 0: 524288 0.005048 seconds - 0: 1048576 0.008068 seconds - 0: 2097152 0.014084 seconds - 0: 4194304 0.026981 seconds - 0: 8388608 0.051879 seconds - 0: 16777216 0.255600 seconds diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt deleted file mode 100644 index 35a9e26..0000000 --- a/mpi/all-gather/frontier/benchmarks/16_gcd.txt +++ /dev/null @@ -1,13 +0,0 @@ 
-srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10 - 0: Local data size: 128 - 0: Global data size: 2048 - 0: Number of GPUs: 16 - 0: Message size range: 2097152 - 134217728 - 0: Number of iterations: 10 - 0: 2097152 0.002249 seconds - 0: 4194304 0.003148 seconds - 0: 8388608 0.006062 seconds - 0: 16777216 0.011871 seconds - 0: 33554432 0.023485 seconds - 0: 67108864 0.046822 seconds - 0: 134217728 0.139763 seconds diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt deleted file mode 100644 index f758360..0000000 --- a/mpi/all-gather/frontier/benchmarks/32_gcd.txt +++ /dev/null @@ -1,15 +0,0 @@ -srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10 - 0: Local data size: 64 - 0: Global data size: 2048 - 0: Number of GPUs: 32 - 0: Message size range: 262144 - 67108864 - 0: Number of iterations: 10 - 0: 262144 0.000783 seconds - 0: 524288 0.001513 seconds - 0: 1048576 0.002953 seconds - 0: 2097152 0.003404 seconds - 0: 4194304 0.006485 seconds - 0: 8388608 0.012489 seconds - 0: 16777216 0.024484 seconds - 0: 33554432 0.048460 seconds - 0: 67108864 0.185884 seconds diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt deleted file mode 100644 index 3eed822..0000000 --- a/mpi/all-gather/frontier/benchmarks/64_gcd.txt +++ /dev/null @@ -1,14 +0,0 @@ -srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10 - 0: Local data size: 32 - 0: Global data size: 2048 - 0: Number of GPUs: 64 - 0: Message size range: 262144 - 33554432 - 0: Number of iterations: 10 - 0: 262144 0.001685 seconds - 0: 524288 0.003350 seconds - 0: 1048576 0.003938 seconds - 0: 2097152 0.006864 seconds - 0: 4194304 
0.013037 seconds - 0: 8388608 0.025167 seconds - 0: 16777216 0.049414 seconds - 0: 33554432 0.211224 seconds diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt deleted file mode 100644 index 7856a16..0000000 --- a/mpi/all-gather/frontier/benchmarks/8_gcd.txt +++ /dev/null @@ -1,14 +0,0 @@ -srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10 -0: Local data size: 256 -0: Global data size: 2048 -0: Number of GPUs: 8 -0: Message size range: 2097152 - 268435456 -0: Number of iterations: 10 -0: 2097152 0.000505 seconds -0: 4194304 0.000856 seconds -0: 8388608 0.001645 seconds -0: 16777216 0.003223 seconds -0: 33554432 0.006379 seconds -0: 67108864 0.012691 seconds -0: 134217728 0.025316 seconds -0: 268435456 0.053944 seconds diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh deleted file mode 100644 index 710a399..0000000 --- a/mpi/all-gather/perlmutter/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 16)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N 
$NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh deleted file mode 100644 index d4d984e..0000000 --- a/mpi/all-gather/perlmutter/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 128)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh deleted file mode 100644 index d2f1b0d..0000000 --- a/mpi/all-gather/perlmutter/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 
-NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 64)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh deleted file mode 100644 index 515d667..0000000 --- a/mpi/all-gather/perlmutter/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 
--cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh deleted file mode 100644 index 210ea3d..0000000 --- a/mpi/all-gather/perlmutter/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt deleted file mode 100644 index 3787302..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 16 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 262144 - 16777216 -Number of iterations: 10 -262144 0.003218 seconds -524288 0.005101 seconds -1048576 0.008701 seconds -2097152 0.015526 seconds -4194304 0.030239 seconds -8388608 
0.060280 seconds -16777216 0.189415 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt deleted file mode 100644 index b69654b..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 128 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 2097152 - 134217728 -Number of iterations: 10 -2097152 0.002391 seconds -4194304 0.003558 seconds -8388608 0.007162 seconds -16777216 0.014929 seconds -33554432 0.030427 seconds -67108864 0.062092 seconds -134217728 0.151508 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt deleted file mode 100644 index 0e15475..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 64 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 262144 - 67108864 -Number of iterations: 10 -262144 0.000730 seconds -524288 0.001367 seconds -1048576 0.002650 seconds -2097152 0.003740 seconds -4194304 0.007503 seconds -8388608 0.014208 seconds -16777216 0.029923 seconds -33554432 0.061970 seconds -67108864 0.168545 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt deleted file mode 100644 index ed700b9..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 32 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 262144 - 33554432 -Number of iterations: 10 -262144 0.001561 seconds -524288 0.002915 seconds -1048576 0.004163 seconds -2097152 0.007885 seconds -4194304 0.014989 seconds -8388608 0.029413 seconds -16777216 0.063034 seconds -33554432 0.183096 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt deleted file mode 100644 index de3a837..0000000 --- 
a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 256 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 2097152 - 268435456 -Number of iterations: 10 -2097152 0.000838 seconds -4194304 0.001719 seconds -8388608 0.003172 seconds -16777216 0.006797 seconds -33554432 0.013860 seconds -67108864 0.027938 seconds -134217728 0.055353 seconds -268435456 0.104310 seconds diff --git a/mpi/all-reduce/allreduce.x b/mpi/all-reduce/allreduce.x deleted file mode 100755 index 283e31cfd4ec10983f8a8159c5c70bb949af53dd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25832 zcmeHP3v^t?d7itg7qX?he&E=~#Fro&@z~W$wv23S(CT5Y(iJF{j3EK9ulFG>-WU5Y zHUVQJOtJ`rLz`DiTbHIOCnrtaoRm_UgXI{E$w`PIVBEZL5+J(>KfuPO33&Vcb7yw1 z*4jc(>oCHTIo<{%`*OpPB#8ojY?^Gxv0RR$3H=$zWmEGUASZO5;#Fj~N^KoJfGQ zvo=!`>-D%vNhQ^-kb)}uDsE;a1fIyIhb;b)*n_5AM! z(x>Y|QJ)9J{gYG^1|^l{HHAYPmMm@x`|aUSG`-orxowGk$zn$`=4j#e=r&NBR$ae_ zTd*P)MR2}cL^|TujuV*yP8)|9%%?!;Dh+J_Cc;;h!M(sIDmPRH-&6+QS_c1O8T`-7 z;NLET?<<2JEQ22^ga3OO{C8#W56a*bxM36dZF(7eP8ob*8GKn8{L^Ldo66wcGI+2I zzOf8`XBm9B4E}H#{Oe`#C(7VYm%-n|eVNAQvd47|DaPdZGX?U5kLnu11^lmo+t?h| zrfUQj!ryUxE^F7d0!E2{68J$~Bi0lwc)k@p(+b`J=6@EBQJ#@&q)K^`GX53w51=d! 
zB^YCe!d??R6`HuLN@XbLRbUj-b3+%{aLkxOwAor??4+jcOugf(9L;JmhXN9J{O zzw-8x+i#lpZ@+!yF6Q-O1oL^5DLs+$M)Xh=$Y3NE6^IvXA}|wh*_BkaN4Vf!!aKrt7LX( zEFDb=q&ty_CHhl|P;`(GL%)AZ)HjrfMXBUuDv^qX1qrj>z8-H+G?e0r&R8VkP3X~$ zMJ{RR_+#l53&vqK6=V^k3dt1OkYK?(5}{OpYpj&bBdY^j;sLKO7EOf))3J1t3kWO{ z5e425_xch69qPh#Fc?lJhggsr%d3W>5&?f2J|K3jE)+ZTL?RSO7+@+C39!IsWO)OA zJ*C6D81DKpt@8AAbb4DH zEsj>!6pN>tdXvD=c#xKVET>e+o^Jv!J_tfWZV)6tF5*d0;v79NbI zmlbMwej`7=6>?z0=L?e$n~4$4BHl^-orRo8JcOICZ{Yye zsNnn|6KM6)`ZV7;U`Tvz2{_j--KTzL87`wF{67f*I>lg=o%4@ zU(8cRWYaaCAbGhIB4DLoH&UEsjk>`Reu)9`*r{ur5^kQK(_CSq zpa?LZ-*%7rlG5^7$=}16`I66yd^X6A`5MnDh53-sKpa8Um%$bB(+F7nbI&``G&M;A$wKdnxvza^+H4+3~2OT*6^wx3@p-@pMXy>ojM< zD3VtYvGcGO&}>mZPfP!FW1(t+qT8ih3i-K`&L3 zSbejZ#cUy=r$Vu4^4eOqDjeIOhc$X_^EUC;)G|-ZHv#JI=@YFonneYh8&$MyG_BX= z@!Zhq?d^5-d3#+qc~>xj46N?y^$xgKclWz*@N@z1T| z`ao$MdkKR<7&H&M4Ri$tf!&~g209AbfQR{I;z6^Zui)WmL&s^E&ls&Oz6-h=RE`D8 zovWF$*`{1HwYp{(a2kW?Ia>$&c2L$)Z6n@^>keSo66UJ6-EZlvtJ(r3u#`{r`!e!Z zgBJ2Fy*KmCUn2QXfoR0e09rapAePc(x(^cQL zs(w@%u70p;XXQf`4_oiISZ7q$x4G(_TmWJawz=wSTo5zrMLzg4D%cLbP%rB>3RSEN zPz~^x;G;pLSy#R7VGE2rWbLlkc2>aXgO%=j=lxag`nKWf-um_gFU&=?5ruWs*Kl9< z!A>m(9QoXmIdj-GE(CWqAM44yxXzvVjeGd@lYIl8os|di8gcKeb8)ibQ49?E3%`RQ z=JXnO<_-0U%DHg8`oySv$7s6oK;;y0ihHE;QAhwv&2sN-oej*rbHydc^ZD%eVP;I( zb?bhiy}YQs{6M8$Xa|M13QPAchj!+<>?2fxk;?u;*3K0_IR=XlNQ*zB;2u6_QSaSi zA)S-%7qj4P$?E3kbC+^xpL_V%?+4XbWoj&+2d%rxpchsc^x|KWP^MQk)wRb?xz#H; zb68z_hL|(R{yj&|scZ9uoFmy6pnitb&vdCXpH*j`g;?%4(q~<2;~90}G)Pt*csr zHfrXIE2+IM_2<;qpPzu#IfCm>=XmSS{Rg;nd8oV(A#jL3BvWuzT{~2})qxx|H@yY9 zv15=s?jpoRh&l$Q@g#}90&_0)#&@Ws4@ythBA8HD0UbC+Rqs*<;KtY9 zqYQ{PW?_De+J}s10ls#Oa?s6pP#zo6)4ihe}VGw!7? 
zsykZG<@4^P2UDGNCvju;1=X!T$Q(X<`0N3CJEiM_J6=l7+Wtd&y^YnXPyNQW{Uv%W z2G!^fNOa#@wqR!F4etEDu_yPN`^WGywl&l|I(DTxe;f4|^{HRmw!bJ;sg!;Da!{SR z(w!f5`)W~#yO*i|GIiCyU!&&H=aF1#B=?{G0BI8X5+HY`tHJ3S{?MX6cmfYt?~d2ihu-9gy!y~D zDY5he^$RZ}F??5plc^6qkEDAiRcI$Fl#P%Lq~`5KRl2fK!L}Q03~blZa6`Mg(ZcXvR@Ju>6z z>mY~T_;?|IBhRl}OGdV-Fw#^o^0yGmey1SmBFR5`fcg_YK+*H3c{1m+4`VjIV^rPo z01XKM8wtEqfIIm(0ZDh}SDwu8|CJh)wTsLzQfB6s2Df|XEe*5x;sIe#*D1_{$1QsS zD(%j^mYqg)=Rm{Ez3mIpG_GF<%y?K^_8v!ryE4b&kZPcN_ki_e-b7JA-a|Qi_tTR) zl4@`QcpX4VRYeq(182{tOL? zpL5;fy4AJL<-K*kXJ@z}>_!wHt5fG=;?=9X4*U1)_GEtResPT3rYba!cp7GRVc41J z$z0Wt`I+lqVW%s*6=tc_>3q;Q;m#b-eqKN`0A<&q4j*M>nNWBE4)SEUjjILp6F}Ly zAH&8O(bflwHcknMx?uJ&rhgw*kM9bFl|>u>D4JElUz`N=#DDhqO3QvRcU3J~Q zy>@g$bjv+TR)=`*7awAx;5)eYC-k&ljJ&6QgW8WR8;`wC*uwGHj|iJS9(w^8^?z92 z3;(ce1S!mQXQ8%x{{hyGBEC+l>wup_r!wL6BiGnU;PlEh;eSE61NaN$arm5#%>w?! zc>F`)c){gv&*$?rFLw{W?abW$wmXwP>dve=@?1A5vgf+rH^^y&oFw8NzWWGM?|TW! z+`kY%(tX4|f&~Gs35MVAOofKu?^So~Le}uzr&;<6A@I~RAW#1iO{BT~{b(=lTHSNq zr^yY&-5)4pzTvx1vh=M|{u3mBLdf4}$hSY&eX=0GX!!2;S=uS(x&7*ChW&+x{0uaq zAYZj>tYR0IBJSbtBQ?W!e_*-CCUmiGQFknYFSx}kdA-%AQ9j%I0nls~-s6?C5vc72 zme2ka;I3WI@8_$KVl-}f1%{?)J(*W&hUVG1;5%3yuW|3R6YY0pKJ3Yy@mx;suW|4D z(3A zxkH?vPCT5!a-*EDB_94~xjme>5RY!oa=SVIdk1(t>nyj6^Y0Rm0g&akasDmh@q%Ev zIOl&ud=2sIIsY@_=@Ue*kMl1Pj~N}ywR8T5#M7tKoRjlEAf7&%<}}VfMLgD!ENA2V zcZshjo^k$L#H+-g#Q2b3@EGy*nILzR^M6h}^|st0&Obst?kC#M`3H!nPoKFxoWGZN z8uD_xIe#bd^rS&iM_*(G#e z97N$g>In#fF#7G$%xlyW5M;i+gkMp@uPWhhF5x33{N@s#Miz5>%#DR=khTQhCtjs-99@&wZ&dJjGC5!H z6PKEdCOt6efk_WcdSKE6lOCA#z@!HzJuvBkNe@hVVA2DV9+>pNqz5KFFzJCw4@`Ps z(gTwonDoG;2PQo*u?K3z?`-Yj_oz`p<$m>2ty#?)hZYLDM9`Ij-Y95L&`pBgFX&eU z{idK#2zo%!!-Bpk=ovw)T8wgM3A#|wC4#OL^hQC0f^HJ@enGz?=r;v@LeK+(9v1XX zLC*+Uwa6&H6I;qOzPV1@gv}Dz&7c{Z0yV9%RS&ebINO%`moCzsdQ0=-4Oa&i1%v+9 zpsp`zb1rr+Zf#o*ez&x+ zZp8AJOLHz`%X$8Ng?#yYCjCJG1^Ig>{Q<;8!KlwSk!9GI{q2#7gn=35twR$G`8P$s zl)vwuEa6MdxB!UuE0ibOFMsE!zZCN0`e)=S+bj$7D25=6=;mLk5uI$E!RVk=%nCAP}4iWXw%j}0vR(AK@xCDLoa`yeqxD&m>AYTK3__NI^60mLtJhJP 
zom6EC>Q^{R_4W|^GFK(`Yq)H5$CMkdL;O0fY1@U#1z;%9ZuaRRrmQ8$RfC!FmqKEu zvc_^Zo2d*~wj%zFTyUXs z*&KpvD|xX~%r#q%zlWH5A-fEu&cUvmW7PSwIaGT4?5}nJuXiXD;ABv(EY!z>CfkaJ ziJdkDE}33Ug42;xV>p9yW@PSFkcUlF58R7=f3v^T0smYGkWEvXAwG+0vV2ao-l)xl zp1KjJh3~LZlt0whG6jJIvDGcPCFu{ulVXQ7!=`a;@ptqaf68-#MUEWSrOIim?aJ%4 zYd(F+C5fcfc6$JS4yq`Z!i<&C9~zP6x`Ji!EQB5Q4j;0x!3&p#(J=Ut74Riep5_hRTdnfps))pa>TX%lqr@9VDwj$R(6vpVO&L;(#eE4fo0I=lY7CF>14pz z2;OvcQz+^Sr~LuO0^y*&SZ{3h_XmQGA?XeIkXrXM#X9|?vINB~pf9sV36Q?s^FE^dF5W-0S z)C4b13(&XllK{LuH{f6ZKQ<_$(SGI|(i4o!Fn-bj-nd$4bF=Xp)$EZ}IvQ9p7>EWE zAs<*h;Tu|E99g5;2l@XSS~NQhuD}+0=-GpyZA;-e9B&f<8!`54+QWh9U}|WE6Hq#O zM=0vIr}Pcjt`1&LisKn@N&@!fkIzIW#4!SxXe5X1*ddw<_-N-OH=Gy@*s(p+j;)xG zil)d`0!Klhaf7I8Y6!fcE)-)4)D-6vpyI(eZ7-!7P<^;foK=uYY+2E~TYnU2HFJGP*&*2ygG@16b zj(^y3xC4LG^u(ZV;#}&8)I+=})-)801e){>=}_3;l!!-+M8}#QPgm0*4j%B?TO5m; zo6rZDID-OaH_0);>y0M@C7ts8`f$gf$Z`GG@D^`TI+_d(Mgz2K*YMwDr~P=HcH&;i zdviCuJ^tPp4#z-R4D=_n6eJ=?WjQ*@(O8a5QchAsyQnYq#>D(X>b)m;sV6B-wUb3Dcn~nOskHGa7;%g#6JaG zVN+Rrnub&{@EJO$zH&`*t;iy(;N+opUE>JVgTnQ=2w$&jNEPq*%}8UuQ}{bnsvm{* zxGbQt^ZK7(&mTfyI__8LD(GhdmupbF zF!XDI%Xybv9K9p(JqtB1$@lLgbq1HlW}#ykIL}1%3Dk2MPH&V8Rq4-#z^6+g6Kw-- zo4`&taLptqlM&$b43vi9GWa89@NWU1s6Mn(o(SJp27kT`{?jt}8)fjbWpG+#(|z1? 
zsm7apA9xB)0;jEk6!;%+gL*j5p>a`v8T@l)a9UwbinI&U^bp8F~5;GLd~+ zc29IZ++UWF|3(@7pUU9Rmcd^sgVQIEiR$@Q8T_3x_?a?zC7$1j?AMpU=a#`6%HWp+ zpXgjYC&!EL1=(4a3;dw2Nr+Pyw_nHPewB9gR|>R~Y@+(FD}yr!W|VKQt@yKsTe<_%@G(9=i?3e zAt^cLNCh@SiqGmDd=2N|YQ-rtex-I?vAm?_xy=Iz%V*@svU3!l7gZgcZvMoHH2Zi=l3}cjanVZ@1Ue z(?8&)GYI`LV^wP`Ah7zv+Qp9p?!4aEhTE`+bTlK$oy=v$Vi`n8%5@kUcK}c{=8Z)3 zILeHhmt3d|7HU8p6T8~WYl;GCb>uBB&W-uC3FZ~zhbq#YlPCDC^4!qj^3bt7{oMoJ z0au5o8%-R)kQYX{MR_cv*+$73jYi8wtNqEC7f0{;u`>K$et4tzv(1aC>c;)-f{*%| zTj=b<4b;0+%}c@zBCS3gAQg)MY+m9`rXziu`n`Rjey3NQ`Y6c%m%|^uUeRs6!B_%) zBTUa*>4}gNEVRc#^dZ9Q9sitlU*Cmitz4hy8eKPE@9OR8G{)jTCZ^$>F1)6AkmqE} z^RwlCGm*_@Wtu2$Ag#tF^&hlo;ELxr8ytstB*M$|Ly_;WHrXik|8}7-Zg=5#yrW`r^Dh5Ysc1lc z+mOf?O7!J-o0m01g8H{Pl-7SIGO7KteEFT{ZOzc4=gS;Q_5T!^W_{MI8RC~1#3mhU zu;=-o*3ERQHK7vV5&muPyE{%+BJ@i-OgBk$uu=#u}WzC54xp!i9z5qahy>n*V#BVpDrtc%+X z30aPmFV+8PiN5^qU=#XMPf{w&9LB&9HY~tWeObQj#--)cd3omgYwI;`NozL@&ZbOL zDAlK*%?OkJYu7a{-R9zi&b=NNv~e7MTZ29o;Vi+W{$G{gBuhck`9j}7i}6vHNJ~M% zN`FX6M>+itLt>lgKT^KL99vdKe|@*1L1!?ULkY{=W%N(3G_>Ttcxj_lpS~LrR_ec> zxDAbgk^w{NOX?N+(!Qk5UL&(3gFW0XPcR{BMF0Q* diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh deleted file mode 100644 index 5c6baf5..0000000 --- a/mpi/all-reduce/frontier/128_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 16 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval 
$run_cmd -set +x diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh deleted file mode 100644 index e1ad604..0000000 --- a/mpi/all-reduce/frontier/16_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 2 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh deleted file mode 100644 index be7bdd9..0000000 --- a/mpi/all-reduce/frontier/32_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 4 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh deleted file mode 100644 index 
a8e13d2..0000000 --- a/mpi/all-reduce/frontier/64_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 8 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh deleted file mode 100644 index 81ffbc4..0000000 --- a/mpi/all-reduce/frontier/8_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 1 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt deleted file mode 100644 index 56c18aa..0000000 --- a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt +++ /dev/null @@ -1,12 +0,0 @@ -srun -l -N 16 -n 128 -c7 
--ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10 - 0: Local data size: 1024 - 0: Global data size: 1024 - 0: Number of GPUs: 128 - 0: Message size range: 33554432 - 1073741824 - 0: Number of iterations: 10 - 0: 33554432 0.240206 seconds - 0: 67108864 0.476990 seconds - 0: 134217728 1.041500 seconds - 0: 268435456 2.951969 seconds - 0: 536870912 5.990606 seconds - 0: 1073741824 12.004613 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt deleted file mode 100644 index 609afbd..0000000 --- a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt +++ /dev/null @@ -1,12 +0,0 @@ -srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10 - 0: Local data size: 1024 - 0: Global data size: 1024 - 0: Number of GPUs: 16 - 0: Message size range: 33554432 - 1073741824 - 0: Number of iterations: 10 - 0: 33554432 0.133082 seconds - 0: 67108864 0.267616 seconds - 0: 134217728 0.634895 seconds - 0: 268435456 1.928400 seconds - 0: 536870912 3.973167 seconds - 0: 1073741824 7.913018 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt deleted file mode 100644 index b92c437..0000000 --- a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt +++ /dev/null @@ -1,14 +0,0 @@ -srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10 - 0: Local data size: 1024 - 0: Global data size: 1024 - 0: Number of GPUs: 32 - 0: Message size range: 8388608 - 1073741824 - 0: Number of iterations: 10 - 0: 8388608 0.043066 seconds - 0: 16777216 0.084259 seconds - 0: 33554432 0.167705 seconds - 0: 67108864 0.336696 seconds - 0: 134217728 0.773389 seconds - 0: 268435456 2.284815 seconds - 0: 536870912 4.693147 seconds - 0: 1073741824 
9.356859 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt deleted file mode 100644 index 122c83e..0000000 --- a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt +++ /dev/null @@ -1,13 +0,0 @@ -srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10 - 0: Local data size: 1024 - 0: Global data size: 1024 - 0: Number of GPUs: 64 - 0: Message size range: 16777216 - 1073741824 - 0: Number of iterations: 10 - 0: 16777216 0.101777 seconds - 0: 33554432 0.203258 seconds - 0: 67108864 0.406569 seconds - 0: 134217728 0.913391 seconds - 0: 268435456 2.633732 seconds - 0: 536870912 5.375804 seconds - 0: 1073741824 10.708706 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt deleted file mode 100644 index a9b69c1..0000000 --- a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt +++ /dev/null @@ -1,13 +0,0 @@ -srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10 -0: Local data size: 1024 -0: Global data size: 1024 -0: Number of GPUs: 8 -0: Message size range: 16777216 - 1073741824 -0: Number of iterations: 10 -0: 16777216 0.049728 seconds -0: 33554432 0.099497 seconds -0: 67108864 0.202129 seconds -0: 134217728 0.500335 seconds -0: 268435456 1.560791 seconds -0: 536870912 3.265382 seconds -0: 1073741824 6.500534 seconds diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh deleted file mode 100644 index 33729eb..0000000 --- a/mpi/all-reduce/perlmutter/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 20:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 
-NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh deleted file mode 100644 index dc30279..0000000 --- a/mpi/all-reduce/perlmutter/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 15:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 
32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh deleted file mode 100644 index be73564..0000000 --- a/mpi/all-reduce/perlmutter/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 20:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh deleted file mode 100644 index cf714da..0000000 --- a/mpi/all-reduce/perlmutter/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 20:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES 
* 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh deleted file mode 100644 index 49ff135..0000000 --- a/mpi/all-reduce/perlmutter/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 15:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 1024)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
/global/homes/a/adityat/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt deleted file mode 100644 index 4e3e17d..0000000 --- a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,11 +0,0 @@ -Local data size: 1024 -Global data size: 1024 -Number of GPUs: 128 -Message size range: 33554432 - 1073741824 -Number of iterations: 10 -33554432 0.264543 seconds -67108864 0.527909 seconds -134217728 1.092095 seconds -268435456 3.194094 seconds -536870912 6.415718 seconds -1073741824 12.819154 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt deleted file mode 100644 index b377ec2..0000000 --- a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,11 +0,0 @@ -Local data size: 1024 -Global data size: 1024 -Number of GPUs: 16 -Message size range: 33554432 - 1073741824 -Number of iterations: 10 -33554432 0.142677 seconds -67108864 0.324897 seconds -134217728 0.673650 seconds -268435456 2.140369 seconds -536870912 4.318430 seconds -1073741824 8.632880 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt deleted file mode 100644 index cda53bf..0000000 --- a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 1024 -Global data size: 1024 -Number of GPUs: 32 -Message size range: 8388608 - 1073741824 -Number of iterations: 10 -8388608 0.049975 seconds -16777216 0.092395 seconds -33554432 0.181888 seconds -67108864 0.368241 seconds -134217728 0.774021 seconds -268435456 2.362729 seconds -536870912 4.760279 seconds -1073741824 9.524390 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt deleted file mode 100644 index 341fc93..0000000 --- 
a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 1024 -Global data size: 1024 -Number of GPUs: 64 -Message size range: 16777216 - 1073741824 -Number of iterations: 10 -16777216 0.111867 seconds -33554432 0.230462 seconds -67108864 0.465838 seconds -134217728 0.970915 seconds -268435456 2.875694 seconds -536870912 5.771569 seconds -1073741824 11.522959 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt deleted file mode 100644 index 05fd1e8..0000000 --- a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 1024 -Global data size: 1024 -Number of GPUs: 8 -Message size range: 16777216 - 1073741824 -Number of iterations: 10 -16777216 0.058292 seconds -33554432 0.107128 seconds -67108864 0.211506 seconds -134217728 0.491929 seconds -268435456 1.508757 seconds -536870912 3.052047 seconds -1073741824 6.103450 seconds diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh deleted file mode 100644 index b6505f8..0000000 --- a/mpi/reduce-scatter/frontier/128_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 16 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh 
b/mpi/reduce-scatter/frontier/16_gcd_run.sh deleted file mode 100644 index eb6b2ba..0000000 --- a/mpi/reduce-scatter/frontier/16_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 2 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh deleted file mode 100644 index 4ed3437..0000000 --- a/mpi/reduce-scatter/frontier/32_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 4 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh deleted file mode 100644 index a5a9957..0000000 --- 
a/mpi/reduce-scatter/frontier/64_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 8 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh deleted file mode 100644 index 9d4191c..0000000 --- a/mpi/reduce-scatter/frontier/8_gcd_run.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -#SBATCH -p batch -#SBATCH -A CSC569 -#SBATCH -t 20:00 -#SBATCH -N 1 -#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt -#SBATCH -C nvme - -## calculating the number of nodes and GPUs -export NNODES=$SLURM_JOB_NUM_NODES -export GPUS_PER_NODE=8 ## change as per your machine -export GPUS=$(( NNODES * GPUS_PER_NODE )) - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt deleted file mode 100644 index af5e98a..0000000 --- a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt +++ /dev/null @@ -1,13 +0,0 @@ 
-srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10 - 0: Local data size: 2048 - 0: Global data size: 2048 - 0: Number of GPUs: 128 - 0: Message size range: 33554432 - 2147483648 - 0: Number of iterations: 10 - 0: 33554432 5.046207 seconds - 0: 67108864 5.031027 seconds - 0: 134217728 5.063647 seconds - 0: 268435456 5.054240 seconds - 0: 536870912 5.047598 seconds - 0: 1073741824 5.051536 seconds - 0: 2147483648 5.057082 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt deleted file mode 100644 index fa9c67a..0000000 --- a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt +++ /dev/null @@ -1,13 +0,0 @@ -srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 10 - 0: Local data size: 2048 - 0: Global data size: 2048 - 0: Number of GPUs: 16 - 0: Message size range: 33554432 - 2147483648 - 0: Number of iterations: 10 - 0: 33554432 5.091016 seconds - 0: 67108864 5.092117 seconds - 0: 134217728 5.082377 seconds - 0: 268435456 5.103443 seconds - 0: 536870912 5.102289 seconds - 0: 1073741824 5.116191 seconds - 0: 2147483648 5.115768 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt deleted file mode 100644 index 23a0ace..0000000 --- a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt +++ /dev/null @@ -1,15 +0,0 @@ -srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10 - 0: Local data size: 2048 - 0: Global data size: 2048 - 0: Number of GPUs: 32 - 0: Message size range: 8388608 - 2147483648 - 0: Number of iterations: 10 - 0: 8388608 5.006776 seconds - 0: 16777216 4.981770 seconds - 0: 33554432 5.014587 seconds - 0: 
67108864 4.994224 seconds - 0: 134217728 4.977063 seconds - 0: 268435456 4.980235 seconds - 0: 536870912 5.007770 seconds - 0: 1073741824 5.013561 seconds - 0: 2147483648 5.015718 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt deleted file mode 100644 index 560c383..0000000 --- a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt +++ /dev/null @@ -1,17 +0,0 @@ -srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10 - 0: Local data size: 2048 - 0: Global data size: 2048 - 0: Number of GPUs: 64 - 0: Message size range: 16777216 - 2147483648 - 0: Number of iterations: 10 - 0: 16777216 5.006610 seconds - 0: 33554432 4.998351 seconds - 0: 67108864 5.003749 seconds - 0: 134217728 5.066133 seconds - 0: 268435456 4.980950 seconds - 0: 536870912 4.982830 seconds - 0: 1073741824 5.023178 seconds - 0: 2147483648 4.988750 seconds - 0: - 0: MPICH Slingshot Network Summary: 4 network timeouts - 0: diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt deleted file mode 100644 index 493d5ee..0000000 --- a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt +++ /dev/null @@ -1,14 +0,0 @@ -srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10 -0: Local data size: 2048 -0: Global data size: 2048 -0: Number of GPUs: 8 -0: Message size range: 16777216 - 2147483648 -0: Number of iterations: 10 -0: 16777216 5.130130 seconds -0: 33554432 5.120491 seconds -0: 67108864 5.115654 seconds -0: 134217728 5.128319 seconds -0: 268435456 5.111989 seconds -0: 536870912 5.115996 seconds -0: 1073741824 5.127237 seconds -0: 2147483648 5.116940 seconds diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh deleted file 
mode 100644 index 469aeaf..0000000 --- a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh deleted file mode 100644 index e66b9f4..0000000 --- a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" 
-export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh deleted file mode 100644 index 07d6020..0000000 --- a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 30:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh 
b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh deleted file mode 100644 index e51945a..0000000 --- a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh deleted file mode 100644 index 1b51537..0000000 --- a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 30:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export 
NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt deleted file mode 100644 index d696072..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 3.352414 seconds -67108864 3.323000 seconds -134217728 3.331817 seconds -268435456 3.327162 seconds -536870912 3.345694 seconds -1073741824 3.326455 seconds -2147483648 3.321790 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt deleted file mode 100644 index b71477d..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 3.368300 seconds -67108864 3.361940 seconds -134217728 3.367816 seconds -268435456 3.360722 seconds -536870912 3.363088 seconds -1073741824 3.392373 seconds -2147483648 3.375325 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt deleted file mode 100644 index 38e09b1..0000000 --- 
a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 8388608 - 2147483648 -Number of iterations: 10 -8388608 3.368554 seconds -16777216 3.367485 seconds -33554432 3.376475 seconds -67108864 3.381592 seconds -134217728 3.384111 seconds -268435456 3.375780 seconds -536870912 3.371542 seconds -1073741824 3.379895 seconds -2147483648 3.381470 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt deleted file mode 100644 index d982100..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 2.220629 seconds -33554432 2.201147 seconds -67108864 2.196879 seconds -134217728 2.199449 seconds -268435456 2.194973 seconds -536870912 2.196809 seconds -1073741824 2.196212 seconds -2147483648 2.201029 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt deleted file mode 100644 index d2bdd9a..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 3.558431 seconds -33554432 3.553477 seconds -67108864 3.562137 seconds -134217728 3.556267 seconds -268435456 3.551567 seconds -536870912 3.599067 seconds -1073741824 3.608635 seconds -2147483648 3.624090 seconds diff --git a/mpi/reduce-scatter/reduce_scatter.x b/mpi/reduce-scatter/reduce_scatter.x deleted file mode 100755 index d50ad5ac990357f4067a380d5a59a5e6b24a3805..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25888 
zcmeHP3v^V~x!z|c2{Ay<@DRaAIcmVDWbz=vfPk4~0uwut2tn{U4D(1v=EXb^yvoBE z>omkt+huL_TCUexdT-Y%z3Wyjz2zZ_=xQxmAGBJv*2l!~QV^&jbH9I|y(cF_rh4yP z>#nYQv*GN2|KI-qfA9UDefHV=O!i*a?3`y(6efd-&1J;Ry;$Q=1CJToe4|K!G_ZPB zf_N$`V@2Q#IHu<}FpVc9tupDjq{TYR%%G$f75Pz<#&aY+pL2{!Dr-oT^ag?^k~*y# zr?ZkWi_Vi?AufqmU99m8NuxqeQllQVar4DooNaE>^|JH=9WVcyPNg0-i}aondQS;G zNjH-Kf~3+f3*S$8seSp%dq?Ok8^zW6y`of`(2I`I@#q+hCnRmxVJ4~c;kD4Cdj3}f z>C?4Q)Mul(f0Am#pro?A%3xsmjM~bew;~t_Cs$XjuAfmcqt+UaSZlaFx(#%%=3ldz zTQDOQMQ}b}L^@-K$fWL|4S`<-jIP{J4`3*KejeNfe5i5*dGJ+v@U?mHd-LEw$b34}NtXd}$thMIJns2Vav1 zUz-QNKM(#;9{k67@So?wU(AEQmIwb7_iGG0mu+!tNHHeIrIC;)e5YF@IEz04+`=ZY z2De6VHvF37=bnK-FYvu?Ei0=hOnB}UJnIVH4aR>aj!~YTtEXhFYw39;IUr5rh{E1N zc~p`FzohC2KcbFXp55VNWQkwS1n#$bkFG}HiOM~wlp@mYOFQZI#wBpCMv1kl~G@%$K$K4o?)%BR`DcWd@gzs z2A0pNjNex2hx-$#S!F!x_N=Jzg;&KZm%HP>tci+v&=;?+tGL}4o=yfTs_VEduR9j+ zbcZWqxW{Nu#dK?}wW=a9!xyZnTOGTt!{1SN>x{aXH`W=8PhWX!v^H{UJRa4{&0oC= zPtYCis2r%INLD1nE5ec6!s6}S5l+s^*6_?meswG4z?jDqBp+52Bbr5g74f%cb3)Mo z?+YB@o+qRu5p;FN-C21rH^sA8M?=0)He+Sd?G3pTomn6h@h0I~ZjMUw2eL`JbyAks z%Ad_jW>cPcGO9Be64=st8WBGC6T=GUml<%w@WMI`IL&orh#GJ?ccQ%22At-zGRX0o zFdCa=kmEJsQzQslzf9xY2*k#68w|LyT!#TSmfL2)jq}K*2As-}VVMD!?-t7IG~gJ7 zvmt80OR^~ZV8BNj@bv~!1o*Qa}4-927ICcPaE(_2K zCk^;{5+v%@C}woe<=Tz-Vz)+w<6_Cgd5c@)l;q_Sihzl3jR=>Y%u{-#xiy|3c|FBh zR_@k0!Y|Yz9#^?FP6;>8>+22p#au#sr8d^3`oZh}xo71_V$<^9wYjd^Qmb&IJfwV7dX?3+b z7B(+(EO6R^H!ZYX4-tsBwYwI~n>PnHP&v7;-vwmH5H<}8{!P^fTT(CG z;z+&g=z8x+Tf4Kj@FZR@j^5ELIhpe?28PVphcNUUTkJ@^uRc-u0=j|v#7;-|&Sd$X z!e4_)30p0dAHC$FrYnqPvM`0_TxfZg{7L= z&`v#RoG0q;tgaQX0G^KBD^hzKc1u%hC1c zlYTW)nCQ=BKu1s0>DdK3z2MD{GMVa^)$%DG0Cu(ff;Ry8?P_F74M{YshhO$1Sk=kY zI~emj)U|Ilt4nqsb*h)`@vG7PLj)aCmmECmP%q=mYwD8Y#2iQVXB;`LF3Avbnq*&u z`f*Y}Zdb=YtByYbvA#2{?8)lb)oA*V--%M7d5|;@+0_Yqp#7;$9sdQ=Cv0l@akc#z zNLp<_;8$1Cg$5r!40;UoL^HDdHZ}ID1wyC&YV36jHFM78)LxtV2DSB#!;m^n@aSWG zy!EI53hs0UDkmWX4$&863a%R60M%x-y$_nJK7w5TLC77l5n>}m?FUnSghXG3Ih%Uj z$5hf65Pka)$sb1caSq!FCe)Ti+mBM!?P@#R`1Yri0nzd_%=c6Kknt?Qw+~WI8tr-C 
zg6IQ8D-Io1*G*gE=z6*sd9S=l(JK`FhN5R2Ghb4>>rQ7fj+uKCO>`%5V|Mw~wNIyB zJMr3yJ@j@;j`nxIoS3lT1$w>pm#Ev`wQP8qUWjbGr%?CO7Nx3Pb8 z&0=c<)jRtyS0}Hh{-SRCvt`3eLX}F{wW|-Rqvkm>I~|@9)Zxxq>XW19@A@-p9)2Fl zd3ti!QFSSox25*lPVBRt*vTvQGk^ERCV*37r7sNbLqRQ9|!=sXc%+ z3GD)8>-xf^Zv2#v z{wFF0sq=TB8us*~g00ore+Fxx87!++yWbFc_Vm{P_SnnL|Ner2*rvx81 z)X5mo|IdvVpTCdxojG1SLvS;V5c$Uo3taOR$BQ4q-2dKqf%dGM*8VpKj9LsB{Simc z=&xVF*p7whSKDbhv5j8i>2okSc#2Ye`w*k^O@&ZP&pF%IK@MH=bPry;JSpjX{CHassyq za1Vh!T+&5fVZ2`C%5&X z#{IJfWBr6p9gO{+u(5-&SAn5NkAmeLaHsB1Op`~Vo;wa^i}*gN-U9q6x{v{<-`o1< z0jHO%0e^^aEAZWeaWl&3p8)({2jld+7v7J3D>Io4P3aw72dYwc9&n_R?>JJ6_dnNs z3I^G8%_ntoOeaT(IJ)lK&(ynLMzZfo;(MC+J9@B|pyfo@$)-f0>tw6iy$M-ecOGNO z%Y?wTF+jH0z=5AeUC7_4%QrmNd?YJB zz3a}CELkPxx&51j{N=j*I5Z(EU$m*eU=vm`j;`kY#a(xvGOe=+T`YOj?&HPpVZF+t;tzQN$$H35Zt|j$4&FGxHQ?_Fzz1Y!P zL3EKV^+ikSxbsr-d9h>H7v}UL*mQIsOlZ|_$n}Xc^%;hAd+M+)GyY9S*KWlz^H0f- z99_>8-*~eez%XIWu4i(~Kf0T46zumDDp8x)n=kFpOy`1%J=KSr%ZgM9zz`KThIAhiN`yF^+h?qoOryDS>H0w-#|QluIOvy{6gX}3uJu_oWGiQ`UKlo z#ra0!>CES#?)zLa>z`OAo>Ppo}MFeYTCTtvKu_;)yeF7edG`u1^t zEb+LXXg}vmh#yD%R?eGiLqCZ6UleGQy{f%x-?uj2gE#M38~K8^F+h`)e%3+Mlhczjtv`#JwF z#7`#v2nIadf8sUb-vPhkVoEah6G88&)Oq<$dNlO}1nM9Z==V+vlCP(pVc?r{_{BNA zD~Dg1!*}QKn{xO^a`>O+@H=w&7jpPFa`^Xi_`^9o{hmvKZm=;>ho;arQsb05G=(iX z;osjVNjI7TbpQ${g}iVpn<&Z?glWq(E=L`Vf-Hv~C<}ZN0i@72LgSPk zAPT31;ZILz_-8Xst2ADt2Jt(P z+^?Qn%?CvOjd=+x=PS{1bslz#|3>t z&^>~_Cg=x(9v8G|x+q`Jse;ZBbe^Es3F;Sgm7w2{#GXtUg3+) z(zrRN(=Lp#`Ato;v~p~Qzf`NPvQD>FX*E?f)ljY1%DHSy*kip^h%H*Ys9A%hz^{@8C+{Lk0Ss(8QV%%U=oyOU>r_cW3kE?}uZBnEd^a{s3X9 zpx5VdWa;*0f4hGuVPJZBx1b5SykGPq`FmIw?66_yp$5JNxq zoA}8K#mWW(5T*(Xdb$6O(hHa$c(KxecohBZ3{@-FJ%2%_skqRLEc4BXO#AIX3Tin# zg8quQP%jxmqbT1}a4tyE3JB0Of)y991U>5>L?vIRh^Ht*)g^=<6|PF`gSaeo$CT@?Li`@C zF^VW_3K$BHKzeL|DND$4BfyNiKuC;N7Mt#5{`IMvxpl2+flKB9;J*?EJb6Kv5L9$ywWM9$t40w>R>mlRM^d&0e}rJF zMdqxLSB`8Mchw{l{(8V%#zu~uJJO8qUoe%mjN>ZIQna{ez@pIz%#PrOgkC93Ovu{I z8YvtwTcjv0;}jSkaUqNst0<3(IqPiYl1T)Y6!Kz68f!Kie}6FQY<3CAXe+yFl3wRa 
zCQ<1P6TjC8yws`;fs;YCFk2rJnrs;`OzfDEaLL#aBsdm1#kw;nXI$zo1$o%)^v%1l zPjBM48{wa`0n!bI_ynrS>`CELy*5L7>N=n%z7tAOJ})g{3IYjY6Igsr-0O?R#ZG62 z&EMFl zfF1N!53;a%3zvxx%U@anPb?7_C|{1W=3Pu_iOrg3!a)TJJIf?TT=Va7a6thu`cp+S zTZ;IMic}`!F>wM*hsPuLbjOo%pT42H@`}np*b_{8eT?~n{)&NmV>7?k=eKrBZ^(z# zyqhWJv3t$ogn6vSvKFM5Snb~*Z+FBh0NeF1y2=`4i+cJxydT-f!G!vknXzpG^dHu!t7!4Qr1GEb*F#<&dQ zCmG;Pt2I?u>#t9(B9usmeRDc|VP7oZ0qc%=I_Ky|)@T(S{QrU)tpWz;U@JWID*T}J zGvPQKZ{q_S()U+Z1byL-MCY6;K*{iJfv~qC;a-ky=iuFOar^>KLBRg{!I|iUI1T_4 zjCf}Sc68Fg2QYKn6PP;q~h zwu4d)s6N~#&KyX@*37A%F|E3m?886>wtR*oV8Rh-Qbn?s0JgO>ALIQKtL%(~e3kCy z$w1Is8H@_JF?BTOsBjQZFfZsVAvLBpL)&*kYNCW}%y9GAW<}c2rsXL&0RpnYP*+L}pA- zh20{PMqBM)!Pd3XRw8Q)504&4w zh9&_8?BncxmBO^@7z)R9WJvrX;0hbX_Kwkz8VG!bj;XI)Q_K}vL=~JY)Zo@QLiM0< z4KBi$xizHpnI+Ndabdob{ku=9ABAPOOt3HiPIw`K2)E)QJDc4aQk2%iYMvg;wlrxR zkam0!P)rk;{2n0jb&wj$58uv%KLlK1&l;#eM|~1 z%QU10LJ>94G%5Rcx-zE=c9?0vZxVk%;L;Bg{{zRzv)u0=*TcTaRGR&}St;-0xSIX@ zS&9Fe*Z<6V{tg0Taldj`L2nCOu0ip~;5>XFa5?Xii=*QLXP0PPj_T_Hgn}Cj? z<2)15`Kad@oZcuGs?wjA10O4e40H}~%Mf_wwLB13pxJ zXe-@N_?|p?zVpxa=aK&?4}K~SJ_f5jx{s_(<4wLBJcU|;(;hww{EwwUujM$021QHr z;LGyhfjszadGK!FL!IkJ=Qj@J&+q2J>C4AZ=fKg&fuZdDBo9vCU51i>ArJn?Ja{?} z{&^mp_9YHg?pb;8(meP@d2rhPG?e`-^5C{S_&nf4ovYW%@qzb+>_pcKe6L%R5U00r z`=goMH`0K9Od-y3J#CEtArH>1m}&ZAQDzNe5n{D9wqOC%!K~gj;rN;mvtrp}6(2wc zF|L@-c47geODBL}8t*1mk%|TrNJaP=Y*t^VI2W(e3rWc_Yr?l0Qhc^==fGjmSeIA-2?7pX?{ImM@R_R>>S%9m+J;WHLEeozC&I!n;^0 zf_Bo$d_1L7xuD2d8DS-}9Ony$`68*=;hMM5*4pfHwk&FQ(byoq1$VIN}P0+))%9HLk}{A*|wnT8H+xi&qz= z($dK_uyWTY-v*f1il3-Rw@)6^H{ZFS(dMM%brvKZzra5;7HMD>gygGikRr$I$R z0IO%X;>l3kszt80z@jRbIPy`D|L-S1x?G|=yZn(D`b>}>#@yo|hgh8tg7^m~*Wib+ z`5HSO#C*>J*RWrIjjgq%$v7U_ahB6sQcUr(7rC%PtR-5o669k@|8!+Py`3 z{z9GLgQp&(oTT)76YX7-yxhO`rqFK?da{kV^`{>ajvMr|`}0Tvp-Ly&@@4%6%zsIv zy2Xg+N`xD#9sR*=gq9EHZ%a{IJf;`f1Bq0R3zwxBd zH|}r5NBcAU?*>M7s=-A;t{W_>&eJb*#vs2bNGyN^=}R$r9=SZHyj=KQo{&$dR)L&n%aq;y+Uz3g%Msn-_AaH7;>>rz(G%hL6m(TT=D_7|NS=s!|_D>A4~QeW<8-TD>vcR-hXC-voeT%N1HS+rj~PTBfM 
zKkS7r`A_Q0^I4Zp1&C0tlWZ*OErH)6VbssAryFz$S&o#?)&G5tzWiQc5&BY3QYyvJmi*N3}s{ewPhNY(;7Ho8f6+nu0H+TMws+pgInX$4K_~bJnTV1 z8^qx^Ip|XnRXMoSeubL7!KGEdikJ4YdhW$i*g*MGlr==$wB1BTR>l)g*Jw63AYRrF#( zSkL2uDA#|XJo;OO{@fhnIV@WZAjieLup{(=6QN!_xKofm50~tJ^8K|Q`m*nlybR^y blQ5m1Zwxss`(_^fZS-P9csIv*uKs@lq3fIJ diff --git a/nccl/Makefile b/nccl/Makefile deleted file mode 100644 index d4423b4..0000000 --- a/nccl/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: MIT - -CC = cc - -# perlmutter flags -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - -all: allgather.x allreduce.x reduce_scatter.x - -allgather.x: ../allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu - -allreduce.x: ../allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu - -reduce_scatter.x: ../reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu - -clean: - rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh deleted file mode 100644 index e9fc3ae..0000000 --- a/nccl/all-gather/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export 
NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh deleted file mode 100644 index a94a523..0000000 --- a/nccl/all-gather/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh deleted file 
mode 100644 index f1ecd9f..0000000 --- a/nccl/all-gather/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 64)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh deleted file mode 100644 index 357da9e..0000000 --- a/nccl/all-gather/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export 
FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 32)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh deleted file mode 100644 index 4bd249d..0000000 --- a/nccl/all-gather/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 256)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt deleted file mode 100644 index c84792c..0000000 --- a/nccl/all-gather/benchmarks/128_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 32 
-Global data size: 4096 -Number of GPUs: 128 -Message size range: 262144 - 33554432 -Number of iterations: 10 -262144 0.002247 seconds -524288 0.002277 seconds -1048576 0.002775 seconds -2097152 0.004497 seconds -4194304 0.007477 seconds -8388608 0.015057 seconds -16777216 0.028550 seconds -33554432 0.056270 seconds diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt deleted file mode 100644 index 73e83d9..0000000 --- a/nccl/all-gather/benchmarks/16_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 256 -Global data size: 4096 -Number of GPUs: 16 -Message size range: 2097152 - 268435456 -Number of iterations: 10 -2097152 0.000532 seconds -4194304 0.000982 seconds -8388608 0.001976 seconds -16777216 0.003447 seconds -33554432 0.006826 seconds -67108864 0.013190 seconds -134217728 0.026196 seconds -268435456 0.052567 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt deleted file mode 100644 index 72f0d07..0000000 --- a/nccl/all-gather/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 64 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 262144 - 67108864 -Number of iterations: 10 -262144 0.000622 seconds -524288 0.000577 seconds -1048576 0.000780 seconds -2097152 0.001190 seconds -4194304 0.002041 seconds -8388608 0.003571 seconds -16777216 0.006995 seconds -33554432 0.013830 seconds -67108864 0.027698 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt deleted file mode 100644 index db7919c..0000000 --- a/nccl/all-gather/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 32 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 262144 - 33554432 -Number of iterations: 10 -262144 0.001077 seconds -524288 0.001154 seconds -1048576 0.001399 seconds -2097152 0.002078 seconds -4194304 0.003777 seconds -8388608 0.007711 seconds -16777216 0.014418 seconds -33554432 
0.028471 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt deleted file mode 100644 index 1c654f3..0000000 --- a/nccl/all-gather/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 256 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 2097152 - 268435456 -Number of iterations: 10 -2097152 0.000286 seconds -4194304 0.000523 seconds -8388608 0.000954 seconds -16777216 0.001696 seconds -33554432 0.003150 seconds -67108864 0.006500 seconds -134217728 0.012278 seconds -268435456 0.024449 seconds diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh deleted file mode 100644 index 623f0c2..0000000 --- a/nccl/all-reduce/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh deleted file mode 100644 index af689e9..0000000 --- 
a/nccl/all-reduce/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh deleted file mode 100644 index b672e7c..0000000 --- a/nccl/all-reduce/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export 
FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh deleted file mode 100644 index fc0416c..0000000 --- a/nccl/all-reduce/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh deleted file mode 100644 index d9c0ef6..0000000 --- a/nccl/all-reduce/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 
-#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt deleted file mode 100644 index c8bc5f3..0000000 --- a/nccl/all-reduce/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 0.002252 seconds -67108864 0.003958 seconds -134217728 0.005696 seconds -268435456 0.008861 seconds -536870912 0.016701 seconds -1073741824 0.035052 seconds -2147483648 0.069582 seconds diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt deleted file mode 100644 index 8199a8f..0000000 --- a/nccl/all-reduce/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 0.000971 seconds -67108864 0.001813 seconds -134217728 0.003415 seconds -268435456 0.007049 
seconds -536870912 0.013323 seconds -1073741824 0.026322 seconds -2147483648 0.052252 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt deleted file mode 100644 index fa6e736..0000000 --- a/nccl/all-reduce/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 8388608 - 2147483648 -Number of iterations: 10 -8388608 0.000589 seconds -16777216 0.001015 seconds -33554432 0.001352 seconds -67108864 0.002146 seconds -134217728 0.003621 seconds -268435456 0.006997 seconds -536870912 0.013742 seconds -1073741824 0.027021 seconds -2147483648 0.054364 seconds diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt deleted file mode 100644 index a773bf1..0000000 --- a/nccl/all-reduce/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.001196 seconds -33554432 0.001740 seconds -67108864 0.002970 seconds -134217728 0.004544 seconds -268435456 0.008213 seconds -536870912 0.017505 seconds -1073741824 0.035188 seconds -2147483648 0.069951 seconds diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt deleted file mode 100644 index 4d60f0f..0000000 --- a/nccl/all-reduce/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.000511 seconds -33554432 0.000916 seconds -67108864 0.001663 seconds -134217728 0.003137 seconds -268435456 0.006408 seconds -536870912 0.012493 seconds -1073741824 0.024300 seconds -2147483648 0.048155 seconds diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh deleted file mode 100644 index 8590821..0000000 --- 
a/nccl/reduce-scatter/128_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 32 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh deleted file mode 100644 index 7a20fa6..0000000 --- a/nccl/reduce-scatter/16_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 4 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export 
FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh deleted file mode 100644 index 3d297ff..0000000 --- a/nccl/reduce-scatter/32_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 8 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh deleted file mode 100644 index 6bbf97a..0000000 --- a/nccl/reduce-scatter/64_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 
@@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 16 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh deleted file mode 100644 index 21c0dc4..0000000 --- a/nccl/reduce-scatter/8_gpu_run.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -#SBATCH -A m4641_g -#SBATCH -C gpu -#SBATCH -q regular -#SBATCH -t 10:00 -#SBATCH -N 2 -#SBATCH --ntasks-per-node=4 -#SBATCH -c 32 -#SBATCH --gpus-per-task=1 -#SBATCH --gpu-bind=none - -export CUDA_DEVICE_MAX_CONNECTIONS=1 -NNODES=$SLURM_JOB_NUM_NODES -GPUS=$(( NNODES * 4 )) -export WORLD_SIZE=$GPUS -export MASTER_ADDR=$(hostname) -export MASTER_PORT=29500 -export CUDA_VISIBLE_DEVICES=3,2,1,0 -export NCCL_NET_GDR_LEVEL=PHB -export NCCL_CROSS_NIC=1 -export NCCL_SOCKET_IFNAME=hsn -export NCCL_NET="AWS Libfabric" -export FI_CXI_RDZV_THRESHOLD=0 -export FI_CXI_RDZV_GET_MIN=0 -export FI_CXI_OFLOW_BUF_SIZE=1073741824 -export FI_CXI_OFLOW_BUF_COUNT=1 - -MIN_MSG_SIZE=$((1048576 
* 16)) # 1048576 = 1024 * 1024 -MAX_MSG_SIZE=$((1048576 * 2048)) - -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" - -echo $run_cmd -eval $run_cmd -set +x diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt deleted file mode 100644 index 7c1c8f9..0000000 --- a/nccl/reduce-scatter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 0.028300 seconds -67108864 0.028351 seconds -134217728 0.028351 seconds -268435456 0.028502 seconds -536870912 0.028579 seconds -1073741824 0.028650 seconds -2147483648 0.028506 seconds diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt deleted file mode 100644 index 14acf87..0000000 --- a/nccl/reduce-scatter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 0.033170 seconds -67108864 0.033280 seconds -134217728 0.033220 seconds -268435456 0.033291 seconds -536870912 0.033217 seconds -1073741824 0.033158 seconds -2147483648 0.033275 seconds diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt deleted file mode 100644 index 7eecc67..0000000 --- a/nccl/reduce-scatter/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 8388608 - 2147483648 -Number of iterations: 10 -8388608 0.027121 seconds -16777216 0.027661 seconds -33554432 0.027766 seconds -67108864 0.027992 seconds 
-134217728 0.027914 seconds -268435456 0.027912 seconds -536870912 0.027777 seconds -1073741824 0.027861 seconds -2147483648 0.027551 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt deleted file mode 100644 index 8f8ddd0..0000000 --- a/nccl/reduce-scatter/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.028306 seconds -33554432 0.028511 seconds -67108864 0.028175 seconds -134217728 0.027998 seconds -268435456 0.027883 seconds -536870912 0.027802 seconds -1073741824 0.027954 seconds -2147483648 0.028085 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt deleted file mode 100644 index 26c22b6..0000000 --- a/nccl/reduce-scatter/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.024231 seconds -33554432 0.024389 seconds -67108864 0.024167 seconds -134217728 0.024047 seconds -268435456 0.024293 seconds -536870912 0.024031 seconds -1073741824 0.024048 seconds -2147483648 0.024241 seconds diff --git a/rccl/Makefile b/rccl/Makefile deleted file mode 100644 index aa0a7b9..0000000 --- a/rccl/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. 
-# -# SPDX-License-Identifier: MIT - -CC = cc - -# frontier flags -INC = -I${ROCM_PATH}/include -CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL -LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl - -all: allgather.x allreduce.x reduce_scatter.x - -allgather.x: ../allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu - -allreduce.x: ../allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu - -reduce_scatter.x: ../reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu - -clean: - rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/rccl/all-gather/allgather.x b/rccl/all-gather/allgather.x deleted file mode 100755 index fc85917cfaeee3d0d9962cb061dab34a2359729d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25736 zcmeHP3v`s#oxk5q5@I0vCcFe6^#cP$C6fms2IP^+119bSBP2pq947NfMkh1FJP_RS zGGIFel-eHGcHPRlZp(4)S?addQVT>B(Czl%qsUfQsTMmSf)o_ku9E%z?|1Lyn<3M( z-E;Pw?a77j{_p>H|NsAe-|yaUa_{;k#{!e0FzHNe9wTngWQ{|0+-EHRE)f8!W3{XR z{!~`N^1$bCOpmW)8V^WXVbXC)^L3V)K}jzx;=?A5$4Gh&=NOYz=8!1q4LVH(b-Hu3 z&PvKGI!}7Jcy@{SU7~!F)(JUDje1nZ)01^Or|UHykhJbgI(}48sYk^kz5MI-`1}Tq z2PEAg$}Op^m&M0p9x7k9^o|KVOQCL8)=R6*gKNw$Kwz{^atftx;4Oy$WJ!%Fj)51mV+=3at zNP_d(Jkk+On_oR)^@%w!OHGF>fzq40%Sa3}Dg()DM-S7pK1Wx>Ce1^-SK{Mjsc zZx;OZEckD;;AgVnW;DQ1^-z!nACm>2lm)NJg3rr>JF?(QvfwMS;P+<1S7yOiXTjHI z!4p~VZ)U-FWx-#_f|mm?W>>MDE)6Y*`H1_BH3@v5OXG;d9|vw>6Iq48c_h3iIerzZ z%fSCh;D<7B8np`kk#A1>N2rX+4C(UZ%YV8A%?S$IhqN^0MV})S_V0oR#3Ix3REBh3 z2SzS_mH!IL!-s+=LgSj~Rd_J2Hi9W2tn}Kg*VLZba!q~HuRcDya@jRM`|#1PFsBnU zmD?GOxgs%Vx62;{(%Bsf3d9LE;_dWDW8R3<7jboao&KON1k}CS<@EW3u7LjmFIbOv zD6uYoxW(J;4zFPdE%C-0y{r6gFEI-uUQSzFfk4Pj$U+(25Q+z50%?jwLXjo0h(Fj# zh_1h6P0-yH2?Z(TpxYgQW_!@TGVX2mNZzPNBDO$ap)1zqjc|d6Pvgu3f{+} 
zu}CZwU@fi9PQrMA1e_6yC75$jN-Y}mgyJ#g3nN1@AM4gL57ZlpFyH+Vf6U7@7RZVr z^?J7}8e>Am=L^K6UCc+t=aH^R#E)!IcxlYv?PcE8$b{4Dam8Gy*N$jZZwf#`4`nv! z?hflIhhq$l=j!&LX7rcKOsXVjHC{5*s`R?4VGAA2^$pG{Yn62-D-VTZ1NMfnLFSLc9sujQUuH5!TW-t`-9>=+Zl|{PUY~@LSDWVs=x?3 z4%J~LF``(+R}lX|DyBQ^cT$~lfY&@Bow0zkE9y$gd$=hcy*k|O?M_9kjJrJD=$k2^ zJLHL@YPmT|$>&c6DH$otbLC4##Zw`7G#=I&#&$Mo82S1dO-vD--(|o}9MQcO3^=Vx zq}yk}b0i3Q$bi$jQ#v^h6J}gz$a$FX>!mPg?Fx-^Bj6j;tux@pbnOP*m~N{9H?H@V z8E{HNx)laoz8fg6%Yb8oO}VfEFG!(`tv28z4fqBFKFWZ9&47y~E|1-Az>5s>j~j5j z08(y;0jKwcbUO{WTq;xCE(2~{$Gl*`X)P(;J_9~hf}n>C_&5W8*np2W;71Mkl?MEn z0iR&NlLmaE0Y7cP>HQ|%1p|Jy1c|yd@)@kb7yFxXBxgq^512l;n)hU3wB+MI5G(AgdBbW+ZVb*(nCc;QD}F@Go+ zomapX20|UKfJSd_UQk}d0_F(0hd}Mkt)fVJSt(&-p$0AMMQgD+9Je(%TUu=pN9m)T!X)T%bI#m?^6*%GIMr%nVuGQE%}qtx zwj3DVnrknrcqq?aRNFJ6rKm2#6SPr|M1u82`MmDP{*9Pm zk3I2$z2~>5TiYDla+7#%*tZqBIGH;gWf&N@2d~;s+wFhrm`qN>&B_t`h^i}i9A#ijt~(XAZSMOgVeuJh@fYOmJr_;tgKJErbK3rT61*ikqVmp9eG@A=II zpBl=I^$!ey7EagcxE!5c`|gQ>fyy`3(kX5LjcVz&?*Q;Ms-Y=WB+;aP@`ex2s$NIE zf-#?6UH4v-x^&-JhkD}ypBnBzMbIgA>B+Np^+wJdQJ0=0<{YB`z>$mU(g8v)lI$;` zevZ`7HL7D@QOBN#Sl^{q4y1G%)o}8Z&w*5+d6G0wHLBweK>IVBI`#{M&)d|}b86eC zAW60DW1qT;9u)YKPe4BfJ>P^VpG}SY$^xN_J~i^Tg^D@%CMvH@eTT~W&L@z%NO0k& zeZ2G+{}bHB0jOMn5IA^WkSSDEVI5SP)V4lot~vp^{*#b9Wh2B!h}sXP^fZb73g&F; z9lxiPzJTcar%3)2M4#huBf*5)QfS**%6g;PhH8BOGm3y{X%go9seFid1>pN9DJF^X z9Jj#x2;P0C&Z_IDFSYl)l#jT#-X-rX@_s?y%l28XtD9$D92l_AIuvW5mPEt!`qXtV zC61gwa{d6lo#KVQ&2PlUZ+wkjZ~X=8?hh;*-=No`PYu3AqP-_9zQowu>;t|1FYGt= zk9ONq+a0!cTiX)*#*gh7jP}IY#ET^_qU+cbKPuS=+B1-=ZvH;Tm3_-!=U1NFP0PLH zx3IWtPP`1rBB6~OItXZjgn}G807#S2U4U#opPSUJN@Bn} z_#DvOd$J@>bHF#!=Ya6<(5KP2OLaPq>Gay8ABj1@34rE+w*jDkhNf6ag3keex#OS8 zI>h)t&P)Es$3Ijrcl^WL|K0dUdH&P{P>l(oKV;uh_+K}pga1y|*dAK;@21ytvJ)?U z`XI>qW>F8?GWQ6C>|4g{Dbc&&v{d{=9$)x88EGXWuU{Hz$*j{~xJK@>Ls+@5ZDD3wla*9kjRnh))VzzQu>MYS~m$ixkx2#*#2G zrGi;O%ml%-5|b~OIv;VAECO3gjv{bY4SlO5sRy1YxxfSK?=Gpp)ZSL2`P5=Z38wnu zoD$084n!yGMR{nO?{GiMXQouS%J;sB)@e+9gsJ7H5Zt=~tRry>wE^V&gzQAyCf)=0 
zZE%$X$)`YCW_`A9nIkcf{6{c$QHjZA*bGtGf5;pE#S*IVWP<>I0x(%aCidAkzD#d` zyKHycmfP;JIhXJEZEsPq;5yYj>*V^KG;_V4w9%xmw~ieZNwyrAIjKmF#0Nj6Zv;)1 zuO{1s`6pm7c_o<3QMyhjb&}EtSQ1{2(h0~>>8eQS2bZbzW1;i~&-Q1@_+{#?6?(5g zFIfksc_U10U@y=VIdiRb-*Fb04r6ei8eYFt60`-Tm zws#K=dK2Pzzl|LCj|cwkbi5b1J<*GfMdjOzT4l+H5Q5}^Xye~e$GkyqN)R62+in>#e`aycsDT8O1R zwhgp%=(*4k^Y>h6Q8!Zode(l*;x`I`-Niumyo4H|9Y7dGz%HW@wyD;7nm$+h-92kh zv-ol;e~7S}Gf)@Y;!Ql?>K>%e_I?gD`39Z<9 zw*3M_%f;rz+q7bIY@703?3CN>+scS8u_eA}PMmYxKoy^F@BPA@Tmr-P%_n18b9p`^dJl3+TuY&V$5I>4|jq|S&k3A;qvvB?;;){r9oZn5nO8jZe z0|Qf@Bi=&%G0s0jJoU4_!<>JLc(fDt5oa19LCz;F!= z*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7a19LCz;F!=*T8TM4A;PL4Gh=7 za1H!$MbnpdfDXuP0P1+5WufuMH?>JxO8pbrW9n4sSi^m#!K z2zo@&j|4p@XkL{_U(l(7)(E;l&^rY63A#$qhXj30(C-QQyr2gJJtF8wf;QlwnZ}RY zX{&I40tXy4{ivX(mCkf|XI53z&hpHf;i_;|RaST0?49BBd1m@tuA174>Wb=_wKd+E zwKw2Tqx`}9yph=&H|KCP3L|V`L&I#X6o=n$&?+meGprR_RYg@LRBN?TF54V*TW=6z zOWKz-X|NPueS>zh6^9JbBGd8czai?sOv?KWZsDzUsYkzKnphKj`AcTH%p4xSDHSh& z-=seXASZv{q(6Wd%IW#}9-?&nvVT1~lrS(oy?amuUH*vZhw}H{V;Ov=85;mmeud&> z`Q`8a&m+=<=f{XuHkckkP?2|CNEt?zmBWuGZDBaobG4v+W{NUokTgho-kCFd%Ty!#+P&j^;EzY_F{hu{_b6?r^F9-_8# zpW4EtZf4gZt1jb2q z$_E=i6CORv&7x)30Y=NhuLInOj1BfErC71#6)U3^bJ5jBQ;No*-m>8e3nKF_)Xf{t#n#U+GJYo`z=c`DMlDT4>GHoKk1-U%gk;a_O!QV%W8po!A z6k6H*iF%%=O{CQ8Cj3o3@FJ@+1WpFk+*E!{D6(bHFtNoWQ6-~Ckl<*<$eOwXsEMDrQk4HHDqsp63F6dS zbWPOb4M)X!X@(=@IO=a*qW@{n6(%`zm}e=Unk_fY*KWCW(xgb#Y+30w;Wh&0dYCaY z`a>hKT$r*9p80WJ-s(maWB1@O@oD)_X22bZ`3KXNGp)IoDb11D(@nTaL19;z6 zlPRVgVBbLsX0{CeB^fD?McRsd+zB%%cyPgE8}Hjr%ckaE^KOtqbpFrX2He@(w;~DxGL9u<{)9Df6<+ z9~>;docYbu6g#hQlwHpL=9>g2ZfU^DZn|^9k9z^A2u|D`;9A4)1#mXshHC;mIID<4 zdzib+6=7V4@p}vKrqvoMEA`i>R@NPh2fcGUy+Lop?*{9NxVz@+SJY@_o&5i8RazMg z&c%Ux=#}|EYiFV2aGi}8Y`1=%vn=2ZcE-BqRsf0z@An5iWieL=4zGiEMa4A@p?DOh z^wT5J32~_a78ud4GMpfddEIn~k{gb6ddqN9vka#)Ar*{~tq87!K;b%()mRsJU0o=K zBFHIjD?rA5VLEr{ECFn7Zh)VG z52g?cM9XkO-y4`qP2hJ0%Dk(+ZnOwBjA#)kM`Cohk7iqmc&FGYu z=7(1dl1%HrdTsF%#e-3QXV6Orc=h@l?zETJX@~BWyf-(Yw$s~2uW(GHgO2`WmYhW7 
ztSo0IIUCEFNyG}S&#=s1srcO~*!jN2OJMpf45RN$kfkb%<4H_H%q=)N$`ASZ*} zz-bI*x}Gffqgn80fDe@)+9?l(_h!Lg&4Ry~1wWnzKc5AsP4-avxh4x&J1g8z%Kv(Kebwo~)K?kw`X zS@2(G!H;Ia-_L@7lm+k4f}hWVD;U>9)mt9$q3+8o5-#Jfty(5_B~v6qs;balF?&x+c`4iB`8 z?#<&Noyv_s#*PL%kPh4*804FqCcAUNVp~g-)6u-7%}I9)dP4dxRo@U`H-){32iMm* zouQ6lC%LQ?LlSlZn_!H~1N+Z9HdVdEAGdBMgDsCDS7cJiDeCED#c z2RGaLdf5Q;R_B)`Qp?Hv`4&2EtG7Am#+@ZiZO%4Zy`u?5Oy8IbBiy3AZqaBX!M>p+Hyw&0;WNqu?F z@q##5xRZq7Wcg+Kvi_DLj`ZtE2u_}R+(C^3XFT_akL8!tzY7?(OBEh+a=pGC-}A`@ zWyW+CYmRj12W#wqOzuaEbvZ}6IM1sQx<5l-?q^$t{xULdaI*Xo+K2#&h~`TD?k4YB zA1%ruC@0v4f=9_dSNm^I4P%dgS^CdA%f~urZ4y5?0Q{8 z%4O5viwJ5vsW11z^8WfAqWoewrSc=&@ep*W{-nOVUv;PW39b}zMwgkNe?z2EKedmo z)3sze(nhBKn;H7@JAp;$OFc;`Eu-rPLs*XhGxcTqg(5yPeYy(9n15}B#wE2nUBN<; z2A8Q%KXVZ#>#xqGap?*hCv;zRn$yxa{1yg%N}?hIm-^q$z)6;zq}K_39Ub%!+eBCj z3Rc#Kl(c5kuh%6ui2ftxGt9BsS@c_*bPc*g(daT*MiW~``?Q#tX*gbuNA^GY{@MV2+4qd;9B$V+`fg)%87%vH7X1!-F~U8SVLVg+zX7OP BLl^)6 diff --git a/rccl/all-reduce/allreduce.x b/rccl/all-reduce/allreduce.x deleted file mode 100755 index a21c76bfb1f099fdcb0b0fde0a1a598eac47e83d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25840 zcmeHP3v^V~x!z|c2{DkI2`@pk93&tJbY=Wt9Z`psZo#W*gaXd(_O3ah@>ko)A9VT=v3-awMg%v&^svf zB;6+JEvfX&;*Vk;s$aJJ4hg;L5qf^oFU>;FR;=T;VvR>6ZPj5Wsr2Cu(4%sG*+BYq zZ4%|#B%Yt7nlLD-%&#oqZ<{%zEZ`{(_=EA4r7Npumd>1EjfSih+#Wp!s?)+nOSlCy z?jj4$XS%-QMk2Z6MAhJpfdf{l3F{n3~=;`BvaonEIu=nDaLuXH(m z{-7)1zuODe;~h+_!yj()cDlo>7-B8nSeer0+E8a_ zL(m^v>NHEvfl#)OK`7l=nYxKNEN67hQ@JiavM@ANVsImf#ag>ZU3u9yq{ zZHq?rM*=A5q0$E3onbxaaE#$0x;j1ZkpA+iBxgBi;3Y&)mtIFR*}}$#np$UtwZdA- z%0l5-nLFZIRTlP^x!vBf@|jkf)yAWEg*oY!=5L!_7QL&?2S>+Hys~K6EEWO(soK6Nx%PYApk1G=Ga0N>vc$%nG>2&K1tF1IP(;KL$Tp78m z-Pc}u$IQxzC(;p#PQUYx@QlzM(P&uDH+%6)-2qpyy=`E)Rz83X1n|doVsb zRl@TtdEr*bfe|zr@?kkKqFTh;h`&3P(i!$U$)_CPo+qR|7I1b%T`74FH^q}zhC98T zsf0V@E>EW`){z1_L!LNX%gs?vK7T4o*+^MlDqkuoo{G7n@vzP?wsL`n5kQ~AiJ^k? 
zYYn(@UUI(yH*-n-w$*^kIS{2hY`|$=DuW!438OJchHjU}qlB06n0~8Trt!!y+>QBG z8*pR34g+q?x7mOj=YuyHaLPl5Wd>ZnJ1DKgfMcLdg|GoHNTKkv0UvI_*BWp!)#a)8 z8}LGd{8j^AWWXOb;CKV{q=M6Z`HD%~F;G-o7y3c@* zG2jOb_*ese(12fTzz-SlaRxkTz{eZ#V+Newb26MU;1eWB)TPm#(KDB8FyixF8WE0* zr4Q#VE{#)?mrEf6#t={L=#@d(N5G0w7SE}bKMk`D2^&82ZlxN&|?Gl{{1 zBEbIswQG!*l$L#^e+^^COZFAnH^`3hO6Qc+{7Ciz*>|X%dRU7b2Cn{oN{l;nf<7Xj7uIr+Bn0ka|S{TU1-yn1)Y@0!+&T=3u8GxB?RukLC}C z<1sDlqKOPiTdA-$fJ}5Ylc~fx8Vu_n@5?j#VJ~c(- zYKpX)qG>fnHhWRk!lJe2u3?+Z7~PcXD6-v`=P0V`8rD=) z9pRbTsWc+9nxcH}%VyYl9CmWZjv;JK3%;%8M;a2Z-|k4f@96sQSaWOR=G+dvJ{+41 z-^1I*v3c%eD0lyuMUxpjxx|t9NPRB104`Ub>vnADj!)Z@`#Bng;@Frw2NHl{V;!3- z^C1D%E06Z~CvSzdK4t4|yM^{O1KM-;NF`q+S{fByWY4?dn z^{PESHQaZUprh*2BPSf{Rh)TCU3!X`Q%L>`N6x8B`w2NmvcH7-DN;XGr;dJE9eo;N zz2{rmlhUbE!^xw*M&ts`Bcyq>P93`k+JCaEqt7CK+OAGJrM8{~Nvf@%_|z42p~9bj z3VIUsbUl)Mb~W;A3xv-3)X3Wws^;7YRA0OL4%PLYPa$=V;KC1kdF{{r8Qi&ksGNZi zINZ*XDY&Z82Gx4CwHKNzK89T15y&026JjSs?E^FI7>T|Nb9VK{&nTy}5PkP3$$yIE zQyi`%m{5BPZ9PFnuTxv$#&`cj2@su@g!w+I9}->$`0f!(NuoXt3PPU0;E^<9*AzH|RC#Q-d#(XwSzMUt;w2j{cs$=XV?1N7rn&+F#z?ceOfk zEwvYQ$9tA_uM1VmrKhJCsv{OS`nw(O0+ivN+3HV6EbMs?B@gaLbb%i2IicRf!&vf9&~n`+32h*}Zj%eX0FM`x1L=i(}m<4s`m4#OcHfCC?&jN8;HM zD6y{oTy?`Y(2pB7uE{SywS%Ug$tN(`Y)HHe$ReSC1LR2T1~g7WTe#F7K$?V>1G0CW zHL06am~?D-N8OT#NYjQ>>Xx(UbB(i3sNXn=NJC<8{f2&ZOOiU`W>KWgdXbX#loyyy zD9bKfb;(+RJd;A~KacM=*Z$t=7fJ|Ayq&ZG?RtTuEI!?_ zaqiw(C|KfTL(eCpcRpj$SYzV-^6uoHA%?`Kk+}0aDf9!NjfvCAKG255OAUNn`H(8a z#})AUxU!kbkw}dzBmRhfg8uT-VNS<9rqjz}pnP2U91$8+n-0gF8PGW^(WBY_$=@9 z8^6uFvT9jhQiUAU;<}PBG1COIjF@qPX(omyPb60DBaX5~WK#*vK~JcmZSndC`%Dk(og%IiMI+mJm^tA;wY=}cYXto|0+jBJt*<91vvUSC2@ z3-6%SUmKXeeHm{p6WM+jW|EV^^#3BhOV4Kt?u^ zy=PYqngbfhb|v{bEcAYHw!gnI@yBEibY)qR^m3;9{Sg@vNNpe)5#SMG;?OPccC34e zUh%isZ?)fMzuoSpo}Z3CO865F_r6`PrAqN*!oCIB5$UCP4tQ_8zrPtSoAsn8)056p(tIS*BEr%2i7j!@Cyqq?kR!3=;NE&tWP9tWPlKG)$uT01 zu6qtLb^RNN_WqRkjr9i|8?kz!1x(kO+L*uVOq05SM!v3lPO|t_LSRQRke$c*liQnI zg(_gNQwZDSwXXUvls$Vn@bKPbgDzjaxBgg4etOqEXIR`O<+=S! 
zh5UG3eiW*ZlF!@Pm$Mb?CP!EO!Thd!zA&w^2wkj&)D6?&3vTghUT$?K@@Kog0GfOU zuH%KX5UA|})}P!1aO>9nyI*}ZZTSa=rjHGYw`m61xOvKtuySAG*j!4q#hy6ZkT}(N zC3!sG(R0?EY=K3`h9fbp{2jR-YE1kEFNeCsr}qBQzjJi$Rvfc_AOG0V^-}&Vx5;LT z336)vl$rmD-Sn7Xe`D@v=*_g!GxZ-J{oOjwV^lpo^jAi#=!jEdxT_PZ5hK2X5_=C( zU8meZJUO!W0O#9?NBvoEH|K9A9xi9S+d03Ocyt8TyN&Z-B_6$#^={>S4e=PlS?^lT z&n6y2KkE&1zJmCC;+Jv$D&lFh?rrA$<-}uB$$G0fe+luJ+Ol37=SLDhf_RPd1;o>* z-ChglO~e-w&p7`Vd-s6Xd_OZ+I} zw{!jt;%Ol5-NyM>h^J4py<0i|BJuP|ws$S(cMwkvzBkPI|0aGM@yj^>H1YK5wYQn` zj}uRGq26lFKSKNj;%%INka+rJ)2ng*o5bS_3F^=Jdx`%F@r?6#6F-soV;D=Xo~$9t zm|xHzmFYY=h{9=V2?&C)eHr}a)DjHoWf}bZ44yi-QT~<;K9s@Vox$Iq!GAA<|IZBm z*$n>W4F0tY{x=ys{VkY+@mo9b)PO0B8?JFm4Vc1HI^o~}Yi!%?se(>SFLL*cM6{HJsM`e!+juJKgH4%F(g zNrIR4paw`m{!WP+90i$94U)n>p-&Bvf=s6lNnw4t#%UqH<6EHMkM-;23FqnQXqvoy z-GWEYo)qJTT;eArB0BV8{bQ9vJe#kOzi5Fyw(D4-9!= z$OA(j81lf72ZlT_kKl+ zDE%GK#OiUEe+e|EV*dh9U!O{se;*tv#N^)x=?4Xa1-(2^B1x1-w6_Nb69%T|cRQ+} z%ReOAm;8I_whTVgj2(cezd~uU{_^kU&mqx+>nBK6)|wtbRFP+2NEt@X%Hi9Q@|8b9 zV#K3D>t?b(2{+|p{F;_sWbQ7~K(Q4sCl?!CSxyBp^v8P>KY1ZvSw{fERAE8S_gUng z!+gN=m1^8a&@W-ASh71x_k4gvQ+}=)N#@&dGaal0k~4$D!#)6#tLF@%5tMGpp&!cS zEr$SI!&rX)ouC)phg-qd=*DAoL((RGr_4-B9yO!k^Y9>*Y0D`3I1DLQ@#q1HUc;jn zBq>&k<{iL|F3RLB&=JEQM>Q1N%ZLx(P0@0S@;@aj&ei1_n1LM{%40=IlBqo>nnXKK z2J+Z5z}5MbXDt<(g8C3=soagkzR6XIeHfR8o|tmueB3|8Ra`FengWKx_aQyf&y=O) zxM5&MH3^B)$`aE(Y_!s9T8;Y+rZo^4Bh{%KY-}ZNbSc-0C%YOjo-FRwfa_4Offl6{ zE0(-sWu#&*nou;QXcXEle$Zkrnm>X-vQn%79!|*?{K&vG1BtVTUpKsA)co-#{5rr~ z!iEo@H{6WopEH#;jN&THlD8ypz@pIz%nsv*gdTo5V;-BbnKF_)U^Y)t8b&EFJZut- z=c~w%vbkuCa>aOp3vzk3!;K}IgWng77{jgrDYUZrE(A0E&kRP{iib*ndHb} zo~4{LTdtn3UH8>VlOj>GONVjj8A9E_PqYWyBE z2j`eaes$rk%#>sPh;DlyG))_CF0k?v^C9zs>mMAbza06^S11ndaMWFn{^qL%CQfI- zPHsAD!H;tQs0vOT7~opP&jD~YT#sV`JlKnfN_&{Q!xdp%hVgR>@W$0@(cSrLR4eU_ z#e?3t?cSg_;&+2}Mcf^8^&@Mv(sus;`U=|1>33b*DMWq zgYB`7xi&!Y;9dTpr!?kj!)A5xuBbSkAry~dSAKdT8X*o5z(gb3QHmX+F|V68Npi!H zc5f+mVwPgpC8UBevK7Hm5U5-`iW=(xud55iPy{8#=>#aaFHGA@sRUFWZWAXJ#3HNa zme0JVd^RzicQsd}-930JHAHG5&KN7}2z7ePTy1fGz*80pcj}RvB@K;r 
zW$ie2z+GBlonBssHps*Y6fnC&jsZ?*IO5G{l;@X+I|f;%wO`#^yhib0)ZZTT(gt1K ze?yJ-!W!-1t&+FqdU!ki+2{_(KspfUC#V!8B1dI8I?2&kj!aTcQeAsMU+O(7<{wh; zKLjuJB(;c0wV(=HBBS9HbeoLE1(d^1C`*1Im=rnRR%^9Lj0mc*+hx>ftMvkG-5_nn zQnt`}^kB4g!+EysLMo6_q_=ijkRlF&nwaeeGVT+Y!dlXF{whP0fE@N&>bXi`T6GMD zV>&V@emQW3jbQf|Ylsa5K10XUSFS1Mi6o*5P9CauX&j+)P*{YE@MSIyF@0u9^y|1V z-%0(hlgdY787>p-%ijSnCJ^BcTx4gPOGAv}dRWQhBU8WsOxb~eVj9cj_W+5nf&5^8 zpcTVl_(RKwu=E zSLQ0{w*r@IQ2f}Nhxa6&u{ion;N4R-F30!jBX$ax#>S#y=r~Wr?P8R(7^gSNg{t%? zZ7mupg$%R`xMdJK^}scQoQ!q?r#_Gwy0YL8X2G8ZK3I8Zr92qklLgUHd})zA7=CFMyetboHw(Uq?KKTpsI)VyXH_t|s*_o+nEQa?Bd@u7nhy)m!-*&dSvWrp%1P`U-+s(`IZuXIG~mllb-! ztmrv~I>XKo&W{VZXlZ1nb&lT~rP-cb1Ucm*2x~+=k@WS9agZFdx^b=@l3>x>7H>!1 zbn*^xo#iL|$)vWnhFRWzpB3K4iV(De&e!8Hoyvtn#tH~4mNuL- z803qgdWUntVtZ4)v$3J2)k$X%dP4fDR$oA1^@X*I2glnvouRfnu!zLjYn$xHI4PCV){Z}7~vM>v5ZC=8D})|jN~T*Pc-Dj!FwL84FBN| zZ*<;VK7%T+w}?7;xw*W8&OU6T{t+vm8DJ2pbmIuBP$$6hna*guvw1~}v)SKbbJ97A z|NjnubUH=Db^1aPw2lDvIMgKVM94vw+G!Adif}s9ds_XXI`p<|t)6SteSML=siD?5 z_Wmt#8rJE;Ym5hZUbZ|(Tkhi$$y`>(iPFZ*VYsCJCX)tk;2dY2;}E~e`JqH;-7_4Q z)R+6&whN#e?-dG^2bEL4zwxD(hl#jj4IGmCa{pL3Pfu^w2@_^g`AB*Ca7ug4B;TE< zagA=FU(LCJI%bxi{xERLpwEVBx&{>(!KRZ`{<8c6=6@NZvdR4Aem$Gem;3dkym$>u zmNg+pWs>@G|KT`tD#BAbIWK?dzokeg{Td-B_ak--eW^%g#7Fq^{O#QF*ivMez2r3#cX1oE6*KoO&8~7H9~h~=*xBT{l&Tz-??m1$n^g@p)VeH z>hF2-T=xe>J*0`u@_!>kUw*$?KS|f1_H7KA?PoI*ss1v5`Mu{cp-=tG7&7(0g+!zN z_HvDjZx{L&=~!VTv;4GvrYg$*(Os)?$?nOVq@K6V4LpD8$0v}W;*$Du-MjAs^|wQp z$|UvW{?h{&sJ{!kdX8KMS5ob`A{~NUt6Ye zNv&E}7)ObQkf~39J|j%}uiB+?=_)%XbpCZ((9$^kjSc#gg)IY@`rpsMNtS}76NSEx z4%`pfMO+FBR{BFqTC?fb=n`v1`;qb)=Gg2k`pfEd4LWnt7&2Jq$fAF2fv)v#hC&9* z==%|2nf`mtq3gG13>Z>hQm4>2mXYE%da)p^<@bSGrvK>sjI=c%g?;HlpU%@ah76Xi z1d!okE`E$YSR!bmotm9 diff --git a/rccl/reduce-scatter/reduce_scatter.x b/rccl/reduce-scatter/reduce_scatter.x deleted file mode 100755 index d2657f4967ef5b24773d2d3a26a0addf76226bc0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25848 zcmeHP3wT_`b)LJc7qX?hvW;W&h%ZY1_G?F_8~23AMC@} 
zw0%$=Ip@sGxifQTu6AZ!i)XP#QJ4%Cb~z)ie5S^s27YI3)#pV7q=D75 zQrs_Km8=MS0mqE=2Bz_dqz;RLOIl*E%nC|+F_9j#Xgo#Ib2!JCq_TuWNpB=*A}Z)) z!Afc~c+xARsKjS##4#r5ULhwbGwD$s_s-NRFw9|l@q|%4iK#$7#X#?rg zwON#Bvv_`zYQmtTGQaBHP*>gD>Rx|UZzz&lRkf`>S{ zf~QR57Z#}u`TQCfh16N0GiXFPBzPh-aS=TW7y8=;sqNIbcYhjK-?RQ>*0Vm6bVLw`c~=Q zU?`&ZhVBS}^#{fh>j}l$0%2clHAAc;kZcaD4EX}YERF{_ZPR;uqdr2G$mFJIDv}gP zOFSNpcO>JXNH-yde#h#FuO}XjP|gvbuNRujBB9$-fmXld&3Yu#6b*-4BcbF{J#s6T zSsL)Ce1Q(1o5qyfyY3 zdo8Pu#*)>(xW2kN7O3|50@cnsyTk6_QM|^y^jZsb&96?}ULAz9lPFnrBBuLptqMd| zCaSyiL?COTD$yHAIBTo!2t?+PfhuP$x8>L4i5@*t6~_}reX8cz=h_`r$+|#qP3@}q z?cKrd+FR;sh?qsjGC!uHM{oE8!UKI-m z!r6q|Qo28^CwsC$IOoMUNaI+z1!b`I#))yvxf(c)5!o`%A zr`~PC%S`fHO?bHpzu$!8g^&#ons71Y=V_0ca51Ik_%;)6p6@(q!sQZ!6b4NA6mG$| z?J?nJn()0Qe5wiGXTm>Y!uOl-X(l{l!l#??!zP^GcQTwX;b%*bsIJkS(KDB8G2%;f zjR?o((ua7Pt`Xt*XL-!HP17|VAv-g8%(!W~#v_CqG0w8ty1@~Co&oW@L)SPZe5`po z%`C(L_F({F2Udk5hq27RhzJ@oXhSzK~uFXo!pXJvsMCyg5?5d_F&nSFW0^q1TU=DU#BP%g+ z@oS+-ES1z^I!$aa519%D9x-?bUHU zWAP@8DU02gGSzG(JEMMWE|2HxCU0AttKHk?y3V_Z38ZsrYn!*zy|ksneYK|x0d#Cs6#fulA1)ObN(2j>ouJIrmV6lXHP6H*$SM-DC&pT!hRJfYb>@A@4yuWb~$0L za@$>&rm~{dPy$QoRK6`pUkaK{x3qBqvKhnm9MU(64nf#PTn#8l6qJGo*ZsI20CtEB zBcI0dX^rJtWBKgHa)+zDeo6UyYhUr^qAi8@6nx2gm&H0%SYGcccW?oSLD=RhFL6Q4 zC>Qx)0S1r;)Gb>s>jH%e)(ofw_$}~JH{z_h-1a34jND^wDc80X!06^ece&%PB6oRx zUvXP`L!4*kqSA=W8p}(#FWX_~A=r`qnaokxIw}OWIuEp_U%uI${;j+3?ZfSzo-Ktv zc#XKXl)Z(wjC;$X2T<k6?ZMq&xDXP6 zl2hGVYEJ-z>O}{Khch?A+K{sKrkz6j96T_gy>M4y8934|651-_T??U|-kqtTa`YD- z!Kg=uRakue04!cCExts--FM2O-nkl2(Vaf*emMi)mMC_g-t^NN84PpZjA zE$Z@tBOdkQT|qTAbdaEf>hc3e-0H=gc}-n@jF@9c{u4(|smq56IYqKRh59j4Kh~^H zd0w4z9Abl~TiKP>X;x#IgFz2+f#w0yJlL#G-39FrUFwvRh#z;UvyZ8rM?o@b=lel* zC0(fSp+lfYL65f}DdqcJis>(P)Y{%dB=v^ySUwY z@QAwZ(&g^HXG@Uw>aXbbD&1b9+jH)DFRPnsPYn;d=j};0(UZi3*%4INKAV2+_-n^^ z(c3Cj7Tokoa_WW`>2)_$sy_8w+lE)@H5pVR&ywhl_iVxRl&jstJBFUzX>K2V^V!;v zb71Hab;f#XFX~fo**3f^R4JDoI|iXTaj|=N!0jtV8P?2KpP0C0$6F|QH05910gPUGRKDprqUa-e@Ze8YD?z+~s%+=Z9-tfK~oxU}FJpFX# z50JGx{ewyE85)dQTA|ZJUm%_nbtZ^UOP9z-3mV4*eUs_BW0{T}fE_^^5~( 
z-_smrsX6ZcMNiB_!O}-tcf3F4na6Az^Q3?49LW3;Vo3ZZ5})~M7X2MiPx^Rf2(&f* zTq_?}-lhuiaRt0Fu56)lq_gA7#NVTzpuar#E~jTcV9;}upnP2U2oV}r5|lvW%8L}C zaYdcc{C{vf`SClb-s$7XeFV4A2$Fw1*$KR&YJVmeh%->=1@Fc|s8>#j5FRFu3YyW4S>4Y3wWOp|Ge4bu5m5gjv zVWc{1qzgisa8~jJ?9m9;{1=R1uJjZS4Pj4E|J;h4Jn3VZQcT@94XB&mrtS@#W_WqV;6S=a*$nIlM+%ronOJXNMJkMjlBlZZEG9u~Pi z4( zGe+cr$`d@Y?#4<7UKO2{T2QUwlzu*2KgagT zc!bUAgLsO6fWr0xJT!OuH}DFOHwihw1x|t83)VTD`413`l50QbN&i0cD==>1&CEKi zy2!=*c`GnnpSeYVGyrD8WMaU*;W--WKkvHHb(8C6m-nWf9^ZkXiRz37=2qT-?R5`J zn{GIhKf(UZyjt$v1(m1*YV_WZX#U=Ua@{R**aMrH1zw z4ty{*%nLq%7H!JbiXsi(U?lzR=%fNlx&leGig5S6??|tC-okM(Y?<{>ShX*6Z>b{M;Yy!u zO&{}IL=G=;?>K4Abikl{(}ASsd_%5>Jn28-<c#XaxWsrTG{&y{@s zCOJG|f}CAH<>r56Cp{(D?=SoVx*o0cEW>+Af2V=-7}d@Q!?j&@be>s$+-bZUo<)2Q zB@XVVs?NHBc=F@mUd}Hi9@S@q1DwB-czB!*ZsU9-@#qI^@IlVcCm!9M4Q}Op4e=Ou z+2DH4UraoG-WrT?{(Ry~h+o0^vxujmb+DcDlZeNpk_|R+zLa=OZP}oM^A_UiQ}3Y0 z`9I+^M-LX?=--_GfcSFa8Ry?4o<8Xg9!CEkp7mSeZN%^A{4a^8_BFVd^REz(=ZX4r z{zc*^6Tgk~&k|3g-r$3re~NheL_4^Z^Z!CTeVQFy&-s5Kp4$6hjPnl>KaKbmoc|{A z^a*ybo%8n+PjjKc2F`zx__K+3aQ;r>>66W%#`!ylKbLqL=i|hGmUzbbTZo@Q{Nas!;oEX}Zw{Zx;n(Kyn{)VY z=kWiS!#|nBzmUWKJcs{v4o`m@reOZ&PCPYW3fcsXQ)<8zXzD|O8ZZTVuoUQlQ`jTq zsYCG~@)LwM#MgQ6hwp$W4IsgS}pE?``N3q5!br=fo3BwC_-8>=XLb04d0H>W~!fb84KH@jGLW3I6b~QJz?l zk&dRx%Qql+^i0lY{Ur3q$+!o`JuvQpaSx1pVB7=a9vJt)xCh2PFz$hI4~%shIeC&U!)Re)Zg3UlZx?3VKY?q8g2h zP8IY5LF)uvEa);Q|t`RC2i9eRy( zZr5djIl-X6HmK`$^^UoYxwZ9mf!g|u@Ux>(urh|hkJ5bIdh(W1dpYSl&BWp->NG(D_K5jXzkCjOE))9}v% zlZ-h^fA_Pn7To1u{>-V2E#&DNv+45hfs=%o{Cgn%fMBd(l;;s73Hzem+&h*qFeAU4 zQ3XT(5g{-CKDsT3&o$!$AnLF1G+BQ^A4j4e*AJ1Ztham#QAM78A^(=n$XNw^8&Zk# zAtWY#OK9Cd*3ZLD`3!zN%g(UwE!RM?1+SnGn_O8z4Keh`dJ8{!p+wn00Kx^rf|2hJ zkb40O0xwY-aGyv&e4%2=?kL^!J3y9_LMxK2H{oX4*9@d!E{BVM2c*!*8A1~&-BxfG zNYSkjpsSdbl-ve-#$C9TevWQDMmHpF=6A}>q7+dxns7NDq%wO2MIVA8kM2|Fq3q$vN>vEseQseu{Uk)b?Lo*|j0a*@!@ zlYu<&7;yCp%Cm`zOhNq`XQ|vCVqfK|#NLm~Mo&z+_6pqJ##P}GCTD@6@Eu4`3Nd9l zIgXAln0%9vn4&DRtYK4>PRm-{ud%Fyz?o8=%E6}A;zpNpnRv2m0prQy-T=561siEm 
zN`+!8s!%2=*7CEt(X9pOe!Y9Nk}O%oIxp*(|0OJ!{(?j-idvF)4tpY|C|YsRVYq~ zPo!C0%a!5SQ-6Lhb)(6ErVwv?2Whkki^){xGao@ z!T+)VzIZY;lD`~jtvi^~8lQiu1xFw#>ajidRilqP;{VdVSt`j+oDoAxI5f>-0 zbo+dA7kDC-2pC(yt1qh#MSQ&}e}J(-Z?I~l+}Nh?4+QN!(i^gqT6Z$VI%$tJlC)0J z_&sTjEVNF#a>@iqalVPk)0 zRc|2Do$OiU0F;W{9*X#@l6n_5s)N@P;&_H=DuEsOqZ82xaex3O8i}4N>0BciU)A`1$vU{Ig@pBW9_91 z(#w}eMsqlT0#&BXp`$xHj&0S9iw#qEJKrk0``aObCtrh z>KF^hbYx8YeBcV3$nLMu5E}`6hK{MPTvJ>wl87oed8k3xI6~#1a1}1XSLhmI#>|rF z^|&zK$^MR$%12=ZE(`3--~BEn5aAYFWamL$LyY1^SjFR$vcLC~b^;JkEK`~M9w6~` zkRQtrU(SPni`$=^{XMF*^X)wHkLJO@YNhgw_*1Sc|C#J#QfOPDAvO|1O!g9ZVTo3yeOL_Kpuu_T653^X*?C)VE{ykp))64lY z2u#BB%3TG$DR8+4#gDsrcvs?1jq`GGbWGrTFEDVvFCW-PxHL8u4a2~}@^B8yS%K3V zza&0&W|_4xO*2ndD?N4xIWxZn!%S{`EZg_koX99@<+s7QQPF{?k18 zzC8GQdGL?(;1yWyja8oW^5Apx;4OLZuem;~3|2Pj$d$Y%~ zQ(dj!|j(bx!Y!4PHhIA4Lrt3PX zidd{SiCC1M&Snqvi1YD!{E(C!vnKnAF>n;=st2LnZvHk7k#YJ|sMDmWT(2Ys93FeQUpR7oa zPafd6#B+6{%R?vbbhLDOJ6(;Q7Swa}YF-%O7UkKDW*a#NHS&z)cLIMR>cz=>ek>3F zU!36Ry}>z`9AGq!W_Y^YSwjaPc2P%3I_r8FL~4CFdny_R=&bW5QsMTM9p3g(hr>&U zDgN&}0n+Ogjn^BD#?eB0sq3L8qmG3fW6eDZqHhvj@8~Yqa#b_BUA~sjHJY!#%GK7| zWFCb7f;bKDbm29}gFH7|o~JGMZ;50sE8|3IKU*;_slVBxfg3r`+2AdSqH8aWk#)?ss7y7u zD9H7|UfNlUkdOs4Sj>(Sed=H4 zkgNX{B%1XHoEjG&5c)RhSYaf${PzQ=D$4dD_jSs1<8%Ebc}X9Ju4>Yk>)(Ad0U}5_ zgB#%`z6}XfCYk^KCXI{k|AhM6Ax+Ot>dSSuJTL!2QGd~$vgMI}*aKbipVXJn$Z;{x>@a<}M9@UXrN6kJjZ3ya`TklDecAR%UIv@^)O&AEhB++zVjlg6 O=*5VzH^+Fc{(k~WGn2>w diff --git a/reduce_scatter.cu b/reduce_scatter.cu deleted file mode 100644 index 99fc950..0000000 --- a/reduce_scatter.cu +++ /dev/null @@ -1,269 +0,0 @@ -/* \file reduce_scatter.cu - * Copyright 2024 Parallel Software and Systems Group, University of Maryland. - * See the top-level LICENSE file for details. 
- * - * SPDX-License-Identifier: MIT - */ - -#include -#include -#include -#include - -#ifdef USE_CUDA - #include - #define bfloat16 nv_bfloat16 -#elif USE_ROCM - #define __HIP_PLATFORM_AMD__ - #include - #include - #include - #define bfloat16 hip_bfloat16 -#endif - -#ifdef USE_NCCL - #include "nccl.h" -#elif USE_RCCL - #include -#endif - -#define NUM_WARMUP_ITERATIONS 5 - -#define MPI_CHECK(cmd) do { \ - int64_t e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%ld'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if(e != cudaSuccess) { \ - printf("CUDA error %s:%d: %s\n", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define HIP_CHECK(cmd) do { \ - hipError_t e = cmd; \ - if(e != hipSuccess) { \ - printf("HIP error %s:%d: %s\n", \ - __FILE__, __LINE__, hipGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -// NCCL_CHECK is used to validate RCCL functions as well -#define NCCL_CHECK(cmd) do { \ - ncclResult_t e = cmd; \ - if (e != ncclSuccess) { \ - printf("NCCL error %s:%d %s\n", \ - __FILE__, __LINE__, ncclGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -void initializeData(bfloat16 *data, int64_t size) { - for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { - #ifdef USE_CUDA - data[i] = __float2bfloat16((float)i); - #elif USE_ROCM - // ROCm doesn't have a float2bfloat16 method - data[i] = (bfloat16) ((float) i); - #endif - } -} - -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - bfloat16* in = (bfloat16*) invec; - bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { - #ifdef USE_CUDA - inout[i] = __hadd(in[i], inout[i]); - #elif USE_ROCM - inout[i] = in[i] + inout[i]; - #endif - } -} - -int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return EXIT_FAILURE; 
- } - - int num_gpus = atoi(argv[1]); - int64_t min_msg_size = strtoll(argv[2], NULL, 10); - int64_t max_msg_size = strtoll(argv[3], NULL, 10); - int iterations = atoi(argv[4]); - - if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { - fprintf(stderr, "Invalid input parameters.\n"); - return EXIT_FAILURE; - } - - int my_rank, num_pes; - int num_gpus_per_node; - int msg_count; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - - if (num_pes != num_gpus) { - fprintf(stderr, "Number of processes must match number of GPUs.\n"); - MPI_Finalize(); - return EXIT_FAILURE; - } - - // Initialize GPU context - #if USE_CUDA - cudaGetDeviceCount(&num_gpus_per_node); - cudaSetDevice((my_rank % num_gpus_per_node)); - #elif USE_ROCM - hipGetDeviceCount(&num_gpus_per_node); - hipSetDevice((my_rank % num_gpus_per_node)); - #endif - - int64_t local_data_size = max_msg_size; // Size of local data - int64_t global_data_size = local_data_size; // Size of global data - - if (my_rank == 0) { - fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); - fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); - } - - bfloat16 *local_data = (bfloat16*)malloc(local_data_size); - bfloat16 *global_data = (bfloat16*)malloc(global_data_size); - - // Initialize local data - initializeData(local_data, local_data_size); - - bfloat16 *d_local_data, *d_global_data; - #ifdef USE_CUDA - CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); - CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU - CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - - #elif USE_ROCM - HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); - HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); - HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); - 
#endif - - #ifdef USE_MPI - // create 2-byte datatype (send raw, un-interpreted bytes) - MPI_Datatype mpi_type_bfloat16; - MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); - MPI_Type_commit(&mpi_type_bfloat16); - - // define custom reduce operation for nv_bfloat16 types - MPI_Op CUSTOM_SUM; - MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - - #elif defined(USE_NCCL) || defined(USE_RCCL) - ncclUniqueId nccl_comm_id; - ncclComm_t nccl_comm; - - if (my_rank == 0) { - /* Generates an Id to be used in ncclCommInitRank. */ - ncclGetUniqueId(&nccl_comm_id); - } - - /* distribute nccl_comm_id to all ranks */ - MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, - 0, MPI_COMM_WORLD)); - - /* Create a new NCCL/RCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - #endif - - // init recvcounts to send an equal portion of data from the reduce operation - int num_elements = local_data_size / sizeof(bfloat16); - int portion = num_elements / num_pes; - int *recvcounts = (int*) malloc(sizeof(int) * num_pes); - for (int i = 0; i < num_pes; i++) - recvcounts[i] = portion; - - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather - double total_time, start_time; - MPI_Request request; - MPI_Status status; - - // Print benchmark results - if (my_rank == 0) { - printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); - printf("Number of iterations: %d\n", iterations); - } - fflush(NULL); - - for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(bfloat16); - // warmup iterations - for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) || defined(USE_RCCL) - 
NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - #endif - - #ifdef USE_CUDA - cudaDeviceSynchronize(); - #elif USE_ROCM - hipDeviceSynchronize(); - #endif - } - - if(msg_size >= 8388608) - iterations = 20; - - MPI_Barrier(MPI_COMM_WORLD); - start_time = MPI_Wtime(); - for (int i = 0; i < iterations; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) || defined(USE_RCCL) - NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - #endif - - #ifdef USE_CUDA - cudaDeviceSynchronize(); - #elif USE_ROCM - hipDeviceSynchronize(); - #endif - } - MPI_Barrier(MPI_COMM_WORLD); - total_time = MPI_Wtime() - start_time; - if (my_rank == 0) - printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); - } - - // Cleanup - free(local_data); - free(global_data); - #ifdef USE_CUDA - CUDA_CHECK(cudaFree(d_local_data)); - CUDA_CHECK(cudaFree(d_global_data)); - #elif USE_ROCM - HIP_CHECK(hipFree(d_local_data)); - HIP_CHECK(hipFree(d_global_data)); - #endif - - #ifdef defined(USE_NCCL) || defined(USE_RCCL) - ncclCommDestroy(nccl_comm); - #endif - - MPI_Finalize(); - return EXIT_SUCCESS; -} From f84dd26084d452099721e2aad433e2b275542dcd Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Thu, 11 Jul 2024 17:19:45 -0700 Subject: [PATCH 42/52] update .gitignore to ignore .x and .out files --- .gitignore | 2 + LICENSE | 20 ++ README.md | 15 + allgather.cu | 248 ++++++++++++++++ allreduce.cu | 262 +++++++++++++++++ mpi/Makefile | 30 ++ mpi/all-gather/frontier/128_gcd_run.sh | 21 ++ mpi/all-gather/frontier/16_gcd_run.sh | 21 ++ mpi/all-gather/frontier/32_gcd_run.sh | 21 ++ mpi/all-gather/frontier/64_gcd_run.sh | 21 ++ mpi/all-gather/frontier/8_gcd_run.sh | 21 ++ 
.../frontier/benchmarks/128_gcd.txt | 13 + mpi/all-gather/frontier/benchmarks/16_gcd.txt | 13 + mpi/all-gather/frontier/benchmarks/32_gcd.txt | 15 + mpi/all-gather/frontier/benchmarks/64_gcd.txt | 14 + mpi/all-gather/frontier/benchmarks/8_gcd.txt | 14 + mpi/all-gather/perlmutter/128_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/16_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/32_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/64_gpu_run.sh | 37 +++ mpi/all-gather/perlmutter/8_gpu_run.sh | 37 +++ .../perlmutter/benchmarks/128_gpu.txt | 12 + .../perlmutter/benchmarks/16_gpu.txt | 21 ++ .../perlmutter/benchmarks/32_gpu.txt | 14 + .../perlmutter/benchmarks/64_gpu.txt | 13 + .../perlmutter/benchmarks/8_gpu.txt | 13 + mpi/all-reduce/frontier/128_gcd_run.sh | 21 ++ mpi/all-reduce/frontier/16_gcd_run.sh | 21 ++ mpi/all-reduce/frontier/32_gcd_run.sh | 21 ++ mpi/all-reduce/frontier/64_gcd_run.sh | 21 ++ mpi/all-reduce/frontier/8_gcd_run.sh | 21 ++ .../frontier/benchmarks/128_gcd.txt | 12 + mpi/all-reduce/frontier/benchmarks/16_gcd.txt | 12 + mpi/all-reduce/frontier/benchmarks/32_gcd.txt | 14 + mpi/all-reduce/frontier/benchmarks/64_gcd.txt | 13 + mpi/all-reduce/frontier/benchmarks/8_gcd.txt | 13 + mpi/all-reduce/perlmutter/128_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/16_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/32_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/64_gpu_run.sh | 37 +++ mpi/all-reduce/perlmutter/8_gpu_run.sh | 37 +++ .../perlmutter/benchmarks/16_gpu.txt | 11 + .../perlmutter/benchmarks/32_gpu.txt | 13 + .../perlmutter/benchmarks/8_gpu.txt | 12 + mpi/reduce-scatter/frontier/128_gcd_run.sh | 21 ++ mpi/reduce-scatter/frontier/16_gcd_run.sh | 21 ++ mpi/reduce-scatter/frontier/32_gcd_run.sh | 21 ++ mpi/reduce-scatter/frontier/64_gcd_run.sh | 21 ++ mpi/reduce-scatter/frontier/8_gcd_run.sh | 21 ++ .../frontier/benchmarks/128_gcd.txt | 13 + .../frontier/benchmarks/16_gcd.txt | 13 + .../frontier/benchmarks/32_gcd.txt | 15 + .../frontier/benchmarks/64_gcd.txt | 17 ++ 
.../frontier/benchmarks/8_gcd.txt | 14 + mpi/reduce-scatter/perlmutter/128_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/16_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/32_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/64_gpu_run.sh | 37 +++ mpi/reduce-scatter/perlmutter/8_gpu_run.sh | 37 +++ .../perlmutter/benchmarks/128_gpu.txt | 12 + .../perlmutter/benchmarks/16_gpu.txt | 12 + .../perlmutter/benchmarks/32_gpu.txt | 14 + .../perlmutter/benchmarks/64_gpu.txt | 13 + .../perlmutter/benchmarks/8_gpu.txt | 13 + nccl/Makefile | 25 ++ nccl/all-gather/128_gpu_run.sh | 37 +++ nccl/all-gather/16_gpu_run.sh | 37 +++ nccl/all-gather/32_gpu_run.sh | 37 +++ nccl/all-gather/64_gpu_run.sh | 37 +++ nccl/all-gather/8_gpu_run.sh | 37 +++ nccl/all-gather/benchmarks/16_gpu.txt | 13 + nccl/all-gather/benchmarks/32_gpu.txt | 14 + nccl/all-gather/benchmarks/64_gpu.txt | 13 + nccl/all-gather/benchmarks/8_gpu.txt | 13 + nccl/all-reduce/128_gpu_run.sh | 37 +++ nccl/all-reduce/16_gpu_run.sh | 37 +++ nccl/all-reduce/32_gpu_run.sh | 37 +++ nccl/all-reduce/64_gpu_run.sh | 37 +++ nccl/all-reduce/8_gpu_run.sh | 37 +++ nccl/all-reduce/benchmarks/16_gpu.txt | 12 + nccl/all-reduce/benchmarks/32_gpu.txt | 14 + nccl/reduce-scatter/128_gpu_run.sh | 37 +++ nccl/reduce-scatter/16_gpu_run.sh | 37 +++ nccl/reduce-scatter/32_gpu_run.sh | 37 +++ nccl/reduce-scatter/64_gpu_run.sh | 37 +++ nccl/reduce-scatter/8_gpu_run.sh | 37 +++ nccl/reduce-scatter/benchmarks/128_gpu.txt | 12 + nccl/reduce-scatter/benchmarks/16_gpu.txt | 12 + nccl/reduce-scatter/benchmarks/32_gpu.txt | 14 + nccl/reduce-scatter/benchmarks/64_gpu.txt | 13 + nccl/reduce-scatter/benchmarks/8_gpu.txt | 13 + rccl/Makefile | 25 ++ reduce_scatter.cu | 269 ++++++++++++++++++ 93 files changed, 2842 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 allgather.cu create mode 100644 allreduce.cu create mode 100644 mpi/Makefile create mode 100644 
mpi/all-gather/frontier/128_gcd_run.sh create mode 100644 mpi/all-gather/frontier/16_gcd_run.sh create mode 100644 mpi/all-gather/frontier/32_gcd_run.sh create mode 100644 mpi/all-gather/frontier/64_gcd_run.sh create mode 100644 mpi/all-gather/frontier/8_gcd_run.sh create mode 100644 mpi/all-gather/frontier/benchmarks/128_gcd.txt create mode 100644 mpi/all-gather/frontier/benchmarks/16_gcd.txt create mode 100644 mpi/all-gather/frontier/benchmarks/32_gcd.txt create mode 100644 mpi/all-gather/frontier/benchmarks/64_gcd.txt create mode 100644 mpi/all-gather/frontier/benchmarks/8_gcd.txt create mode 100644 mpi/all-gather/perlmutter/128_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/16_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/32_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/64_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/8_gpu_run.sh create mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/all-gather/perlmutter/benchmarks/8_gpu.txt create mode 100644 mpi/all-reduce/frontier/128_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/16_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/32_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/64_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/8_gcd_run.sh create mode 100644 mpi/all-reduce/frontier/benchmarks/128_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/16_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/32_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/64_gcd.txt create mode 100644 mpi/all-reduce/frontier/benchmarks/8_gcd.txt create mode 100644 mpi/all-reduce/perlmutter/128_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/16_gpu_run.sh create mode 100644 
mpi/all-reduce/perlmutter/32_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/64_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/8_gpu_run.sh create mode 100644 mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt create mode 100644 mpi/reduce-scatter/frontier/128_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/16_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/32_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/64_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/8_gcd_run.sh create mode 100644 mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt create mode 100644 mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt create mode 100644 mpi/reduce-scatter/perlmutter/128_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/16_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/32_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/64_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/8_gpu_run.sh create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt create mode 100644 nccl/Makefile create mode 100644 nccl/all-gather/128_gpu_run.sh create mode 100644 nccl/all-gather/16_gpu_run.sh create mode 100644 nccl/all-gather/32_gpu_run.sh create mode 100644 nccl/all-gather/64_gpu_run.sh create mode 100644 nccl/all-gather/8_gpu_run.sh create mode 100644 
nccl/all-gather/benchmarks/16_gpu.txt create mode 100644 nccl/all-gather/benchmarks/32_gpu.txt create mode 100644 nccl/all-gather/benchmarks/64_gpu.txt create mode 100644 nccl/all-gather/benchmarks/8_gpu.txt create mode 100644 nccl/all-reduce/128_gpu_run.sh create mode 100644 nccl/all-reduce/16_gpu_run.sh create mode 100644 nccl/all-reduce/32_gpu_run.sh create mode 100644 nccl/all-reduce/64_gpu_run.sh create mode 100644 nccl/all-reduce/8_gpu_run.sh create mode 100644 nccl/all-reduce/benchmarks/16_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt create mode 100644 nccl/reduce-scatter/128_gpu_run.sh create mode 100644 nccl/reduce-scatter/16_gpu_run.sh create mode 100644 nccl/reduce-scatter/32_gpu_run.sh create mode 100644 nccl/reduce-scatter/64_gpu_run.sh create mode 100644 nccl/reduce-scatter/8_gpu_run.sh create mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt create mode 100644 rccl/Makefile create mode 100644 reduce_scatter.cu diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7882514 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.x +*.out diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9943369 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2024, Parallel Software and Systems Group, University of +Maryland. 
+ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..526fb95 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +Before compiling do these: + +### Perlmutter +```sh +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl +export CRAY_ACCEL_TARGET=nvidia80 +export MPICH_GPU_SUPPORT_ENABLED=1 +``` +### Frontier +```sh +module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" +``` + diff --git a/allgather.cu b/allgather.cu new file mode 100644 index 0000000..8c357bb --- /dev/null +++ b/allgather.cu @@ -0,0 +1,248 @@ +/* \file allgather.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); +
return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Select this rank's GPU (rank modulo the number of visible devices) + #ifdef USE_CUDA + CUDA_CHECK(cudaGetDeviceCount(&num_gpus_per_node)); + CUDA_CHECK(cudaSetDevice((my_rank % num_gpus_per_node))); + #elif USE_ROCM + HIP_CHECK(hipGetDeviceCount(&num_gpus_per_node)); + HIP_CHECK(hipSetDevice((my_rank % num_gpus_per_node))); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size * num_gpus; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t
nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, 
NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #if !defined(USE_MPI) && (defined(USE_NCCL) || defined(USE_RCCL)) + NCCL_CHECK(ncclCommDestroy(nccl_comm)); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..111b254 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,262 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL
functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Select this rank's GPU (rank modulo the number of visible devices) + #ifdef USE_CUDA + CUDA_CHECK(cudaGetDeviceCount(&num_gpus_per_node)); + CUDA_CHECK(cudaSetDevice((my_rank % num_gpus_per_node))); + #elif USE_ROCM + HIP_CHECK(hipGetDeviceCount(&num_gpus_per_node)); + HIP_CHECK(hipSetDevice((my_rank % num_gpus_per_node))); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t
global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + 
#endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #ifdef defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} diff --git a/mpi/Makefile b/mpi/Makefile new file mode 100644 index 0000000..12ed3bf --- /dev/null +++ b/mpi/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh new file mode 100644 index 0000000..4e8c955 --- 
/dev/null +++ b/mpi/all-gather/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh new file mode 100644 index 0000000..bb2429f --- /dev/null +++ b/mpi/all-gather/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh new file mode 100644 index 0000000..e630b97 --- /dev/null +++ b/mpi/all-gather/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 4 
+#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh new file mode 100644 index 0000000..e7c707f --- /dev/null +++ b/mpi/all-gather/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh new file mode 100644 index 0000000..563f933 --- /dev/null +++ b/mpi/all-gather/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs 
+export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..824b380 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10 + 0: Local data size: 16 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 262144 - 16777216 + 0: Number of iterations: 10 + 0: 262144 0.003748 seconds + 0: 524288 0.005048 seconds + 0: 1048576 0.008068 seconds + 0: 2097152 0.014084 seconds + 0: 4194304 0.026981 seconds + 0: 8388608 0.051879 seconds + 0: 16777216 0.255600 seconds diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..35a9e26 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10 + 0: Local data size: 128 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: Message size range: 2097152 - 134217728 + 0: Number of iterations: 10 + 0: 2097152 0.002249 seconds + 0: 4194304 0.003148 seconds + 0: 8388608 0.006062 seconds + 0: 16777216 0.011871 seconds + 0: 33554432 0.023485 seconds + 0: 67108864 0.046822 seconds + 0: 134217728 0.139763 seconds diff --git 
a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..f758360 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10 + 0: Local data size: 64 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 262144 - 67108864 + 0: Number of iterations: 10 + 0: 262144 0.000783 seconds + 0: 524288 0.001513 seconds + 0: 1048576 0.002953 seconds + 0: 2097152 0.003404 seconds + 0: 4194304 0.006485 seconds + 0: 8388608 0.012489 seconds + 0: 16777216 0.024484 seconds + 0: 33554432 0.048460 seconds + 0: 67108864 0.185884 seconds diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..3eed822 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10 + 0: Local data size: 32 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 262144 - 33554432 + 0: Number of iterations: 10 + 0: 262144 0.001685 seconds + 0: 524288 0.003350 seconds + 0: 1048576 0.003938 seconds + 0: 2097152 0.006864 seconds + 0: 4194304 0.013037 seconds + 0: 8388608 0.025167 seconds + 0: 16777216 0.049414 seconds + 0: 33554432 0.211224 seconds diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..7856a16 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10 +0: Local data size: 256 +0: Global data size: 2048 +0: 
Number of GPUs: 8 +0: Message size range: 2097152 - 268435456 +0: Number of iterations: 10 +0: 2097152 0.000505 seconds +0: 4194304 0.000856 seconds +0: 8388608 0.001645 seconds +0: 16777216 0.003223 seconds +0: 33554432 0.006379 seconds +0: 67108864 0.012691 seconds +0: 134217728 0.025316 seconds +0: 268435456 0.053944 seconds diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..30fd2fc --- /dev/null +++ b/mpi/all-gather/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..813b192 --- /dev/null +++ b/mpi/all-gather/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH 
--gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) + +SCRIPT="$SCRATCH/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..aad7f68 --- /dev/null +++ b/mpi/all-gather/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" 
+run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..4897de4 --- /dev/null +++ b/mpi/all-gather/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..3a454cf --- /dev/null +++ b/mpi/all-gather/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) 
+export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..3c16468 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 16 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 262144 - 16777216 +Number of iterations: 10 +262144 0.003218 seconds +524288 0.005240 seconds +1048576 0.008649 seconds +2097152 0.015703 seconds +4194304 0.030562 seconds +8388608 0.060407 seconds +16777216 0.190813 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..9dc96cf --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,21 @@ +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): 
/pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +srun: error: nid002072: tasks 8-11: Exited with exit code 2 +srun: Terminating StepId=27970493.0 +srun: error: nid002073: tasks 12-15: Exited with exit code 2 +srun: error: nid001572: tasks 4-7: Exited with exit code 2 +srun: error: nid001569: tasks 0-3: Exited with exit code 2 diff --git 
a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..754e581 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000744 seconds +524288 0.001397 seconds +1048576 0.002723 seconds +2097152 0.003728 seconds +4194304 0.007619 seconds +8388608 0.014516 seconds +16777216 0.030634 seconds +33554432 0.063410 seconds +67108864 0.172556 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..cd13b86 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001523 seconds +524288 0.003143 seconds +1048576 0.004237 seconds +2097152 0.008015 seconds +4194304 0.015194 seconds +8388608 0.029697 seconds +16777216 0.063139 seconds +33554432 0.184281 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..e010f99 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000888 seconds +4194304 0.001690 seconds +8388608 0.003195 seconds +16777216 0.006815 seconds +33554432 0.013828 seconds +67108864 0.028031 seconds +134217728 0.055406 seconds +268435456 0.104231 seconds diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh new file mode 100644 index 0000000..5c6baf5 --- /dev/null +++ b/mpi/all-reduce/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch 
+#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh new file mode 100644 index 0000000..e1ad604 --- /dev/null +++ b/mpi/all-reduce/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh new file mode 100644 index 0000000..be7bdd9 --- /dev/null +++ b/mpi/all-reduce/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt 
+#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a8e13d2 --- /dev/null +++ b/mpi/all-reduce/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh new file mode 100644 index 0000000..81ffbc4 --- /dev/null +++ b/mpi/all-reduce/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine 
+export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..56c18aa --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.240206 seconds + 0: 67108864 0.476990 seconds + 0: 134217728 1.041500 seconds + 0: 268435456 2.951969 seconds + 0: 536870912 5.990606 seconds + 0: 1073741824 12.004613 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..609afbd --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 16 + 0: Message size range: 33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.133082 seconds + 0: 67108864 0.267616 seconds + 0: 134217728 0.634895 seconds + 0: 268435456 1.928400 seconds + 0: 536870912 3.973167 seconds + 0: 1073741824 7.913018 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..b92c437 
--- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 1073741824 + 0: Number of iterations: 10 + 0: 8388608 0.043066 seconds + 0: 16777216 0.084259 seconds + 0: 33554432 0.167705 seconds + 0: 67108864 0.336696 seconds + 0: 134217728 0.773389 seconds + 0: 268435456 2.284815 seconds + 0: 536870912 4.693147 seconds + 0: 1073741824 9.356859 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..122c83e --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 1073741824 + 0: Number of iterations: 10 + 0: 16777216 0.101777 seconds + 0: 33554432 0.203258 seconds + 0: 67108864 0.406569 seconds + 0: 134217728 0.913391 seconds + 0: 268435456 2.633732 seconds + 0: 536870912 5.375804 seconds + 0: 1073741824 10.708706 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..a9b69c1 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10 +0: Local data size: 1024 +0: Global data size: 1024 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 1073741824 +0: Number of iterations: 10 +0: 16777216 0.049728 seconds +0: 33554432 0.099497 seconds 
+0: 67108864 0.202129 seconds +0: 134217728 0.500335 seconds +0: 268435456 1.560791 seconds +0: 536870912 3.265382 seconds +0: 1073741824 6.500534 seconds diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..3438061 --- /dev/null +++ b/mpi/all-reduce/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..33962b7 --- /dev/null +++ b/mpi/all-reduce/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export 
MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..fcad983 --- /dev/null +++ b/mpi/all-reduce/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt" + +echo 
$run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..cd5b8fa --- /dev/null +++ b/mpi/all-reduce/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..ddf1050 --- /dev/null +++ b/mpi/all-reduce/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 
+export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..76b174e --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 16 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.145773 seconds +67108864 0.327744 seconds +134217728 0.680940 seconds +268435456 2.172019 seconds +536870912 4.377939 seconds +1073741824 8.740797 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..c7d90db --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 32 +Message size range: 8388608 - 1073741824 +Number of iterations: 10 +8388608 0.050947 seconds +16777216 0.093279 seconds +33554432 0.183651 seconds +67108864 0.368861 seconds +134217728 0.804120 seconds +268435456 2.351269 seconds +536870912 4.727807 seconds +1073741824 9.445482 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..43c1c73 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 
+Global data size: 1024 +Number of GPUs: 8 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.056679 seconds +33554432 0.108849 seconds +67108864 0.216523 seconds +134217728 0.510124 seconds +268435456 1.547371 seconds +536870912 3.104556 seconds +1073741824 6.214916 seconds diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh new file mode 100644 index 0000000..b6505f8 --- /dev/null +++ b/mpi/reduce-scatter/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh new file mode 100644 index 0000000..eb6b2ba --- /dev/null +++ b/mpi/reduce-scatter/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) 
+SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh new file mode 100644 index 0000000..4ed3437 --- /dev/null +++ b/mpi/reduce-scatter/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a5a9957 --- /dev/null +++ b/mpi/reduce-scatter/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE 
$MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh new file mode 100644 index 0000000..9d4191c --- /dev/null +++ b/mpi/reduce-scatter/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..af5e98a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.046207 seconds + 0: 67108864 5.031027 seconds + 0: 134217728 5.063647 seconds + 0: 268435456 5.054240 seconds + 0: 536870912 5.047598 seconds + 0: 1073741824 5.051536 seconds + 0: 2147483648 5.057082 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt 
new file mode 100644 index 0000000..fa9c67a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.091016 seconds + 0: 67108864 5.092117 seconds + 0: 134217728 5.082377 seconds + 0: 268435456 5.103443 seconds + 0: 536870912 5.102289 seconds + 0: 1073741824 5.116191 seconds + 0: 2147483648 5.115768 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..23a0ace --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 2147483648 + 0: Number of iterations: 10 + 0: 8388608 5.006776 seconds + 0: 16777216 4.981770 seconds + 0: 33554432 5.014587 seconds + 0: 67108864 4.994224 seconds + 0: 134217728 4.977063 seconds + 0: 268435456 4.980235 seconds + 0: 536870912 5.007770 seconds + 0: 1073741824 5.013561 seconds + 0: 2147483648 5.015718 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..560c383 --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,17 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of 
GPUs: 64 + 0: Message size range: 16777216 - 2147483648 + 0: Number of iterations: 10 + 0: 16777216 5.006610 seconds + 0: 33554432 4.998351 seconds + 0: 67108864 5.003749 seconds + 0: 134217728 5.066133 seconds + 0: 268435456 4.980950 seconds + 0: 536870912 4.982830 seconds + 0: 1073741824 5.023178 seconds + 0: 2147483648 4.988750 seconds + 0: + 0: MPICH Slingshot Network Summary: 4 network timeouts + 0: diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..493d5ee --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10 +0: Local data size: 2048 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 2147483648 +0: Number of iterations: 10 +0: 16777216 5.130130 seconds +0: 33554432 5.120491 seconds +0: 67108864 5.115654 seconds +0: 134217728 5.128319 seconds +0: 268435456 5.111989 seconds +0: 536870912 5.115996 seconds +0: 1073741824 5.127237 seconds +0: 2147483648 5.116940 seconds diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..469aeaf --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export 
FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..e66b9f4 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh 
b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..07d6020 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 30:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..e51945a --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export 
NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..1b51537 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 30:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git 
a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..d696072 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 3.352414 seconds +67108864 3.323000 seconds +134217728 3.331817 seconds +268435456 3.327162 seconds +536870912 3.345694 seconds +1073741824 3.326455 seconds +2147483648 3.321790 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..b71477d --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 3.368300 seconds +67108864 3.361940 seconds +134217728 3.367816 seconds +268435456 3.360722 seconds +536870912 3.363088 seconds +1073741824 3.392373 seconds +2147483648 3.375325 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..38e09b1 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 3.368554 seconds +16777216 3.367485 seconds +33554432 3.376475 seconds +67108864 3.381592 seconds +134217728 3.384111 seconds +268435456 3.375780 seconds +536870912 3.371542 seconds +1073741824 3.379895 seconds +2147483648 3.381470 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..d982100 --- /dev/null +++ 
b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 2.220629 seconds +33554432 2.201147 seconds +67108864 2.196879 seconds +134217728 2.199449 seconds +268435456 2.194973 seconds +536870912 2.196809 seconds +1073741824 2.196212 seconds +2147483648 2.201029 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..d2bdd9a --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 3.558431 seconds +33554432 3.553477 seconds +67108864 3.562137 seconds +134217728 3.556267 seconds +268435456 3.551567 seconds +536870912 3.599067 seconds +1073741824 3.608635 seconds +2147483648 3.624090 seconds diff --git a/nccl/Makefile b/nccl/Makefile new file mode 100644 index 0000000..d4423b4 --- /dev/null +++ b/nccl/Makefile @@ -0,0 +1,25 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh new file mode 100644 index 0000000..82998f7 --- /dev/null +++ b/nccl/all-gather/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" + +echo 
$run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh new file mode 100644 index 0000000..47b5f7c --- /dev/null +++ b/nccl/all-gather/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh new file mode 100644 index 0000000..5459a34 --- /dev/null +++ b/nccl/all-gather/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export 
FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh new file mode 100644 index 0000000..2ad7e3a --- /dev/null +++ b/nccl/all-gather/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh new file mode 100644 index 0000000..55e05f8 --- /dev/null +++ b/nccl/all-gather/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH 
-q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt new file mode 100644 index 0000000..22b1d19 --- /dev/null +++ b/nccl/all-gather/benchmarks/16_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 4096 +Number of GPUs: 16 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000546 seconds +4194304 0.000963 seconds +8388608 0.001810 seconds +16777216 0.003587 seconds +33554432 0.006843 seconds +67108864 0.013602 seconds +134217728 0.026932 seconds +268435456 0.052715 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt new file mode 100644 index 0000000..da3b81b --- /dev/null +++ b/nccl/all-gather/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000531 seconds +524288 0.000602 seconds +1048576 0.000700 seconds +2097152 0.001056 seconds 
+4194304 0.001907 seconds +8388608 0.003960 seconds +16777216 0.006958 seconds +33554432 0.014047 seconds +67108864 0.027585 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt new file mode 100644 index 0000000..f05957a --- /dev/null +++ b/nccl/all-gather/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001041 seconds +524288 0.001212 seconds +1048576 0.001357 seconds +2097152 0.002122 seconds +4194304 0.003750 seconds +8388608 0.007686 seconds +16777216 0.014414 seconds +33554432 0.028307 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt new file mode 100644 index 0000000..9d9c99f --- /dev/null +++ b/nccl/all-gather/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000298 seconds +4194304 0.000477 seconds +8388608 0.000903 seconds +16777216 0.001661 seconds +33554432 0.003230 seconds +67108864 0.006674 seconds +134217728 0.012419 seconds +268435456 0.024550 seconds diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh new file mode 100644 index 0000000..591cdf3 --- /dev/null +++ b/nccl/all-reduce/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export 
FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh new file mode 100644 index 0000000..9232407 --- /dev/null +++ b/nccl/all-reduce/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh new file mode 100644 index 0000000..7130fa8 --- /dev/null +++ b/nccl/all-reduce/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 
10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh new file mode 100644 index 0000000..057637f --- /dev/null +++ b/nccl/all-reduce/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS 
$MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh new file mode 100644 index 0000000..be7f5f1 --- /dev/null +++ b/nccl/all-reduce/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt new file mode 100644 index 0000000..a866d54 --- /dev/null +++ b/nccl/all-reduce/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.001007 seconds +67108864 0.001788 seconds +134217728 0.003634 seconds +268435456 0.006935 seconds +536870912 0.013610 seconds +1073741824 0.027019 seconds +2147483648 
0.052864 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt new file mode 100644 index 0000000..a20b1cd --- /dev/null +++ b/nccl/all-reduce/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.001052 seconds +16777216 0.001220 seconds +33554432 0.001356 seconds +67108864 0.002028 seconds +134217728 0.003714 seconds +268435456 0.007242 seconds +536870912 0.013809 seconds +1073741824 0.027274 seconds +2147483648 0.054261 seconds diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh new file mode 100644 index 0000000..8590821 --- /dev/null +++ b/nccl/reduce-scatter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh 
new file mode 100644 index 0000000..7a20fa6 --- /dev/null +++ b/nccl/reduce-scatter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh new file mode 100644 index 0000000..3d297ff --- /dev/null +++ b/nccl/reduce-scatter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export 
FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh new file mode 100644 index 0000000..6bbf97a --- /dev/null +++ b/nccl/reduce-scatter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh new file mode 100644 index 0000000..21c0dc4 --- /dev/null +++ 
b/nccl/reduce-scatter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..7c1c8f9 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.028300 seconds +67108864 0.028351 seconds +134217728 0.028351 seconds +268435456 0.028502 seconds +536870912 0.028579 seconds +1073741824 0.028650 seconds +2147483648 0.028506 seconds diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..14acf87 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 
+Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.033170 seconds +67108864 0.033280 seconds +134217728 0.033220 seconds +268435456 0.033291 seconds +536870912 0.033217 seconds +1073741824 0.033158 seconds +2147483648 0.033275 seconds diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..7eecc67 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.027121 seconds +16777216 0.027661 seconds +33554432 0.027766 seconds +67108864 0.027992 seconds +134217728 0.027914 seconds +268435456 0.027912 seconds +536870912 0.027777 seconds +1073741824 0.027861 seconds +2147483648 0.027551 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..8f8ddd0 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.028306 seconds +33554432 0.028511 seconds +67108864 0.028175 seconds +134217728 0.027998 seconds +268435456 0.027883 seconds +536870912 0.027802 seconds +1073741824 0.027954 seconds +2147483648 0.028085 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..26c22b6 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.024231 seconds +33554432 0.024389 seconds +67108864 0.024167 seconds +134217728 0.024047 seconds +268435456 0.024293 seconds +536870912 0.024031 seconds +1073741824 0.024048 seconds 
+2147483648 0.024241 seconds diff --git a/rccl/Makefile b/rccl/Makefile new file mode 100644 index 0000000..aa0a7b9 --- /dev/null +++ b/rccl/Makefile @@ -0,0 +1,25 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# frontier flags +INC = -I${ROCM_PATH}/include +CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu new file mode 100644 index 0000000..99fc950 --- /dev/null +++ b/reduce_scatter.cu @@ -0,0 +1,269 @@ +/* \file reduce_scatter.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; 
+ } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + 
#endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // init recvcounts to send an equal portion of data from the reduce operation + int num_elements = local_data_size / sizeof(bfloat16); + int portion = num_elements / num_pes; + int *recvcounts = (int*) malloc(sizeof(int) * num_pes); + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + 
NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #ifdef defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} From 559f4bb99318595ac37e53bac77871d588eefcf3 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Thu, 11 Jul 2024 17:44:21 -0700 Subject: [PATCH 43/52] fix reduce_scatter bug --- reduce_scatter.cu | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 99fc950..b2dc99e 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -183,11 +183,8 @@ int main(int argc, char *argv[]) { #endif // init recvcounts to send an equal portion of data from the reduce operation - int num_elements = local_data_size / 
sizeof(bfloat16); - int portion = num_elements / num_pes; int *recvcounts = (int*) malloc(sizeof(int) * num_pes); - for (int i = 0; i < num_pes; i++) - recvcounts[i] = portion; + int portion; // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather double total_time, start_time; @@ -204,6 +201,11 @@ int main(int argc, char *argv[]) { for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); + + portion = msg_count / num_pes; + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI From d70e475861f02a871b9f56a758f845a53b1543e8 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Thu, 11 Jul 2024 18:10:23 -0700 Subject: [PATCH 44/52] push latest benchmarks --- mpi/reduce-scatter/perlmutter/128_gpu_run.sh | 4 ++-- mpi/reduce-scatter/perlmutter/16_gpu_run.sh | 4 ++-- mpi/reduce-scatter/perlmutter/32_gpu_run.sh | 4 ++-- mpi/reduce-scatter/perlmutter/64_gpu_run.sh | 4 ++-- mpi/reduce-scatter/perlmutter/8_gpu_run.sh | 6 +++--- .../perlmutter/benchmarks/128_gpu.txt | 12 ------------ .../perlmutter/benchmarks/16_gpu.txt | 12 ------------ .../perlmutter/benchmarks/32_gpu.txt | 14 -------------- .../perlmutter/benchmarks/64_gpu.txt | 13 ------------- mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt | 13 ------------- nccl/all-reduce/benchmarks/64_gpu.txt | 13 +++++++++++++ nccl/all-reduce/benchmarks/8_gpu.txt | 13 +++++++++++++ nccl/reduce-scatter/128_gpu_run.sh | 4 ++-- nccl/reduce-scatter/16_gpu_run.sh | 4 ++-- nccl/reduce-scatter/32_gpu_run.sh | 4 ++-- nccl/reduce-scatter/64_gpu_run.sh | 4 ++-- nccl/reduce-scatter/8_gpu_run.sh | 4 ++-- nccl/reduce-scatter/benchmarks/128_gpu.txt | 12 ------------ nccl/reduce-scatter/benchmarks/16_gpu.txt | 12 ------------ nccl/reduce-scatter/benchmarks/32_gpu.txt | 14 -------------- nccl/reduce-scatter/benchmarks/64_gpu.txt | 13 ------------- nccl/reduce-scatter/benchmarks/8_gpu.txt | 
13 ------------- 22 files changed, 47 insertions(+), 149 deletions(-) delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt delete mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/8_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt delete mode 100644 nccl/reduce-scatter/benchmarks/8_gpu.txt diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh index 469aeaf..28c8479 100644 --- a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh +++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh index e66b9f4..c3b9e32 100644 --- 
a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh +++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh index 07d6020..1681d65 100644 --- a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh +++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh index e51945a..f932006 100644 --- 
a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh +++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh index 1b51537..977ba91 100644 --- a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh +++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh @@ -3,7 +3,7 @@ #SBATCH -A m4641_g #SBATCH -C gpu #SBATCH -q regular -#SBATCH -t 30:00 +#SBATCH -t 20:00 #SBATCH -N 2 #SBATCH --ntasks-per-node=4 #SBATCH -c 32 @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" echo $run_cmd eval $run_cmd diff --git 
a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt deleted file mode 100644 index d696072..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 3.352414 seconds -67108864 3.323000 seconds -134217728 3.331817 seconds -268435456 3.327162 seconds -536870912 3.345694 seconds -1073741824 3.326455 seconds -2147483648 3.321790 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt deleted file mode 100644 index b71477d..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 3.368300 seconds -67108864 3.361940 seconds -134217728 3.367816 seconds -268435456 3.360722 seconds -536870912 3.363088 seconds -1073741824 3.392373 seconds -2147483648 3.375325 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt deleted file mode 100644 index 38e09b1..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 8388608 - 2147483648 -Number of iterations: 10 -8388608 3.368554 seconds -16777216 3.367485 seconds -33554432 3.376475 seconds -67108864 3.381592 seconds -134217728 3.384111 seconds -268435456 3.375780 seconds -536870912 3.371542 seconds -1073741824 3.379895 seconds -2147483648 3.381470 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt deleted file mode 100644 index d982100..0000000 --- 
a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 2.220629 seconds -33554432 2.201147 seconds -67108864 2.196879 seconds -134217728 2.199449 seconds -268435456 2.194973 seconds -536870912 2.196809 seconds -1073741824 2.196212 seconds -2147483648 2.201029 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt deleted file mode 100644 index d2bdd9a..0000000 --- a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 3.558431 seconds -33554432 3.553477 seconds -67108864 3.562137 seconds -134217728 3.556267 seconds -268435456 3.551567 seconds -536870912 3.599067 seconds -1073741824 3.608635 seconds -2147483648 3.624090 seconds diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt new file mode 100644 index 0000000..07fbe6d --- /dev/null +++ b/nccl/all-reduce/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001536 seconds +33554432 0.001953 seconds +67108864 0.002903 seconds +134217728 0.004239 seconds +268435456 0.007382 seconds +536870912 0.014722 seconds +1073741824 0.028043 seconds +2147483648 0.055311 seconds diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt new file mode 100644 index 0000000..c9bda12 --- /dev/null +++ b/nccl/all-reduce/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.000507 seconds 
+33554432 0.000855 seconds +67108864 0.001697 seconds +134217728 0.003146 seconds +268435456 0.006394 seconds +536870912 0.012162 seconds +1073741824 0.024174 seconds +2147483648 0.047715 seconds diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh index 8590821..e37f70b 100644 --- a/nccl/reduce-scatter/128_gpu_run.sh +++ b/nccl/reduce-scatter/128_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh index 7a20fa6..0ea1f3b 100644 --- a/nccl/reduce-scatter/16_gpu_run.sh +++ b/nccl/reduce-scatter/16_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh index 3d297ff..0bccbb2 100644 --- a/nccl/reduce-scatter/32_gpu_run.sh +++ b/nccl/reduce-scatter/32_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh index 6bbf97a..79dd4cb 100644 --- a/nccl/reduce-scatter/64_gpu_run.sh +++ b/nccl/reduce-scatter/64_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/nccl/reduce-scatter/8_gpu_run.sh 
b/nccl/reduce-scatter/8_gpu_run.sh index 21c0dc4..6fba196 100644 --- a/nccl/reduce-scatter/8_gpu_run.sh +++ b/nccl/reduce-scatter/8_gpu_run.sh @@ -29,8 +29,8 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 2048)) -SCRIPT="/global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" -run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& /global/homes/a/adityat/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" echo $run_cmd eval $run_cmd diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt deleted file mode 100644 index 7c1c8f9..0000000 --- a/nccl/reduce-scatter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 0.028300 seconds -67108864 0.028351 seconds -134217728 0.028351 seconds -268435456 0.028502 seconds -536870912 0.028579 seconds -1073741824 0.028650 seconds -2147483648 0.028506 seconds diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt deleted file mode 100644 index 14acf87..0000000 --- a/nccl/reduce-scatter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 16 -Message size range: 33554432 - 2147483648 -Number of iterations: 10 -33554432 0.033170 seconds -67108864 0.033280 seconds -134217728 0.033220 seconds -268435456 0.033291 seconds -536870912 0.033217 seconds -1073741824 0.033158 seconds -2147483648 0.033275 seconds diff 
--git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt deleted file mode 100644 index 7eecc67..0000000 --- a/nccl/reduce-scatter/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 8388608 - 2147483648 -Number of iterations: 10 -8388608 0.027121 seconds -16777216 0.027661 seconds -33554432 0.027766 seconds -67108864 0.027992 seconds -134217728 0.027914 seconds -268435456 0.027912 seconds -536870912 0.027777 seconds -1073741824 0.027861 seconds -2147483648 0.027551 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt deleted file mode 100644 index 8f8ddd0..0000000 --- a/nccl/reduce-scatter/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.028306 seconds -33554432 0.028511 seconds -67108864 0.028175 seconds -134217728 0.027998 seconds -268435456 0.027883 seconds -536870912 0.027802 seconds -1073741824 0.027954 seconds -2147483648 0.028085 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt deleted file mode 100644 index 26c22b6..0000000 --- a/nccl/reduce-scatter/benchmarks/8_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 8 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.024231 seconds -33554432 0.024389 seconds -67108864 0.024167 seconds -134217728 0.024047 seconds -268435456 0.024293 seconds -536870912 0.024031 seconds -1073741824 0.024048 seconds -2147483648 0.024241 seconds From 4e7ac6fa4eeb774e236ca375df37565230b9a1a3 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 07:32:04 -0700 Subject: [PATCH 45/52] add benchmarks so far --- .../perlmutter/benchmarks/128_gpu.txt | 12 
------------ .../perlmutter/benchmarks/16_gpu.txt | 10 +++++----- .../perlmutter/benchmarks/32_gpu.txt | 18 +++++++++--------- .../perlmutter/benchmarks/64_gpu.txt | 16 ++++++++-------- mpi/all-gather/perlmutter/benchmarks/8_gpu.txt | 16 ++++++++-------- .../perlmutter/benchmarks/16_gpu.txt | 12 ++++++------ .../perlmutter/benchmarks/32_gpu.txt | 16 ++++++++-------- .../perlmutter/benchmarks/64_gpu.txt | 12 ++++++++++++ mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt | 14 +++++++------- .../perlmutter/benchmarks/128_gpu.txt | 12 ++++++++++++ .../perlmutter/benchmarks/16_gpu.txt | 12 ++++++++++++ .../perlmutter/benchmarks/32_gpu.txt | 14 ++++++++++++++ .../perlmutter/benchmarks/64_gpu.txt | 13 +++++++++++++ .../perlmutter/benchmarks/8_gpu.txt | 13 +++++++++++++ nccl/all-gather/benchmarks/16_gpu.txt | 16 ++++++++-------- nccl/all-gather/benchmarks/32_gpu.txt | 18 +++++++++--------- nccl/all-gather/benchmarks/64_gpu.txt | 16 ++++++++-------- nccl/all-gather/benchmarks/8_gpu.txt | 16 ++++++++-------- nccl/all-reduce/benchmarks/16_gpu.txt | 14 +++++++------- nccl/all-reduce/benchmarks/32_gpu.txt | 14 -------------- nccl/all-reduce/benchmarks/64_gpu.txt | 13 ------------- nccl/all-reduce/benchmarks/8_gpu.txt | 16 ++++++++-------- nccl/reduce-scatter/benchmarks/8_gpu.txt | 13 +++++++++++++ 23 files changed, 188 insertions(+), 138 deletions(-) delete mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt create mode 100644 mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt delete mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt delete mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt create mode 100644 
nccl/reduce-scatter/benchmarks/8_gpu.txt diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt deleted file mode 100644 index 3c16468..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt +++ /dev/null @@ -1,12 +0,0 @@ -Local data size: 16 -Global data size: 2048 -Number of GPUs: 128 -Message size range: 262144 - 16777216 -Number of iterations: 10 -262144 0.003218 seconds -524288 0.005240 seconds -1048576 0.008649 seconds -2097152 0.015703 seconds -4194304 0.030562 seconds -8388608 0.060407 seconds -16777216 0.190813 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt index 9dc96cf..ca685cf 100644 --- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -10,12 +10,12 @@ slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/a slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory +srun: error: nid003924: tasks 12-15: Exited with exit code 2 +srun: Terminating StepId=27986453.0 slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -srun: error: nid002072: tasks 8-11: Exited 
with exit code 2 -srun: Terminating StepId=27970493.0 -srun: error: nid002073: tasks 12-15: Exited with exit code 2 -srun: error: nid001572: tasks 4-7: Exited with exit code 2 -srun: error: nid001569: tasks 0-3: Exited with exit code 2 +srun: error: nid003732: tasks 4-7: Exited with exit code 2 +srun: error: nid003628: tasks 0-3: Exited with exit code 2 +srun: error: nid003920: tasks 8-11: Exited with exit code 2 diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt index 754e581..fca9dfb 100644 --- a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt +++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt @@ -3,12 +3,12 @@ Global data size: 2048 Number of GPUs: 32 Message size range: 262144 - 67108864 Number of iterations: 10 -262144 0.000744 seconds -524288 0.001397 seconds -1048576 0.002723 seconds -2097152 0.003728 seconds -4194304 0.007619 seconds -8388608 0.014516 seconds -16777216 0.030634 seconds -33554432 0.063410 seconds -67108864 0.172556 seconds +262144 0.000814 seconds +524288 0.001392 seconds +1048576 0.002735 seconds +2097152 0.003736 seconds +4194304 0.007699 seconds +8388608 0.014426 seconds +16777216 0.030468 seconds +33554432 0.063086 seconds +67108864 0.172433 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt index cd13b86..fd082e7 100644 --- a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt +++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt @@ -3,11 +3,11 @@ Global data size: 2048 Number of GPUs: 64 Message size range: 262144 - 33554432 Number of iterations: 10 -262144 0.001523 seconds -524288 0.003143 seconds -1048576 0.004237 seconds -2097152 0.008015 seconds -4194304 0.015194 seconds -8388608 0.029697 seconds -16777216 0.063139 seconds -33554432 0.184281 seconds +262144 0.001616 seconds +524288 0.003051 seconds +1048576 0.004224 seconds +2097152 0.008058 seconds +4194304 0.015085 seconds +8388608 0.029593 seconds 
+16777216 0.063129 seconds +33554432 0.185107 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt index e010f99..d027526 100644 --- a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt +++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt @@ -3,11 +3,11 @@ Global data size: 2048 Number of GPUs: 8 Message size range: 2097152 - 268435456 Number of iterations: 10 -2097152 0.000888 seconds -4194304 0.001690 seconds -8388608 0.003195 seconds -16777216 0.006815 seconds -33554432 0.013828 seconds -67108864 0.028031 seconds -134217728 0.055406 seconds -268435456 0.104231 seconds +2097152 0.000804 seconds +4194304 0.001514 seconds +8388608 0.003268 seconds +16777216 0.006800 seconds +33554432 0.013764 seconds +67108864 0.027832 seconds +134217728 0.055076 seconds +268435456 0.103476 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt index 76b174e..7536923 100644 --- a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt +++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt @@ -3,9 +3,9 @@ Global data size: 1024 Number of GPUs: 16 Message size range: 33554432 - 1073741824 Number of iterations: 10 -33554432 0.145773 seconds -67108864 0.327744 seconds -134217728 0.680940 seconds -268435456 2.172019 seconds -536870912 4.377939 seconds -1073741824 8.740797 seconds +33554432 0.142862 seconds +67108864 0.282599 seconds +134217728 0.635635 seconds +268435456 1.893851 seconds +536870912 3.800098 seconds +1073741824 7.591759 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt index c7d90db..f210edf 100644 --- a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt +++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt @@ -3,11 +3,11 @@ Global data size: 1024 Number of GPUs: 32 Message size range: 8388608 - 1073741824 Number of iterations: 10 -8388608 0.050947 seconds -16777216 0.093279 seconds -33554432 
0.183651 seconds -67108864 0.368861 seconds -134217728 0.804120 seconds -268435456 2.351269 seconds -536870912 4.727807 seconds -1073741824 9.445482 seconds +8388608 0.050115 seconds +16777216 0.093747 seconds +33554432 0.182627 seconds +67108864 0.363477 seconds +134217728 0.777837 seconds +268435456 2.348574 seconds +536870912 4.726795 seconds +1073741824 9.478696 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..0052be4 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 64 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.120696 seconds +33554432 0.238777 seconds +67108864 0.470335 seconds +134217728 0.963299 seconds +268435456 2.857795 seconds +536870912 5.742566 seconds +1073741824 11.495248 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt index 43c1c73..def3166 100644 --- a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt +++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt @@ -3,10 +3,10 @@ Global data size: 1024 Number of GPUs: 8 Message size range: 16777216 - 1073741824 Number of iterations: 10 -16777216 0.056679 seconds -33554432 0.108849 seconds -67108864 0.216523 seconds -134217728 0.510124 seconds -268435456 1.547371 seconds -536870912 3.104556 seconds -1073741824 6.214916 seconds +16777216 0.056844 seconds +33554432 0.108090 seconds +67108864 0.215626 seconds +134217728 0.502310 seconds +268435456 1.519484 seconds +536870912 3.075941 seconds +1073741824 6.121168 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..7306758 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data 
size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.410163 seconds +67108864 0.429161 seconds +134217728 0.544002 seconds +268435456 0.679339 seconds +536870912 0.981913 seconds +1073741824 1.583797 seconds +2147483648 3.678590 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..190422f --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.056117 seconds +67108864 0.092396 seconds +134217728 0.169070 seconds +268435456 0.331578 seconds +536870912 0.641127 seconds +1073741824 1.270086 seconds +2147483648 3.735213 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..7b9f084 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.053765 seconds +16777216 0.064537 seconds +33554432 0.084740 seconds +67108864 0.133787 seconds +134217728 0.220573 seconds +268435456 0.377243 seconds +536870912 0.683938 seconds +1073741824 1.321649 seconds +2147483648 3.716915 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..675dc8f --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.157345 seconds +33554432 0.205494 seconds +67108864 0.216133 seconds +134217728 
0.316748 seconds +268435456 0.476547 seconds +536870912 0.776507 seconds +1073741824 1.387122 seconds +2147483648 3.688627 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..c7ca325 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.024237 seconds +33554432 0.043589 seconds +67108864 0.083173 seconds +134217728 0.153300 seconds +268435456 0.300631 seconds +536870912 0.598284 seconds +1073741824 1.190578 seconds +2147483648 3.832743 seconds diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt index 22b1d19..1afafc0 100644 --- a/nccl/all-gather/benchmarks/16_gpu.txt +++ b/nccl/all-gather/benchmarks/16_gpu.txt @@ -3,11 +3,11 @@ Global data size: 4096 Number of GPUs: 16 Message size range: 2097152 - 268435456 Number of iterations: 10 -2097152 0.000546 seconds -4194304 0.000963 seconds -8388608 0.001810 seconds -16777216 0.003587 seconds -33554432 0.006843 seconds -67108864 0.013602 seconds -134217728 0.026932 seconds -268435456 0.052715 seconds +2097152 0.000643 seconds +4194304 0.000944 seconds +8388608 0.001838 seconds +16777216 0.003452 seconds +33554432 0.007084 seconds +67108864 0.013794 seconds +134217728 0.026821 seconds +268435456 0.052760 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt index da3b81b..03e6ee9 100644 --- a/nccl/all-gather/benchmarks/32_gpu.txt +++ b/nccl/all-gather/benchmarks/32_gpu.txt @@ -3,12 +3,12 @@ Global data size: 2048 Number of GPUs: 32 Message size range: 262144 - 67108864 Number of iterations: 10 -262144 0.000531 seconds -524288 0.000602 seconds -1048576 0.000700 seconds -2097152 0.001056 seconds -4194304 0.001907 seconds -8388608 0.003960 seconds -16777216 
0.006958 seconds -33554432 0.014047 seconds -67108864 0.027585 seconds +262144 0.000528 seconds +524288 0.000604 seconds +1048576 0.000701 seconds +2097152 0.001044 seconds +4194304 0.002055 seconds +8388608 0.004240 seconds +16777216 0.006949 seconds +33554432 0.014221 seconds +67108864 0.027622 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt index f05957a..c0872ab 100644 --- a/nccl/all-gather/benchmarks/64_gpu.txt +++ b/nccl/all-gather/benchmarks/64_gpu.txt @@ -3,11 +3,11 @@ Global data size: 2048 Number of GPUs: 64 Message size range: 262144 - 33554432 Number of iterations: 10 -262144 0.001041 seconds -524288 0.001212 seconds -1048576 0.001357 seconds -2097152 0.002122 seconds -4194304 0.003750 seconds -8388608 0.007686 seconds -16777216 0.014414 seconds -33554432 0.028307 seconds +262144 0.001230 seconds +524288 0.001226 seconds +1048576 0.001381 seconds +2097152 0.002098 seconds +4194304 0.003764 seconds +8388608 0.007649 seconds +16777216 0.014257 seconds +33554432 0.027941 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt index 9d9c99f..8fc4917 100644 --- a/nccl/all-gather/benchmarks/8_gpu.txt +++ b/nccl/all-gather/benchmarks/8_gpu.txt @@ -3,11 +3,11 @@ Global data size: 2048 Number of GPUs: 8 Message size range: 2097152 - 268435456 Number of iterations: 10 -2097152 0.000298 seconds -4194304 0.000477 seconds -8388608 0.000903 seconds -16777216 0.001661 seconds -33554432 0.003230 seconds -67108864 0.006674 seconds -134217728 0.012419 seconds -268435456 0.024550 seconds +2097152 0.000325 seconds +4194304 0.000482 seconds +8388608 0.000881 seconds +16777216 0.001679 seconds +33554432 0.003206 seconds +67108864 0.006338 seconds +134217728 0.012452 seconds +268435456 0.024147 seconds diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt index a866d54..26fc256 100644 --- a/nccl/all-reduce/benchmarks/16_gpu.txt +++ 
b/nccl/all-reduce/benchmarks/16_gpu.txt @@ -3,10 +3,10 @@ Global data size: 2048 Number of GPUs: 16 Message size range: 33554432 - 2147483648 Number of iterations: 10 -33554432 0.001007 seconds -67108864 0.001788 seconds -134217728 0.003634 seconds -268435456 0.006935 seconds -536870912 0.013610 seconds -1073741824 0.027019 seconds -2147483648 0.052864 seconds +33554432 0.000969 seconds +67108864 0.001819 seconds +134217728 0.003596 seconds +268435456 0.006813 seconds +536870912 0.013459 seconds +1073741824 0.026683 seconds +2147483648 0.052290 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt deleted file mode 100644 index a20b1cd..0000000 --- a/nccl/all-reduce/benchmarks/32_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 32 -Message size range: 8388608 - 2147483648 -Number of iterations: 10 -8388608 0.001052 seconds -16777216 0.001220 seconds -33554432 0.001356 seconds -67108864 0.002028 seconds -134217728 0.003714 seconds -268435456 0.007242 seconds -536870912 0.013809 seconds -1073741824 0.027274 seconds -2147483648 0.054261 seconds diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt deleted file mode 100644 index 07fbe6d..0000000 --- a/nccl/all-reduce/benchmarks/64_gpu.txt +++ /dev/null @@ -1,13 +0,0 @@ -Local data size: 2048 -Global data size: 2048 -Number of GPUs: 64 -Message size range: 16777216 - 2147483648 -Number of iterations: 10 -16777216 0.001536 seconds -33554432 0.001953 seconds -67108864 0.002903 seconds -134217728 0.004239 seconds -268435456 0.007382 seconds -536870912 0.014722 seconds -1073741824 0.028043 seconds -2147483648 0.055311 seconds diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt index c9bda12..e5a5769 100644 --- a/nccl/all-reduce/benchmarks/8_gpu.txt +++ b/nccl/all-reduce/benchmarks/8_gpu.txt @@ -3,11 +3,11 @@ Global data size: 2048 Number of GPUs: 8 
Message size range: 16777216 - 2147483648 Number of iterations: 10 -16777216 0.000507 seconds -33554432 0.000855 seconds -67108864 0.001697 seconds -134217728 0.003146 seconds -268435456 0.006394 seconds -536870912 0.012162 seconds -1073741824 0.024174 seconds -2147483648 0.047715 seconds +16777216 0.000635 seconds +33554432 0.000887 seconds +67108864 0.001639 seconds +134217728 0.003232 seconds +268435456 0.006303 seconds +536870912 0.011998 seconds +1073741824 0.024143 seconds +2147483648 0.047652 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..5cee721 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.000363 seconds +33554432 0.000450 seconds +67108864 0.000876 seconds +134217728 0.001650 seconds +268435456 0.003169 seconds +536870912 0.006491 seconds +1073741824 0.012103 seconds +2147483648 0.024166 seconds From e878a751e8d4b5b5fdaca83187d832fdde7686ad Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 07:36:45 -0700 Subject: [PATCH 46/52] add mpi all-gather 128gpu benchmarks --- mpi/all-gather/perlmutter/benchmarks/128_gpu.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 mpi/all-gather/perlmutter/benchmarks/128_gpu.txt diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..295c6c0 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 16 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 262144 - 16777216 +Number of iterations: 10 +262144 0.003072 seconds +524288 0.005233 seconds +1048576 0.008462 seconds +2097152 0.015449 seconds +4194304 0.030325 seconds +8388608 0.060131 seconds +16777216 0.190401 
seconds From 75c8208ed9dd229c3c43329ded95d74f25840896 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 07:47:39 -0700 Subject: [PATCH 47/52] update benchmarks --- mpi/all-gather/perlmutter/16_gpu_run.sh | 2 +- .../perlmutter/benchmarks/16_gpu.txt | 21 ------------------- .../perlmutter/benchmarks/128_gpu.txt | 0 nccl/all-gather/benchmarks/128_gpu.txt | 13 ++++++++++++ nccl/all-reduce/benchmarks/32_gpu.txt | 14 +++++++++++++ 5 files changed, 28 insertions(+), 22 deletions(-) delete mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt create mode 100644 mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt create mode 100644 nccl/all-gather/benchmarks/128_gpu.txt create mode 100644 nccl/all-reduce/benchmarks/32_gpu.txt diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh index 813b192..e68834a 100644 --- a/mpi/all-gather/perlmutter/16_gpu_run.sh +++ b/mpi/all-gather/perlmutter/16_gpu_run.sh @@ -29,7 +29,7 @@ export FI_CXI_OFLOW_BUF_COUNT=1 MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 MAX_MSG_SIZE=$((1048576 * 128)) -SCRIPT="$SCRATCH/adityat/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt" echo $run_cmd diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt deleted file mode 100644 index ca685cf..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,21 +0,0 @@ -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: 
error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -srun: error: nid003924: tasks 12-15: Exited with exit code 2 -srun: Terminating StepId=27986453.0 -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -slurmstepd: error: execve(): /pscratch/sd/a/adityat/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: No such file or directory -srun: error: nid003732: tasks 4-7: Exited with exit code 2 -srun: error: nid003628: tasks 
0-3: Exited with exit code 2 -srun: error: nid003920: tasks 8-11: Exited with exit code 2 diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..e69de29 diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt new file mode 100644 index 0000000..3ac04bb --- /dev/null +++ b/nccl/all-gather/benchmarks/128_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 4096 +Number of GPUs: 128 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.002077 seconds +524288 0.002368 seconds +1048576 0.002832 seconds +2097152 0.004504 seconds +4194304 0.007551 seconds +8388608 0.014982 seconds +16777216 0.028604 seconds +33554432 0.056227 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt new file mode 100644 index 0000000..90fc0f0 --- /dev/null +++ b/nccl/all-reduce/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.001510 seconds +16777216 0.001222 seconds +33554432 0.001317 seconds +67108864 0.002024 seconds +134217728 0.003762 seconds +268435456 0.007554 seconds +536870912 0.014173 seconds +1073741824 0.027756 seconds +2147483648 0.054544 seconds From 39d17ffe4e5fa350ca23bed8caa46f8f1dbe7a26 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 07:54:38 -0700 Subject: [PATCH 48/52] update results --- .../perlmutter/benchmarks/16_gpu.txt | 21 +++++++++++++++++++ nccl/reduce-scatter/benchmarks/16_gpu.txt | 12 +++++++++++ nccl/reduce-scatter/benchmarks/32_gpu.txt | 14 +++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/16_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/32_gpu.txt diff --git 
a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..737af38 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,21 @@ +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory 
+/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +srun: error: nid008252: tasks 0-3: Exited with exit code 127 +srun: Terminating StepId=27999986.0 +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory +srun: error: nid008649: tasks 4-7: Exited with exit code 127 +srun: error: nid008652: tasks 8-11: Exited with exit code 127 +srun: error: nid008653: tasks 12-15: Exited with exit code 127 diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..0bae9e9 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.000552 seconds +67108864 0.000933 seconds +134217728 0.001772 seconds +268435456 0.003462 seconds +536870912 0.007059 seconds +1073741824 0.013749 seconds +2147483648 0.026539 seconds diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt 
b/nccl/reduce-scatter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..307b0ce --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.000586 seconds +16777216 0.000629 seconds +33554432 0.000712 seconds +67108864 0.001141 seconds +134217728 0.002012 seconds +268435456 0.003715 seconds +536870912 0.007022 seconds +1073741824 0.014078 seconds +2147483648 0.027699 seconds From 4c714037a0bf628fb3d3ad72a47ae39e5e3ddc0d Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 08:00:00 -0700 Subject: [PATCH 49/52] push results --- .../perlmutter/benchmarks/16_gpu.txt | 21 ------------------- .../perlmutter/benchmarks/128_gpu.txt | 11 ++++++++++ 2 files changed, 11 insertions(+), 21 deletions(-) delete mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt deleted file mode 100644 index 737af38..0000000 --- a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt +++ /dev/null @@ -1,21 +0,0 @@ -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared 
libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -srun: error: nid008252: tasks 0-3: Exited with exit code 127 -srun: Terminating StepId=27999986.0 -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory 
-/pscratch/sd/a/adityat/gpu-benchmarks/mpi/all-gather/allgather.x: error while loading shared libraries: libnccl.so.2: cannot open shared object file: No such file or directory -srun: error: nid008649: tasks 4-7: Exited with exit code 127 -srun: error: nid008652: tasks 8-11: Exited with exit code 127 -srun: error: nid008653: tasks 12-15: Exited with exit code 127 diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt index e69de29..a4485f5 100644 --- a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt +++ b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 128 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.260096 seconds +67108864 0.535750 seconds +134217728 1.089220 seconds +268435456 3.236966 seconds +536870912 6.499632 seconds +1073741824 12.975189 seconds From 6e59c91ee1c0752048738791dc8cf2ee96a13da8 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 08:05:19 -0700 Subject: [PATCH 50/52] push results --- nccl/all-reduce/benchmarks/64_gpu.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 nccl/all-reduce/benchmarks/64_gpu.txt diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt b/nccl/all-reduce/benchmarks/64_gpu.txt new file mode 100644 index 0000000..ebd310e --- /dev/null +++ b/nccl/all-reduce/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001551 seconds +33554432 0.001949 seconds +67108864 0.002918 seconds +134217728 0.004132 seconds +268435456 0.007447 seconds +536870912 0.014747 seconds +1073741824 0.028172 seconds +2147483648 0.055372 seconds From cc1b03fae4135bd66f61d50851fe6c2cf8d27c34 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 10:01:56 -0700 Subject: [PATCH 51/52] update results --- 
mpi/all-gather/perlmutter/benchmarks/16_gpu.txt | 12 ++++++++++++ nccl/reduce-scatter/benchmarks/64_gpu.txt | 13 +++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 mpi/all-gather/perlmutter/benchmarks/16_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/64_gpu.txt diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..740a003 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 128 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 2097152 - 134217728 +Number of iterations: 10 +2097152 0.002476 seconds +4194304 0.003571 seconds +8388608 0.007188 seconds +16777216 0.014909 seconds +33554432 0.030427 seconds +67108864 0.061974 seconds +134217728 0.150229 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..45bd514 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001059 seconds +33554432 0.001147 seconds +67108864 0.001410 seconds +134217728 0.002090 seconds +268435456 0.004116 seconds +536870912 0.007125 seconds +1073741824 0.014305 seconds +2147483648 0.028156 seconds From 92d5eecf59655c2aead09cb176502105fadf5391 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Fri, 12 Jul 2024 14:05:09 -0700 Subject: [PATCH 52/52] push final results --- nccl/all-reduce/benchmarks/128_gpu.txt | 12 ++++++++++++ nccl/reduce-scatter/benchmarks/128_gpu.txt | 12 ++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 nccl/all-reduce/benchmarks/128_gpu.txt create mode 100644 nccl/reduce-scatter/benchmarks/128_gpu.txt diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt new file mode 100644 index 
0000000..30388e3 --- /dev/null +++ b/nccl/all-reduce/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002305 seconds +67108864 0.003309 seconds +134217728 0.005263 seconds +268435456 0.008851 seconds +536870912 0.017150 seconds +1073741824 0.037149 seconds +2147483648 0.075655 seconds diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..846d583 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002055 seconds +67108864 0.002314 seconds +134217728 0.003003 seconds +268435456 0.004164 seconds +536870912 0.007515 seconds +1073741824 0.014791 seconds +2147483648 0.027948 seconds