diff --git a/AMG2023/README.md b/AMG2023/README.md
index 476ad56..14c75c8 100644
--- a/AMG2023/README.md
+++ b/AMG2023/README.md
@@ -1,9 +1,9 @@
 # AMG2023 README
 For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf).
 
-## Perlmutter Compilation
+Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/)
 
-Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
+## Perlmutter Compilation
 
 ### Steps to Compile
@@ -50,5 +50,61 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
    cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 ..
    ```
 
-## Frontier Installation
+## Frontier Compilation
+
+### Steps to Compile
+
+1. Load modules
+   ```sh
+   module reset
+
+   module load cray-mpich/8.1.30
+   module load craype-accel-amd-gfx90a
+   module load rocm/6.1.3
+   export MPICH_GPU_SUPPORT_ENABLED=1
+
+   # load compatible cmake version
+   module load Core/24.07
+   module load cmake/3.27.9
+   ```
+2. Build hypre (v2.32.0)
+   - Clone hypre v2.32.0 and navigate to src:
+     ```sh
+     git clone -b v2.32.0 https://github.com/hypre-space/hypre.git
+     cd hypre/src
+     ```
+   - Configure hypre (in hypre/src)
+     ```sh
+     ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \
+       --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \
+       --with-MPI-include="${MPICH_DIR}/include" \
+       CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \
+       LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse"
+     ```
+   - Compile hypre (in hypre/src)
+     ```sh
+     # build with make
+     make
+     ```
+3. Build AMG2023
+   - Clone repo:
+     ```sh
+     git clone https://github.com/pssg-int/AMG2023
+     cd AMG2023
+     ```
+   - Add mpiP to LD_LIBRARY_PATH
+     ```sh
+     export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+     ```
+   - Configure with CMake
+     ```sh
+     mkdir build && cd build
+     cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \
+       -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \
+       -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand"
+     ```
+   - Compile AMG2023 (in AMG2023/build)
+     ```sh
+     make install
+     ```
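Before queueing the 16- and 64-node jobs added below, a quick interactive smoke test of the freshly built binary can catch build or link problems early. The sketch below is illustrative only and not part of this patch: the allocation flags, rank count, and the small `-iter` value are assumptions; the `-P` product must equal the number of MPI ranks, exactly as in the run scripts.

```sh
# illustrative smoke test (not part of this patch); assumes a 1-node interactive
# allocation, e.g. `salloc -A csc569 -N 1 -t 00:10:00`
module load cray-mpich/8.1.30 craype-accel-amd-gfx90a rocm/6.1.3
export MPICH_GPU_SUPPORT_ENABLED=1

# -P px py pz must multiply to the MPI rank count: 2*2*2 = 8 here,
# just as 4*4*8 = 128 in run_frontier_16.sh
srun -N 1 -n 8 --ntasks-per-node 8 --gpus-per-node 8 \
  ./build/amg -P 2 2 2 -n 64 64 64 -problem 1 -iter 10
```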
diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh
new file mode 100644
index 0000000..c51b52d
--- /dev/null
+++ b/AMG2023/run_frontier_16.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH -N 16
+#SBATCH -n 128
+#SBATCH -q normal
+#SBATCH -J amg
+#SBATCH --gpu-bind none
+#SBATCH -t 00:30:00
+#SBATCH -A csc569
+#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log
+#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log
+#SBATCH --exclusive
+# Run like: sbatch run_frontier_16.sh
+
+OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID
+OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log
+ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log
+
+# Run gpu benchmarks
+COMM_TYPE=mpi
+ROCM_VERSION=6.1.3
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+echo running allreduce benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+# echo running allgather benchmark
+# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+echo running gemm benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+
+APP_ROOT=/ccs/home/keshprad/AMG2023
+cd $APP_ROOT
+
+# reset modules
+echo resetting modules:
+module reset
+# load modules
+echo loading modules:
+module load cray-mpich/8.1.30
+module load craype-accel-amd-gfx90a
+module load rocm/6.1.3
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+export CRAY_ACCEL_TARGET=gfx90a
+export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/
+# mpiP
+export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+export MPIP="-o -f $OUTPUT_DIR"
+
+# log start date
+echo start AMG2023: $(date)
+# define command
+cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \
+  ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500"
+echo solving:
+echo $cmd
+$cmd
+# log end date
+echo end AMG2023: $(date)
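The only substantive difference between the 16-node script above and the 64-node script below is the problem decomposition: `#SBATCH -n` and the `-P` grid have to agree. A purely illustrative check of that invariant (the variable names are not taken from the scripts):

```sh
# illustrative consistency check, not part of this patch
NODES=16; RANKS_PER_NODE=8          # 16 nodes x 8 ranks = 128 = 4*4*8
PX=4; PY=4; PZ=8                    # the 64-node script uses 8*8*8 = 512
if [ $((PX * PY * PZ)) -ne $((NODES * RANKS_PER_NODE)) ]; then
  echo "process grid ${PX}x${PY}x${PZ} != $((NODES * RANKS_PER_NODE)) ranks" >&2
fi
```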
diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh
new file mode 100644
index 0000000..c7a7a3e
--- /dev/null
+++ b/AMG2023/run_frontier_64.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH -N 64
+#SBATCH -n 512
+#SBATCH -q normal
+#SBATCH -J amg
+#SBATCH --gpu-bind none
+#SBATCH -t 00:30:00
+#SBATCH -A csc569
+#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log
+#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log
+#SBATCH --exclusive
+# Run like: sbatch run_frontier_64.sh
+
+OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID
+OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log
+ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log
+
+# Run gpu benchmarks
+COMM_TYPE=mpi
+ROCM_VERSION=6.1.3
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+echo running allreduce benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+# echo running allgather benchmark
+# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+echo running gemm benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+
+APP_ROOT=/ccs/home/keshprad/AMG2023
+cd $APP_ROOT
+
+# reset modules
+echo resetting modules:
+module reset
+# load modules
+echo loading modules:
+module load cray-mpich/8.1.30
+module load craype-accel-amd-gfx90a
+module load rocm/6.1.3
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+export CRAY_ACCEL_TARGET=gfx90a
+export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/
+# mpiP
+export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+export MPIP="-o -f $OUTPUT_DIR"
+
+# log start date
+echo start AMG2023: $(date)
+# define command
+cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \
+  ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500"
+echo solving:
+echo $cmd
+$cmd
+# log end date
+echo end AMG2023: $(date)
diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh
new file mode 100644
index 0000000..09b0f66
--- /dev/null
+++ b/AMG2023/run_frontier_crontab.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <num_nodes>"
+  exit 1
+fi
+# `16` or `64`
+NUM_NODES=$1
+
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+
+# load lmod
+source /usr/share/lmod/lmod/init/bash
+# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH
+export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps
+export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles
+
+# run sbatch script
+script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh
+sbatch $script
\ No newline at end of file
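run_frontier_crontab.sh is meant to be driven by cron, but no crontab entry ships with the patch. A hypothetical entry might look like the following; the six-hour schedule is purely an assumption.

```sh
# hypothetical crontab entry (edit with `crontab -e` on a Frontier login node);
# the schedule below is an assumption, not part of this patch
0 */6 * * * /bin/bash /ccs/home/keshprad/perf-variability/AMG2023/run_frontier_crontab.sh 16
```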
diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md
new file mode 100644
index 0000000..c8f9c25
--- /dev/null
+++ b/gpu-benchmarks/README.md
@@ -0,0 +1,14 @@
+# gpu-benchmarks README
+Code Repository: [gpu-benchmarks](#TODO:)
+
+## Perlmutter Compilation
+
+### Steps to Compile
+
+TODO:
+
+## Frontier Compilation
+
+### Steps to Compile
+
+TODO:
\ No newline at end of file
diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh
new file mode 100644
index 0000000..7fc10b4
--- /dev/null
+++ b/gpu-benchmarks/allgather/run_frontier.sh
@@ -0,0 +1,63 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh
+
+#!/bin/bash
+if [ "$#" -ne 4 ]; then
+  echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$2
+# `16` or `64`
+NUM_NODES=$3
+# output directory
+OUTPUT_DIR=$4
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+  MPICH_VERSION=8.1.30
+else
+  MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+  module load cray-mpich/${MPICH_VERSION}
+  module load rocm/${ROCM_VERSION}
+  module list
+
+  GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+  MIN_MSG_SIZE=$((1 * 1024))
+  MAX_MSG_SIZE=$((1 * 1024 * 1024))
+  ITERATIONS=100
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start allgather: $(date)
+  # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun.
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --gpus-per-node 8 \
+    --gpus-per-task 1 \
+    --ntasks-per-node 8 \
+    --gpu-bind none \
+    --output $OUTPUT_FILE \
+    $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end allgather: $(date)
+} &>> $OUTPUT_FILE
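The allgather wrapper can also be exercised by hand from inside an existing allocation, passing the four arguments it parses above. A sketch, not part of this patch, with an assumed output directory:

```sh
# hand-run sketch (not part of this patch); assumes an existing allocation
OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/manual-allgather  # example path
mkdir -p "$OUTPUT_DIR"
bash /ccs/home/keshprad/perf-variability/gpu-benchmarks/allgather/run_frontier.sh \
  mpi 6.1.3 16 "$OUTPUT_DIR"   # <comm_type> <rocm_version> <num_nodes> <output_dir>
```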
diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh
new file mode 100644
index 0000000..855a486
--- /dev/null
+++ b/gpu-benchmarks/allreduce/run_frontier.sh
@@ -0,0 +1,58 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh
+
+#!/bin/bash
+if [ "$#" -ne 4 ]; then
+  echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$2
+# `16` or `64`
+NUM_NODES=$3
+# output directory
+OUTPUT_DIR=$4
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+  MPICH_VERSION=8.1.30
+else
+  MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+  module load cray-mpich/${MPICH_VERSION}
+  module load rocm/${ROCM_VERSION}
+  module list
+
+  GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+  MIN_MSG_SIZE=$((1 * 1024))
+  MAX_MSG_SIZE=$((1 * 1024 * 1024))
+  ITERATIONS=100
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start allreduce: $(date)
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --output $OUTPUT_FILE \
+    $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end allreduce: $(date)
+} &>> $OUTPUT_FILE
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh
new file mode 100644
index 0000000..c5348be
--- /dev/null
+++ b/gpu-benchmarks/gemm/run_frontier.sh
@@ -0,0 +1,56 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh
+
+#!/bin/bash
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 <rocm_version> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$1
+# `16` or `64`
+NUM_NODES=$2
+# output directory
+OUTPUT_DIR=$3
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+  MPICH_VERSION=8.1.30
+else
+  MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+  module load cray-mpich/${MPICH_VERSION}
+  module load rocm/${ROCM_VERSION}
+  module list
+
+  GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start gemm: $(date)
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --gpus-per-node 8 \
+    --gpus-per-task 1 \
+    --ntasks-per-node 8 \
+    --output $OUTPUT_FILE \
+    $EXEC"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end gemm: $(date)
+} &>> $OUTPUT_FILE
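The gemm wrapper follows the same pattern but takes only three arguments, since it has no communication-backend variant. A hand-run sketch (not part of this patch), reusing the example output directory from the previous sketch:

```sh
# hand-run sketch (not part of this patch)
bash /ccs/home/keshprad/perf-variability/gpu-benchmarks/gemm/run_frontier.sh \
  6.1.3 16 "$OUTPUT_DIR"       # <rocm_version> <num_nodes> <output_dir>
```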
diff --git a/nanoGPT/README.md b/nanoGPT/README.md
index 5c499fc..87e8189 100644
--- a/nanoGPT/README.md
+++ b/nanoGPT/README.md
@@ -1,33 +1,62 @@
-# nanoGPT Setup Instructions
+# nanoGPT README
+For more detailed installation parameters, please refer to the [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT).
-## Clone the Repository
+Repository: [nanoGPT](https://github.com/axonn-ai/nanoGPT)
-```sh
-git clone https://github.com/axonn-ai/nanoGPT.git
-```
+## Perlmutter Setup
-## Create Python Environment
+### Setup steps
-```sh
-./scripts/create_python_env_perlmutter.sh
-```
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
-> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_perlmutter.sh
+   ```
+   > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
-## Load PyTorch Module
+3. Load PyTorch Module
+   ```sh
+   module load pytorch/2.0.1
+   ```
-```sh
-module load pytorch/2.0.1
-```
+4. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
-## Activate the Environment
+5. Download Data
+   ```sh
+   python nanoGPT/data/openwebtext/prepare.py
+   ```
-```sh
-source path_to_nanogptENV/bin/activate
-```
+## Frontier Setup
-## Download Data
+### Setup steps
-```sh
-python nanoGPT/data/openwebtext/prepare.py
-```
\ No newline at end of file
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
+
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_frontier.sh
+   ```
+   > Note: You may need to modify the WKSPC path and torch version in `create_python_env_frontier.sh`.
+
+3. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
+
+4. Download Data
+   ```sh
+   python data/openwebtext/prepare.py
+   ```
\ No newline at end of file
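Once the Frontier environment above is in place, the training runs added in this patch are submitted with sbatch; the node and GPU counts in the comments below come from the job scripts themselves.

```sh
cd /ccs/home/keshprad/perf-variability/nanoGPT
sbatch run_frontier16.sh   # 16 nodes / 128 GPUs, 5B config
sbatch run_frontier64.sh   # 64 nodes / 512 GPUs, 20B config
```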
diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh
new file mode 100644
index 0000000..901561e
--- /dev/null
+++ b/nanoGPT/run_frontier16.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#SBATCH -N 16
+#SBATCH -n 128
+#SBATCH -q normal
+#SBATCH -J nanogpt
+#SBATCH --gpu-bind none
+#SBATCH -t 00:30:00
+#SBATCH -A csc569
+#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log
+#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log
+#SBATCH --exclusive
+# Run like: sbatch run_frontier16.sh
+
+echo "start run: $(date)"
+export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID
+OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log
+ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log
+
+export SCRATCH="/lustre/orion/csc569/scratch/keshprad"
+export WRKSPC="${SCRATCH}/nanoGPT"
+export HF_HOME="${SCRATCH}/.cache/hf"
+export HF_TRANSFORMERS_CACHE="${HF_HOME}"
+export HF_DATASETS_CACHE="${HF_HOME}/datasets"
+cd $WRKSPC
+
+# load modules
+ROCM_VERSION=6.1.3
+echo resetting modules:
+module reset
+echo loading modules:
+module load PrgEnv-gnu/8.5.0
+module load rocm/${ROCM_VERSION}
+module load craype-accel-amd-gfx90a
+module load cray-python/3.9.13.1
+module load cray-mpich/8.1.30
+module list
+# activate env
+source ${WRKSPC}/axonn_nanogpt/bin/activate
+
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 8 ))
+## master addr and port
+# setting variables for torch.distributed
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export WORLD_SIZE=$GPUS
+export OMP_NUM_THREADS=7
+
+## some RCCL env variables
+export FI_CXI_ATS=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn0
+export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+# AWS-OFI-RCCL
+export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH"
+# other
+export MPICH_GPU_SUPPORT_ENABLED=1
+export GPU_MAX_HW_QUEUES=1
+export OFI_NCCL_USE_IPV6_TCP=1
+
+SCRIPT="train_frontier.py config/train_gpt_neox_5B.py"
+
+# run with profiler
+export WITH_PROFILER=1
+OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log"
+# log start date
+echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE
+run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT"
+echo $run_cmd &>> $OUTPUT_FILE
+eval $run_cmd &>> $OUTPUT_FILE
+# log end date
+echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE
+
+# Run gpu benchmarks
+COMM_TYPE=rccl
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+echo running allreduce benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH
+# echo running allgather benchmark
+# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH
+echo running gemm benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH
+
+echo "end run: $(date)"
\ No newline at end of file
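The srun line above prefixes the Python command with scripts/get_rank.sh, which is not included in this patch. A wrapper of that kind usually just maps Slurm's per-process variables onto the ones torch.distributed expects; the sketch below is an assumption about its contents, not the actual script.

```sh
#!/bin/bash
# hypothetical get_rank.sh-style wrapper (the real script may differ)
export RANK=$SLURM_PROCID         # global rank
export LOCAL_RANK=$SLURM_LOCALID  # rank within the node (0-7 on Frontier)
# MASTER_ADDR, MASTER_PORT and WORLD_SIZE are already exported by the sbatch script
exec "$@"
```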
$(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE +# log end date +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py
new file mode 100644
index 0000000..cf7b91f
--- /dev/null
+++ b/nanoGPT/train_gpt_neox_20B_frontier.py
@@ -0,0 +1,46 @@
+# config for training a ~20B-parameter GPT-NeoX-style model with AxoNN on Frontier,
+# adapted from nanoGPT's example train_gpt2.py config;
+# passed to train_frontier.py as its config file (see the run_frontier*.sh scripts)
+
+wandb_log = False
+wandb_project = 'owt'
+wandb_run_name='gpt2-124M'
+
+# tokens per optimizer step = batch_size * block_size * gradient_accumulation_steps
+# (gradient_accumulation_steps is the total across GPUs: per_gpu x num_gpus)
+batch_size = 8
+block_size = 512
+gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus
+
+# model
+n_layer = 32
+n_head = 56
+n_embd = 7168
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+bias = False # do we use bias inside LayerNorm and Linear layers?
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+max_iters = 30 # total number of training iterations
+
+# axonn params
+G_intra_d=16
+G_intra_c=1
+G_intra_r=1
+compile=False # disable compile for axonn
+gradient_checkpointing=True
+
+# lr decay schedule (max_iters is also set above)
+max_iters = 30
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 1
+log_interval = 10
+
+# weight decay
+weight_decay = 1e-1
+
+# log every iteration
+log_interval=1
\ No newline at end of file
diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py
new file mode 100644
index 0000000..4ce7b55
--- /dev/null
+++ b/nanoGPT/train_gpt_neox_5B_frontier.py
@@ -0,0 +1,46 @@
+# config for training a ~5B-parameter GPT-NeoX-style model with AxoNN on Frontier,
+# adapted from nanoGPT's example train_gpt2.py config;
+# passed to train_frontier.py as its config file (see the run_frontier*.sh scripts)
+
+wandb_log = False
+wandb_project = 'owt'
+wandb_run_name='gpt2-124M'
+
+# tokens per optimizer step = batch_size * block_size * gradient_accumulation_steps
+# (gradient_accumulation_steps is the total across GPUs: per_gpu x num_gpus)
+batch_size = 16
+block_size = 512
+gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus
+
+# model
+n_layer = 24
+n_head = 32
+n_embd = 4096
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+bias = False # do we use bias inside LayerNorm and Linear layers?
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+max_iters = 30 # total number of training iterations
+
+# axonn params
+G_intra_d=16
+G_intra_c=1
+G_intra_r=1
+compile=False # disable compile for axonn
+gradient_checkpointing=True
+
+# lr decay schedule (max_iters is also set above)
+max_iters = 30
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 1
+log_interval = 10
+
+# weight decay
+weight_decay = 1e-1
+
+# log every iteration
+log_interval=1
\ No newline at end of file
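For reference, the effective tokens processed per optimizer step implied by the two configs (batch_size x block_size x gradient_accumulation_steps); the arithmetic below is illustrative and not part of the patch.

```sh
echo "5B  config: $((16 * 512 * 256)) tokens/step"   # = 2,097,152
echo "20B config: $((8 * 512 * 512)) tokens/step"    # = 2,097,152
```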