diff --git a/AMG2023/README.md b/AMG2023/README.md
index 476ad56..14c75c8 100644
--- a/AMG2023/README.md
+++ b/AMG2023/README.md
@@ -1,9 +1,9 @@
 # AMG2023 README
 For more detailed installation parameters, please refer to the [installation document](https://github.com/pssg-int/AMG2023/blob/main/amg-doc.pdf).
 
-## Perlmutter Compilation
+Repository: [AMG2023](https://github.com/hpcgroup/AMG2023/)
 
-Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
+## Perlmutter Compilation
 
 ### Steps to Compile
@@ -50,5 +50,61 @@ Repository: [AMG2023](https://github.com/pssg-int/AMG2023)
    cmake -DHYPRE_PREFIX=/pscratch/sd/c/cunyang/AMG2023 ..
    ```
 
-## Frontier Installation
+## Frontier Compilation
+
+### Steps to Compile
+
+1. Load modules
+   ```sh
+   module reset
+
+   module load cray-mpich/8.1.30
+   module load craype-accel-amd-gfx90a
+   module load rocm/6.1.3
+   export MPICH_GPU_SUPPORT_ENABLED=1
+
+   # load compatible cmake version
+   module load Core/24.07
+   module load cmake/3.27.9
+   ```
+2. Build hypre (v2.32.0)
+   - Clone hypre v2.32.0 and navigate to src:
+     ```sh
+     git clone -b v2.32.0 https://github.com/hypre-space/hypre.git
+     cd hypre/src
+     ```
+   - Configure hypre (in hypre/src)
+     ```sh
+     ./configure --with-hip --enable-device-memory-pool --enable-mixedint --with-gpu-arch=gfx90a \
+       --with-MPI-lib-dirs="${MPICH_DIR}/lib" --with-MPI-libs="mpi" \
+       --with-MPI-include="${MPICH_DIR}/include" \
+       CFLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \
+       LDFLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse"
+     ```
+   - Compile hypre (in hypre/src)
+     ```sh
+     # build with make
+     make
+     ```
+3. Build AMG2023
+   - Clone repo:
+     ```sh
+     git clone https://github.com/pssg-int/AMG2023
+     cd AMG2023
+     ```
+   - Add mpiP to LD_LIBRARY_PATH
+     ```sh
+     export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+     ```
+   - Configure with CMake
+     ```sh
+     mkdir build && cd build
+     cmake .. -DHYPRE_PREFIX=/ccs/home/keshprad/hypre/src/hypre/ \
+       -DCMAKE_C_FLAGS="-I${ROCM_PATH}/include/ -I${ROCM_PATH}/llvm/include/ -I${ROCM_PATH}/include/rocsparse/" \
+       -DCMAKE_EXE_LINKER_FLAGS="-L${ROCM_PATH}/lib/ -L${ROCM_PATH}/llvm/lib/ -lrocsparse -lrocrand"
+     ```
+   - Compile AMG2023 (in AMG2023/build)
+     ```sh
+     make install
+     ```
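Before queueing the 16- and 64-node jobs added below, a quick interactive smoke test of the freshly built binary can catch build or link problems early. The sketch below is illustrative only and not part of this patch: the allocation flags, rank count, and the small `-iter` value are assumptions; the `-P` product must equal the number of MPI ranks, exactly as in the run scripts.

```sh
# illustrative smoke test (not part of this patch); assumes a 1-node interactive
# allocation, e.g. `salloc -A csc569 -N 1 -t 00:10:00`
module load cray-mpich/8.1.30 craype-accel-amd-gfx90a rocm/6.1.3
export MPICH_GPU_SUPPORT_ENABLED=1

# -P px py pz must multiply to the MPI rank count: 2*2*2 = 8 here,
# just as 4*4*8 = 128 in run_frontier_16.sh
srun -N 1 -n 8 --ntasks-per-node 8 --gpus-per-node 8 \
  ./build/amg -P 2 2 2 -n 64 64 64 -problem 1 -iter 10
```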
diff --git a/AMG2023/run_frontier_16.sh b/AMG2023/run_frontier_16.sh
new file mode 100644
index 0000000..c51b52d
--- /dev/null
+++ b/AMG2023/run_frontier_16.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH -N 16
+#SBATCH -n 128
+#SBATCH -q normal
+#SBATCH -J amg
+#SBATCH --gpu-bind none
+#SBATCH -t 00:30:00
+#SBATCH -A csc569
+#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/output-AMG2023.log
+#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/%x-%j/error-AMG2023.log
+#SBATCH --exclusive
+# Run like: sbatch run_frontier_16.sh
+
+OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID
+OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log
+ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log
+
+# Run gpu benchmarks
+COMM_TYPE=mpi
+ROCM_VERSION=6.1.3
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+echo running allreduce benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+# echo running allgather benchmark
+# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+echo running gemm benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+
+APP_ROOT=/ccs/home/keshprad/AMG2023
+cd $APP_ROOT
+
+# reset modules
+echo resetting modules:
+module reset
+# load modules
+echo loading modules:
+module load cray-mpich/8.1.30
+module load craype-accel-amd-gfx90a
+module load rocm/6.1.3
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+export CRAY_ACCEL_TARGET=gfx90a
+export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/
+# mpiP
+export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+export MPIP="-o -f $OUTPUT_DIR"
+
+# log start date
+echo start AMG2023: $(date)
+# define command
+cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \
+  ./build/amg -P 4 4 8 -n 128 64 64 -problem 1 -iter 500"
+echo solving:
+echo $cmd
+$cmd
+# log end date
+echo end AMG2023: $(date)
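The only substantive difference between the 16-node script above and the 64-node script below is the problem decomposition: `#SBATCH -n` and the `-P` grid have to agree. A purely illustrative check of that invariant (the variable names are not taken from the scripts):

```sh
# illustrative consistency check, not part of this patch
NODES=16; RANKS_PER_NODE=8          # 16 nodes x 8 ranks = 128 = 4*4*8
PX=4; PY=4; PZ=8                    # the 64-node script uses 8*8*8 = 512
if [ $((PX * PY * PZ)) -ne $((NODES * RANKS_PER_NODE)) ]; then
  echo "process grid ${PX}x${PY}x${PZ} != $((NODES * RANKS_PER_NODE)) ranks" >&2
fi
```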
diff --git a/AMG2023/run_frontier_64.sh b/AMG2023/run_frontier_64.sh
new file mode 100644
index 0000000..c7a7a3e
--- /dev/null
+++ b/AMG2023/run_frontier_64.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#SBATCH -N 64
+#SBATCH -n 512
+#SBATCH -q normal
+#SBATCH -J amg
+#SBATCH --gpu-bind none
+#SBATCH -t 00:30:00
+#SBATCH -A csc569
+#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/output-AMG2023.log
+#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/%x-%j/error-AMG2023.log
+#SBATCH --exclusive
+# Run like: sbatch run_frontier_64.sh
+
+OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/AMG2023_logs/64nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID
+OUTPUT_FILE=$OUTPUT_DIR/output-AMG2023.log
+ERROR_FILE=$OUTPUT_DIR/error-AMG2023.log
+
+# Run gpu benchmarks
+COMM_TYPE=mpi
+ROCM_VERSION=6.1.3
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+echo running allreduce benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+# echo running allgather benchmark
+# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+echo running gemm benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $OUTPUT_DIR
+
+APP_ROOT=/ccs/home/keshprad/AMG2023
+cd $APP_ROOT
+
+# reset modules
+echo resetting modules:
+module reset
+# load modules
+echo loading modules:
+module load cray-mpich/8.1.30
+module load craype-accel-amd-gfx90a
+module load rocm/6.1.3
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+export CRAY_ACCEL_TARGET=gfx90a
+export HYPRE_INSTALL_DIR=/ccs/home/keshprad/hypre/src/hypre/
+# mpiP
+export LD_LIBRARY_PATH=/ccs/home/keshprad/mpiP:$LD_LIBRARY_PATH
+export MPIP="-o -f $OUTPUT_DIR"
+
+# log start date
+echo start AMG2023: $(date)
+# define command
+cmd="srun --output $OUTPUT_FILE --error $ERROR_FILE \
+  ./build/amg -P 8 8 8 -n 128 64 64 -problem 1 -iter 500"
+echo solving:
+echo $cmd
+$cmd
+# log end date
+echo end AMG2023: $(date)
diff --git a/AMG2023/run_frontier_crontab.sh b/AMG2023/run_frontier_crontab.sh
new file mode 100644
index 0000000..09b0f66
--- /dev/null
+++ b/AMG2023/run_frontier_crontab.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 <num_nodes>"
+  exit 1
+fi
+# `16` or `64`
+NUM_NODES=$1
+
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+
+# load lmod
+source /usr/share/lmod/lmod/init/bash
+# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH
+export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps
+export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles
+
+# run sbatch script
+script=$PERF_VARIABILITY_ROOT/AMG2023/run_frontier_$NUM_NODES\.sh
+sbatch $script
\ No newline at end of file
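run_frontier_crontab.sh is meant to be driven by cron, but no crontab entry ships with the patch. A hypothetical entry might look like the following; the six-hour schedule is purely an assumption.

```sh
# hypothetical crontab entry (edit with `crontab -e` on a Frontier login node);
# the schedule below is an assumption, not part of this patch
0 */6 * * * /bin/bash /ccs/home/keshprad/perf-variability/AMG2023/run_frontier_crontab.sh 16
```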
diff --git a/gpu-benchmarks/README.md b/gpu-benchmarks/README.md
new file mode 100644
index 0000000..c8f9c25
--- /dev/null
+++ b/gpu-benchmarks/README.md
@@ -0,0 +1,14 @@
+# gpu-benchmarks README
+Code Repository: [gpu-benchmarks](#TODO:)
+
+## Perlmutter Compilation
+
+### Steps to Compile
+
+TODO:
+
+## Frontier Compilation
+
+### Steps to Compile
+
+TODO:
\ No newline at end of file
diff --git a/gpu-benchmarks/allgather/run_frontier.sh b/gpu-benchmarks/allgather/run_frontier.sh
new file mode 100644
index 0000000..7fc10b4
--- /dev/null
+++ b/gpu-benchmarks/allgather/run_frontier.sh
@@ -0,0 +1,63 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allgather.sh
+
+#!/bin/bash
+if [ "$#" -ne 4 ]; then
+  echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$2
+# `16` or `64`
+NUM_NODES=$3
+# output directory
+OUTPUT_DIR=$4
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+  MPICH_VERSION=8.1.30
+else
+  MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allgather.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+  module load cray-mpich/${MPICH_VERSION}
+  module load rocm/${ROCM_VERSION}
+  module list
+
+  GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/allgather_$COMM_TYPE\_rocm-${ROCM_VERSION}.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+  MIN_MSG_SIZE=$((1 * 1024))
+  MAX_MSG_SIZE=$((1 * 1024 * 1024))
+  ITERATIONS=100
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start allgather: $(date)
+  # For MPI-bench we should use --gpus-per-node, --gpus-per-task, --ntasks-per-node, and --gpu-bind=none in srun.
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --gpus-per-node 8 \
+    --gpus-per-task 1 \
+    --ntasks-per-node 8 \
+    --gpu-bind none \
+    --output $OUTPUT_FILE \
+    $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end allgather: $(date)
+} &>> $OUTPUT_FILE
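The allgather wrapper can also be exercised by hand from inside an existing allocation, passing the four arguments it parses above. A sketch, not part of this patch, with an assumed output directory:

```sh
# hand-run sketch (not part of this patch); assumes an existing allocation
OUTPUT_DIR=/lustre/orion/csc569/scratch/keshprad/perfvar/manual-allgather  # example path
mkdir -p "$OUTPUT_DIR"
bash /ccs/home/keshprad/perf-variability/gpu-benchmarks/allgather/run_frontier.sh \
  mpi 6.1.3 16 "$OUTPUT_DIR"   # <comm_type> <rocm_version> <num_nodes> <output_dir>
```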
diff --git a/gpu-benchmarks/allreduce/run_frontier.sh b/gpu-benchmarks/allreduce/run_frontier.sh
new file mode 100644
index 0000000..855a486
--- /dev/null
+++ b/gpu-benchmarks/allreduce/run_frontier.sh
@@ -0,0 +1,58 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/allreduce.sh
+
+#!/bin/bash
+if [ "$#" -ne 4 ]; then
+  echo "Usage: $0 <comm_type> <rocm_version> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `mpi` or `rccl`
+COMM_TYPE=$1
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$2
+# `16` or `64`
+NUM_NODES=$3
+# output directory
+OUTPUT_DIR=$4
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+  MPICH_VERSION=8.1.30
+else
+  MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-allreduce.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+  module load cray-mpich/${MPICH_VERSION}
+  module load rocm/${ROCM_VERSION}
+  module list
+
+  GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/allreduce_$COMM_TYPE\_rocm-${ROCM_VERSION}.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+  MIN_MSG_SIZE=$((1 * 1024))
+  MAX_MSG_SIZE=$((1 * 1024 * 1024))
+  ITERATIONS=100
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start allreduce: $(date)
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --output $OUTPUT_FILE \
+    $EXEC $NUM_TASKS $MIN_MSG_SIZE $MAX_MSG_SIZE $ITERATIONS"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end allreduce: $(date)
+} &>> $OUTPUT_FILE
diff --git a/gpu-benchmarks/gemm/run_frontier.sh b/gpu-benchmarks/gemm/run_frontier.sh
new file mode 100644
index 0000000..c5348be
--- /dev/null
+++ b/gpu-benchmarks/gemm/run_frontier.sh
@@ -0,0 +1,56 @@
+# This script assumes it is being run by another sbatch script,
+# so does not include portions for SBATCH vars (e.g. account, time, etc.)
+
+# run like: bash /ccs/home/keshprad/gpu-benchmarks/benchmark/frontier/gemm.sh
+
+#!/bin/bash
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 <rocm_version> <num_nodes> <output_dir>"
+  exit 1
+fi
+# `5.7.1` or `6.1.3`
+ROCM_VERSION=$1
+# `16` or `64`
+NUM_NODES=$2
+# output directory
+OUTPUT_DIR=$3
+
+# setup cray-mpich version
+if [[ "$ROCM_VERSION" == "6.1.3" ]]; then
+  MPICH_VERSION=8.1.30
+else
+  MPICH_VERSION=8.1.28
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/output-gemm.log
+
+{
+  # reset modules
+  echo resetting modules:
+  module reset
+  # load modules
+  echo loading modules:
+  module load PrgEnv-cray craype-accel-amd-gfx90a cpe/23.05 amd/${ROCM_VERSION}
+  module load cray-mpich/${MPICH_VERSION}
+  module load rocm/${ROCM_VERSION}
+  module list
+
+  GPU_BENCHMARKS_ROOT=/lustre/orion/csc569/scratch/keshprad/gpu-benchmarks
+  EXEC=$GPU_BENCHMARKS_ROOT/matmul/frontier/gemm_rocm-${ROCM_VERSION}.x
+  NUM_TASKS=$(($NUM_NODES * 8))
+
+  export MPICH_GPU_SUPPORT_ENABLED=1
+  export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
+
+  echo start gemm: $(date)
+  CMD="srun -N $NUM_NODES -n $NUM_TASKS \
+    --gpus-per-node 8 \
+    --gpus-per-task 1 \
+    --ntasks-per-node 8 \
+    --output $OUTPUT_FILE \
+    $EXEC"
+  echo running:
+  echo $CMD
+  $CMD
+  echo end gemm: $(date)
+} &>> $OUTPUT_FILE
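The gemm wrapper follows the same pattern but takes only three arguments, since it has no communication-backend variant. A hand-run sketch (not part of this patch), reusing the example output directory from the previous sketch:

```sh
# hand-run sketch (not part of this patch)
bash /ccs/home/keshprad/perf-variability/gpu-benchmarks/gemm/run_frontier.sh \
  6.1.3 16 "$OUTPUT_DIR"       # <rocm_version> <num_nodes> <output_dir>
```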
diff --git a/nanoGPT/README.md b/nanoGPT/README.md
index 5c499fc..87e8189 100644
--- a/nanoGPT/README.md
+++ b/nanoGPT/README.md
@@ -1,33 +1,62 @@
-# nanoGPT Setup Instructions
+# nanoGPT README
+For more detailed installation parameters, please refer to the [nanoGPT install guide](https://github.com/axonn-ai/nanoGPT).
-## Clone the Repository
+Repository: [nanoGPT](https://github.com/axonn-ai/nanoGPT)
-```sh
-git clone https://github.com/axonn-ai/nanoGPT.git
-```
+## Perlmutter Setup
-## Create Python Environment
+### Setup steps
-```sh
-./scripts/create_python_env_perlmutter.sh
-```
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
-> Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_perlmutter.sh
+   ```
+   > Note: You may need to modify the path and torch version in `create_python_env_perlmutter.sh`.
-## Load PyTorch Module
+3. Load PyTorch Module
+   ```sh
+   module load pytorch/2.0.1
+   ```
-```sh
-module load pytorch/2.0.1
-```
+4. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
-## Activate the Environment
+5. Download Data
+   ```sh
+   python nanoGPT/data/openwebtext/prepare.py
+   ```
-```sh
-source path_to_nanogptENV/bin/activate
-```
+## Frontier Setup
-## Download Data
+### Setup steps
-```sh
-python nanoGPT/data/openwebtext/prepare.py
-```
\ No newline at end of file
+1. Clone the Repository
+   ```sh
+   git clone https://github.com/axonn-ai/nanoGPT.git
+   cd nanoGPT
+   ```
+
+2. Create Python Environment
+   ```sh
+   ./scripts/create_python_env_frontier.sh
+   ```
+   > Note: You may need to modify the WKSPC path and torch version in `create_python_env_frontier.sh`.
+
+3. Activate the Environment
+   ```sh
+   source path_to_nanogptENV/bin/activate
+   ```
+
+4. Download Data
+   ```sh
+   python data/openwebtext/prepare.py
+   ```
\ No newline at end of file
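Once the Frontier environment above is in place, the training runs added in this patch are submitted with sbatch; the node and GPU counts in the comments below come from the job scripts themselves.

```sh
cd /ccs/home/keshprad/perf-variability/nanoGPT
sbatch run_frontier16.sh   # 16 nodes / 128 GPUs, 5B config
sbatch run_frontier64.sh   # 64 nodes / 512 GPUs, 20B config
```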
diff --git a/nanoGPT/run_frontier16.sh b/nanoGPT/run_frontier16.sh
new file mode 100644
index 0000000..901561e
--- /dev/null
+++ b/nanoGPT/run_frontier16.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#SBATCH -N 16
+#SBATCH -n 128
+#SBATCH -q normal
+#SBATCH -J nanogpt
+#SBATCH --gpu-bind none
+#SBATCH -t 00:30:00
+#SBATCH -A csc569
+#SBATCH --output /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-output.log
+#SBATCH --error /lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/%x-%j/job-error.log
+#SBATCH --exclusive
+# Run like: sbatch run_frontier16.sh
+
+echo "start run: $(date)"
+export JOB_OUTPUT_PATH=/lustre/orion/csc569/scratch/keshprad/perfvar/nanoGPT_logs/16nodes/$SLURM_JOB_NAME-$SLURM_JOB_ID
+OUTPUT_FILE=$JOB_OUTPUT_PATH/output-nanoGPT.log
+ERROR_FILE=$JOB_OUTPUT_PATH/error-nanoGPT.log
+
+export SCRATCH="/lustre/orion/csc569/scratch/keshprad"
+export WRKSPC="${SCRATCH}/nanoGPT"
+export HF_HOME="${SCRATCH}/.cache/hf"
+export HF_TRANSFORMERS_CACHE="${HF_HOME}"
+export HF_DATASETS_CACHE="${HF_HOME}/datasets"
+cd $WRKSPC
+
+# load modules
+ROCM_VERSION=6.1.3
+echo resetting modules:
+module reset
+echo loading modules:
+module load PrgEnv-gnu/8.5.0
+module load rocm/${ROCM_VERSION}
+module load craype-accel-amd-gfx90a
+module load cray-python/3.9.13.1
+module load cray-mpich/8.1.30
+module list
+# activate env
+source ${WRKSPC}/axonn_nanogpt/bin/activate
+
+NNODES=$SLURM_JOB_NUM_NODES
+GPUS=$(( NNODES * 8 ))
+## master addr and port
+# setting variables for torch.distributed
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export WORLD_SIZE=$GPUS
+export OMP_NUM_THREADS=7
+
+## some RCCL env variables
+export FI_CXI_ATS=0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsn0
+export CUDA_VISIBLE_DEVICES=7,6,5,4,3,2,1,0
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+# AWS-OFI-RCCL
+export LD_LIBRARY_PATH="${WRKSPC}/repos/aws-ofi-rccl/lib:$LD_LIBRARY_PATH"
+# other
+export MPICH_GPU_SUPPORT_ENABLED=1
+export GPU_MAX_HW_QUEUES=1
+export OFI_NCCL_USE_IPV6_TCP=1
+
+SCRIPT="train_frontier.py config/train_gpt_neox_5B.py"
+
+# run with profiler
+export WITH_PROFILER=1
+OUTPUT_FILE="$JOB_OUTPUT_PATH/output-nanoGPT.log"
+# log start date
+echo "start nanoGPT: $(date)" &>> $OUTPUT_FILE
+run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT"
+echo $run_cmd &>> $OUTPUT_FILE
+eval $run_cmd &>> $OUTPUT_FILE
+# log end date
+echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE
+
+# Run gpu benchmarks
+COMM_TYPE=rccl
+PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability
+echo running allreduce benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH
+# echo running allgather benchmark
+# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH
+echo running gemm benchmark
+bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH
+
+echo "end run: $(date)"
\ No newline at end of file
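The srun line above prefixes the Python command with scripts/get_rank.sh, which is not included in this patch. A wrapper of that kind usually just maps Slurm's per-process variables onto the ones torch.distributed expects; the sketch below is an assumption about its contents, not the actual script.

```sh
#!/bin/bash
# hypothetical get_rank.sh-style wrapper (the real script may differ)
export RANK=$SLURM_PROCID         # global rank
export LOCAL_RANK=$SLURM_LOCALID  # rank within the node (0-7 on Frontier)
# MASTER_ADDR, MASTER_PORT and WORLD_SIZE are already exported by the sbatch script
exec "$@"
```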
$(date)" &>> $OUTPUT_FILE +run_cmd="srun -N $NNODES -n $GPUS --cpu-bind=cores --gpus-per-node=8 --ntasks-per-node=8 scripts/get_rank.sh python -u $SCRIPT" +echo $run_cmd &>> $OUTPUT_FILE +eval $run_cmd &>> $OUTPUT_FILE +# log end date +echo "end nanoGPT: $(date)" &>> $OUTPUT_FILE + +# Run gpu benchmarks +COMM_TYPE=rccl +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability +echo running allreduce benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allreduce/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +# echo running allgather benchmark +# bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/allgather/run_frontier.sh $COMM_TYPE $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH +echo running gemm benchmark +bash $PERF_VARIABILITY_ROOT/gpu-benchmarks/gemm/run_frontier.sh $ROCM_VERSION $SLURM_JOB_NUM_NODES $JOB_OUTPUT_PATH + +echo "end run: $(date)" \ No newline at end of file diff --git a/nanoGPT/run_frontier_crontab.sh b/nanoGPT/run_frontier_crontab.sh new file mode 100644 index 0000000..dcc8cf5 --- /dev/null +++ b/nanoGPT/run_frontier_crontab.sh @@ -0,0 +1,19 @@ +#!/bin/bash +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi +# `16` or `64` +NUM_NODES=$1 + +PERF_VARIABILITY_ROOT=/ccs/home/keshprad/perf-variability + +# load lmod +source /usr/share/lmod/lmod/init/bash +# load default LMOD_SYSTEM_DEFAULT_MODULES and MODULEPATH +export LMOD_SYSTEM_DEFAULT_MODULES=craype-x86-trento:craype-network-ofi:perftools-base:xpmem:cray-pmi:PrgEnv-cray:DefApps +export MODULEPATH=/sw/frontier/spack-envs/modules/cce/17.0.0/cray-mpich-8.1.28/cce-17.0.0:/sw/frontier/spack-envs/modules/cce/17.0.0/cce-17.0.0:/sw/frontier/spack-envs/modules/Core/24.07:/opt/cray/pe/lmod/modulefiles/mpi/crayclang/17.0/ofi/1.0/cray-mpich/8.0:/opt/cray/pe/lmod/modulefiles/comnet/crayclang/17.0/ofi/1.0:/opt/cray/pe/lmod/modulefiles/compiler/crayclang/17.0:/opt/cray/pe/lmod/modulefiles/mix_compilers:/opt/cray/pe/lmod/modulefiles/perftools/23.12.0:/opt/cray/pe/lmod/modulefiles/net/ofi/1.0:/opt/cray/pe/lmod/modulefiles/cpu/x86-trento/1.0:/opt/cray/pe/modulefiles/Linux:/opt/cray/pe/modulefiles/Core:/opt/cray/pe/lmod/lmod/modulefiles/Core:/opt/cray/pe/lmod/modulefiles/core:/opt/cray/pe/lmod/modulefiles/craype-targets/default:/sw/frontier/modulefiles:/opt/cray/modulefiles + +# run sbatch script +script=$PERF_VARIABILITY_ROOT/nanoGPT/run_frontier$NUM_NODES\.sh +sbatch $script \ No newline at end of file diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py new file mode 100644 index 0000000..cf7b91f --- /dev/null +++ b/nanoGPT/train_gpt_neox_20B_frontier.py @@ -0,0 +1,46 @@ +# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB +# launch as the following (e.g. in a screen session) and wait ~5 days: +# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py + +wandb_log = False +wandb_project = 'owt' +wandb_run_name='gpt2-124M' + +# these make the total batch size be ~0.5M +# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520 +batch_size = 8 +block_size = 512 +gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus + +# model +n_layer = 32 +n_head = 56 +n_embd = 7168 +dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ +bias = False # do we use bias inside LayerNorm and Linear layers? 
diff --git a/nanoGPT/train_gpt_neox_20B_frontier.py b/nanoGPT/train_gpt_neox_20B_frontier.py
new file mode 100644
index 0000000..cf7b91f
--- /dev/null
+++ b/nanoGPT/train_gpt_neox_20B_frontier.py
@@ -0,0 +1,46 @@
+# config for training a ~20B-parameter GPT-NeoX-style model with AxoNN on Frontier,
+# adapted from nanoGPT's example train_gpt2.py config;
+# passed to train_frontier.py as its config file (see the run_frontier*.sh scripts)
+
+wandb_log = False
+wandb_project = 'owt'
+wandb_run_name='gpt2-124M'
+
+# tokens per optimizer step = batch_size * block_size * gradient_accumulation_steps
+# (gradient_accumulation_steps is the total across GPUs: per_gpu x num_gpus)
+batch_size = 8
+block_size = 512
+gradient_accumulation_steps = 1 * 512 #per_gpu x num_gpus
+
+# model
+n_layer = 32
+n_head = 56
+n_embd = 7168
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+bias = False # do we use bias inside LayerNorm and Linear layers?
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+max_iters = 30 # total number of training iterations
+
+# axonn params
+G_intra_d=16
+G_intra_c=1
+G_intra_r=1
+compile=False # disable compile for axonn
+gradient_checkpointing=True
+
+# lr decay schedule (max_iters is also set above)
+max_iters = 30
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 1
+log_interval = 10
+
+# weight decay
+weight_decay = 1e-1
+
+# log every iteration
+log_interval=1
\ No newline at end of file
diff --git a/nanoGPT/train_gpt_neox_5B_frontier.py b/nanoGPT/train_gpt_neox_5B_frontier.py
new file mode 100644
index 0000000..4ce7b55
--- /dev/null
+++ b/nanoGPT/train_gpt_neox_5B_frontier.py
@@ -0,0 +1,46 @@
+# config for training a ~5B-parameter GPT-NeoX-style model with AxoNN on Frontier,
+# adapted from nanoGPT's example train_gpt2.py config;
+# passed to train_frontier.py as its config file (see the run_frontier*.sh scripts)
+
+wandb_log = False
+wandb_project = 'owt'
+wandb_run_name='gpt2-124M'
+
+# tokens per optimizer step = batch_size * block_size * gradient_accumulation_steps
+# (gradient_accumulation_steps is the total across GPUs: per_gpu x num_gpus)
+batch_size = 16
+block_size = 512
+gradient_accumulation_steps = 2 * 128 #per_gpu x num_gpus
+
+# model
+n_layer = 24
+n_head = 32
+n_embd = 4096
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+bias = False # do we use bias inside LayerNorm and Linear layers?
+
+# adamw optimizer
+learning_rate = 1e-4 # max learning rate
+max_iters = 30 # total number of training iterations
+
+# axonn params
+G_intra_d=16
+G_intra_c=1
+G_intra_r=1
+compile=False # disable compile for axonn
+gradient_checkpointing=True
+
+# lr decay schedule (max_iters is also set above)
+max_iters = 30
+lr_decay_iters = 600000
+
+# eval stuff
+eval_interval = 1000
+eval_iters = 1
+log_interval = 10
+
+# weight decay
+weight_decay = 1e-1
+
+# log every iteration
+log_interval=1
\ No newline at end of file
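For reference, the effective tokens processed per optimizer step implied by the two configs (batch_size x block_size x gradient_accumulation_steps); the arithmetic below is illustrative and not part of the patch.

```sh
echo "5B  config: $((16 * 512 * 256)) tokens/step"   # = 2,097,152
echo "20B config: $((8 * 512 * 512)) tokens/step"    # = 2,097,152
```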