Skip to content

Scripts for tokenizer test, exp 0138 #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions experiments/v4-hq_tokenizer_test/common/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Script for setting up the trainer environment.
# Intended to be sourced by job scripts: loads HPC modules and exports
# CUDA/cuDNN paths plus debug/logging flags for NCCL and Transformer Engine.

module load cuda/12.8/12.8.1
module load cudnn/9.5/9.5.1
module load hpcx/2.20
module load nccl/2.25/2.25.1-1
# (cliu) Only for cuda/12.8; there is no folder for cuda/12.8 in cudnn/9.5.1
export CUDNN_HOME=/apps/cudnn/9.5.1/cuda12.0
export CUDNN_PATH="${CUDNN_HOME}"
# Reuse CUDNN_HOME instead of repeating the literal path; quote existing
# values and default them to empty so this works under `set -u` even when
# the variables are not yet set.
export LD_LIBRARY_PATH="${CUDNN_HOME}/lib:${LD_LIBRARY_PATH:-}"
export CPATH="${CUDNN_HOME}/include:${CPATH:-}"
export LIBRARY_PATH="${CUDNN_HOME}/lib:${LIBRARY_PATH:-}"
# Run `module list` directly: wrapping it in echo $(...) word-splits its
# output and loses it entirely when the module system writes to stderr.
module list

## Debug/logging flags
export LOGLEVEL=INFO
# export NCCL_DEBUG=WARN
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=WARN
export PYTHONFAULTHANDLER=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export CUDA_LAUNCH_BLOCKING=0
export CUDNN_LOGDEST_DBG=stderr
export CUDNN_LOGERR_DBG=1

export NVTE_FLASH_ATTN=1
export NVTE_DEBUG=1
export NVTE_DEBUG_LEVEL=2
102 changes: 102 additions & 0 deletions experiments/v4-hq_tokenizer_test/installer/install_megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/bin/bash
#PBS -P gcg51557
#PBS -q R9920251000
#PBS -v RTYPE=rt_HF
#PBS -l select=2
#PBS -l walltime=4:00:00
#PBS -k n
#PBS -N 0163_install

# Build an isolated training stack under EXP_DIR/src: CPython 3.12.8,
# a venv, torch nightly (cu128), apex, flash-attention 3 ("hopper"),
# TransformerEngine, and the Megatron-LM dataset helper extension.

set -euo pipefail

EXP_DIR="/groups/gcg51557/experiments/0163_math_midtraining"
SCRIPT_DIR="${EXP_DIR}/scripts/experiments/v4-hq_tokenizer_test/installer"

mkdir -p "${EXP_DIR}/logs/installer"

TIMESTAMP=$(date +%Y%m%d%H%M%S)
JOBID=${PBS_JOBID%%.*}
LOGFILE="${EXP_DIR}/logs/installer/${TIMESTAMP}-${JOBID}.out"
ERRFILE="${EXP_DIR}/logs/installer/${TIMESTAMP}-${JOBID}.err"
# Redirect all subsequent stdout/stderr into per-job log files.
exec > "${LOGFILE}" 2> "${ERRFILE}"

source "${SCRIPT_DIR}/../common/setup.sh"

cd "${EXP_DIR}"
mkdir -p src
pushd src

echo "Install Python"
mkdir -p python
git clone https://github.com/python/cpython -b v3.12.8
# Use a dedicated prefix variable, NOT PYTHONPATH: PYTHONPATH is Python's
# module search path, and assigning it here could leak into every
# python/pip invocation below and break their imports.
PYTHON_PREFIX=$(pwd)/python
pushd cpython
./configure --prefix="${PYTHON_PREFIX}" --enable-optimizations
make -j 64
make altinstall
popd

echo "Setup venv"
"${PYTHON_PREFIX}/bin/python3.12" -m venv ../venv
source ../venv/bin/activate
pip install --upgrade pip

echo "Install torch"
# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128

echo "Install requirements"
pip install -r "${SCRIPT_DIR}/requirements.txt"

echo "Install apex"
git clone --recurse-submodules https://github.com/NVIDIA/apex
pushd apex
pip install -v \
    --no-cache-dir \
    --no-build-isolation \
    --config-settings "--build-option=--cpp_ext" \
    --config-settings "--build-option=--cuda_ext" \
    ./
popd

echo "Install flash-attn"
git clone https://github.com/Dao-AILab/flash-attention.git
pushd flash-attention
# Pin to a known commit; the FA3 build lives in hopper/ (the `cd` persists
# until the matching popd, which is intentional).
git checkout 27f501d && cd hopper/ && python setup.py install
# setup.py does not ship the Python-side interface; fetch the matching
# file into site-packages so it can be imported as flash_attn_3.
python_site=$(python -c "import site; print(site.getsitepackages()[0])")
mkdir -p "${python_site}/flash_attn_3"
wget -P "${python_site}/flash_attn_3" https://raw.githubusercontent.com/Dao-AILab/flash-attention/27f501dbe011f4371bff938fe7e09311ab3002fa/hopper/flash_attn_interface.py
popd

# pip install \
#     --no-build-isolation \
#     --no-cache-dir \
#     flash-attn

echo "Install transformer_engine"
git clone --branch stable --recursive https://github.com/NVIDIA/TransformerEngine.git
pushd TransformerEngine
export NVTE_FRAMEWORK=pytorch
pip install .
popd

# pip install \
#     --no-build-isolation \
#     --no-cache-dir \
#     transformer_engine[pytorch]

echo "Install Megatron-LM"
git clone https://github.com/llm-jp/Megatron-LM -b v4-old
pushd Megatron-LM/megatron/core/datasets
# Compile the dataset helper extension by hand. The pybind11 substitution
# must stay unquoted inside the array so its multiple -I flags word-split
# into separate elements.
MEGATRON_HELPER_CPPFLAGS=(
    -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color
    $(python -m pybind11 --includes)
)
MEGATRON_HELPER_EXT=$("${PYTHON_PREFIX}/bin/python3.12-config" --extension-suffix)
g++ "${MEGATRON_HELPER_CPPFLAGS[@]}" helpers.cpp -o "helpers_cpp${MEGATRON_HELPER_EXT}"
popd

deactivate
popd

echo "Done"
14 changes: 14 additions & 0 deletions experiments/v4-hq_tokenizer_test/installer/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Pinned Python build/runtime dependencies for the experiment venv.
# Installed by installer/install_megatron.sh ("Install requirements" step);
# torch/apex/flash-attn/TransformerEngine are built separately in that script.
accelerate==1.3.0
cmake==3.31.4
einops==0.8.1
ninja==1.11.1.3
numpy==2.1.2
packaging==24.2
pybind11==2.13.6
regex==2024.11.6
safetensors==0.5.2
sentencepiece==0.2.0
six==1.17.0
transformers==4.48.3
wandb==0.19.3
wheel==0.45.1