@@ -0,0 +1,23 @@
common:
  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"

datasets:
  code:
    basedir: "v3_0_info_root"
    file: "2024_0410_code.sakura_home.csv"
    repeat: 0.1014
  en:
    basedir: "v3_0_info_root"
    file: "2024_0410_en.sakura_home.csv"
    repeat: 0.1014
  ja_cc1:
    file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv"
    filter:
      - train/ja/cc-1
    repeat: 0.4318
  ja_wiki:
    basedir: "v3_0_info_root"
    file: "2024_0410_ja.sakura_home.csv"
    filter:
      - train/ja/wiki
    repeat: 2
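Note that basedir values such as "v3_0_info_root" are not literal paths: they appear to name keys under common that the (unshown) data-loading script resolves and joins with file, while entries like ja_cc1 that omit basedir give an absolute path directly. A minimal bash sketch of that assumed resolution, with variable names invented purely for illustration:

#!/bin/bash
# Assumed behavior of the config consumer; the names below are illustrative only.
V3_0_INFO_ROOT="/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"

# datasets.code and datasets.en: basedir key resolved, then joined with file
CODE_CSV="${V3_0_INFO_ROOT}/2024_0410_code.sakura_home.csv"
EN_CSV="${V3_0_INFO_ROOT}/2024_0410_en.sakura_home.csv"

# datasets.ja_cc1: no basedir, so file is already the full path
JA_CC1_CSV="/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv"

echo "code   -> ${CODE_CSV}"
echo "en     -> ${EN_CSV}"
echo "ja_cc1 -> ${JA_CC1_CSV}"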
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp1B_cc1_asc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# Change this directory if each experiment is to be handled as its own experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_asc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

mpirun \
  -np $NUM_GPUS \
  --npernode $NUM_GPUS_PER_NODE \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
  -x SCRIPT_ROOT=$SCRIPT_ROOT \
  -x CONF_DIR=$CONF_DIR \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  bash ${SCRIPT_ROOT}/train-1.7b.sh
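Two derivations in the script above merit a note: the rendezvous port is kept in the 10000-59999 range by taking the job ID modulo 50000, and SLURM_TASKS_PER_NODE is reported by Slurm in a compressed form such as "8(x4)" (8 tasks on each of 4 nodes), which is why cut -d '(' -f 1 yields the per-node GPU count. A standalone sketch with assumed example values, runnable outside a Slurm allocation:

#!/bin/bash
# Example values only; inside a job these variables are set by Slurm.
SLURM_JOBID=123456
SLURM_TASKS_PER_NODE="8(x4)"   # Slurm's compressed form for 8 tasks on each of 4 nodes

MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))                      # 33456, always within 10000..59999
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)   # "8"

echo "MASTER_PORT=${MASTER_PORT}"
echo "NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE}"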
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=0022_13b-high-quality-cpt-exp1B_cc1_asc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=8
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# Change this directory if each experiment is to be handled as its own experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_asc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

mpirun \
  -np $NUM_GPUS \
  --npernode $NUM_GPUS_PER_NODE \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
  -x SCRIPT_ROOT=$SCRIPT_ROOT \
  -x CONF_DIR=$CONF_DIR \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  bash ${SCRIPT_ROOT}/train-13b.sh
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp1B_cc1_asc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# Change this directory if each experiment is to be handled as its own experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_asc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

mpirun \
  -np $NUM_GPUS \
  --npernode $NUM_GPUS_PER_NODE \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
  -x SCRIPT_ROOT=$SCRIPT_ROOT \
  -x CONF_DIR=$CONF_DIR \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  bash ${SCRIPT_ROOT}/train-3.7b.sh
@@ -0,0 +1,23 @@
common:
  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"

datasets:
  code:
    basedir: "v3_0_info_root"
    file: "2024_0410_code.sakura_home.csv"
    repeat: 0.1014
  en:
    basedir: "v3_0_info_root"
    file: "2024_0410_en.sakura_home.csv"
    repeat: 0.1014
  ja_cc1:
    file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_desc/token_info.csv"
    filter:
      - train/ja/cc-1
    repeat: 0.4318
  ja_wiki:
    basedir: "v3_0_info_root"
    file: "2024_0410_ja.sakura_home.csv"
    filter:
      - train/ja/wiki
    repeat: 2
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp1B_cc1_desc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# Change this directory if each experiment is to be handled as its own experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_desc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

mpirun \
  -np $NUM_GPUS \
  --npernode $NUM_GPUS_PER_NODE \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
  -x SCRIPT_ROOT=$SCRIPT_ROOT \
  -x CONF_DIR=$CONF_DIR \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  bash ${SCRIPT_ROOT}/train-1.7b.sh
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=0022_13b-high-quality-cpt-exp1B_cc1_desc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=8
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# Change this directory if each experiment is to be handled as its own experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_desc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

mpirun \
  -np $NUM_GPUS \
  --npernode $NUM_GPUS_PER_NODE \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
  -x SCRIPT_ROOT=$SCRIPT_ROOT \
  -x CONF_DIR=$CONF_DIR \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  bash ${SCRIPT_ROOT}/train-13b.sh
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp1B_cc1_desc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# Change this directory if each experiment is to be handled as its own experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_desc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

source ${ENV_DIR}/scripts/environment.sh
source ${ENV_DIR}/venv/bin/activate

export MASTER_ADDR=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
NUM_GPUS_PER_NODE=$(echo $SLURM_TASKS_PER_NODE | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

mpirun \
  -np $NUM_GPUS \
  --npernode $NUM_GPUS_PER_NODE \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR=$EXPERIMENT_DIR \
  -x SCRIPT_ROOT=$SCRIPT_ROOT \
  -x CONF_DIR=$CONF_DIR \
  -x MASTER_ADDR=$MASTER_ADDR \
  -x MASTER_PORT=$MASTER_PORT \
  bash ${SCRIPT_ROOT}/train-3.7b.sh
30 changes: 30 additions & 0 deletions pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml
@@ -0,0 +1,30 @@
common:
  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"
  v3_1_info_root: "/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1/token_info"

datasets:
  en:
    basedir: "v3_0_info_root"
    file: "2024_0410_en.sakura_home.csv"
    repeat: 0.1658
  ja_v3_1_pdf00:
    basedir: "v3_1_info_root"
    file: "2024_0718_ja_train2.sakura_home.csv"
    filter:
      - "train2/ja/warp-pdf-e00"
    repeat: 0.1043
  ja_v3_1_pdf02:
    basedir: "v3_1_info_root"
    file: "2024_0718_ja_train2.sakura_home.csv"
    filter:
      - "train2/ja/warp-pdf-e02"
    repeat: 0.0522
  ja_other:
    basedir: "v3_0_info_root"
    file: "2024_0410_ja.sakura_home.csv"
    filter:
      - train/ja/cc
      - train/ja/kaken
      - train/ja/warp-html
      - train/ja/wiki
    repeat: 0.1043