Skip to content

Commit 51417d4

Browse files
committed
improve debug_local.sh (switch between flavor selection + additional flags for compilation and selective checkpointing)
1 parent d78b0e6 commit 51417d4

File tree

3 files changed

+36
-17
lines changed

3 files changed

+36
-17
lines changed

torchtitan/experiments/transformers_modeling_backend/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,8 @@
3636
"debugperf_large": HFTransformerModelArgs(
3737
titan_dense_args=TitanDenseModelArgs(
3838
dim=1024,
39-
n_layers=12,
40-
n_heads=16,
41-
n_kv_heads=16,
39+
n_layers=24,
40+
n_kv_heads=32,
4241
vocab_size=32000,
4342
rope_theta=500000,
4443
),

torchtitan/experiments/transformers_modeling_backend/tooling_dev/debug_local.sh

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,27 @@
11
#!/usr/bin/bash
22

3+
# Check if flavor is provided
4+
if [ -z "$1" ]; then
5+
echo "Usage: $0 <flavor> [--compile] [--full_ac]"
6+
echo "Example: $0 debugperf_large"
7+
echo "Example: $0 debugperf_large --compile"
8+
exit 1
9+
fi
10+
11+
FLAVOR=$1
12+
COMPILE_FLAG=""
13+
FULL_AC_FLAG=""
14+
15+
# Check for flags
16+
for arg in "$@"; do
17+
if [ "$arg" = "--compile" ]; then
18+
COMPILE_FLAG="--enable_compile"
19+
fi
20+
if [ "$arg" = "--full_ac" ]; then
21+
FULL_AC_FLAG="--enable_full_ac"
22+
fi
23+
done
24+
325
# Shared model configuration for fair comparison
426
VOCAB_SIZE=2048
527
N_LAYERS=6
@@ -20,35 +42,35 @@ model_names=(
2042
for model_name in "${tt_model_names[@]}"; do
2143
rm -rf debug_local_results/${model_name}
2244

23-
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type torchtitan --enable_profiling --profile_freq 5
24-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
25-
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
45+
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor "$FLAVOR" --model_type torchtitan --enable_profiling --profile_freq 5 $COMPILE_FLAG $FULL_AC_FLAG
46+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint --qos high
47+
while [ ! -f debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt)" != "completed" ]; do
2648
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
2749
sleep 1
2850
done
29-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
51+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR} --qos high
3052
echo "================"
3153
done
3254

3355
for model_name in "${model_names[@]}"; do
3456
rm -rf debug_local_results/${model_name}
3557

36-
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" --enable_profiling --profile_freq 5
37-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
38-
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
58+
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor "$FLAVOR" --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" --enable_profiling --profile_freq 5 $COMPILE_FLAG $FULL_AC_FLAG
59+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint --qos high
60+
while [ ! -f debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt)" != "completed" ]; do
3961
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
4062
sleep 1
4163
done
42-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
64+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR} --qos high
4365
echo "================"
4466
done
4567

4668
# for model_name in "${moe_model_names[@]}"; do
4769
# rm -rf debug_local_results/${model_name}
4870

49-
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large
50-
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
51-
# while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
71+
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor $FLAVOR
72+
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint --qos high
73+
# while [ ! -f debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt)" != "completed" ]; do
5274
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
5375
# sleep 1
5476
# done

torchtitan/models/llama3/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,7 @@
3232
),
3333
"debugperf_large": TransformerModelArgs(
3434
dim=1024,
35-
n_layers=12,
36-
n_heads=16,
37-
n_kv_heads=16,
35+
n_layers=24,
3836
vocab_size=32000,
3937
rope_theta=500000,
4038
),

0 commit comments

Comments
 (0)