Skip to content

Commit 51417d4

Browse files
committed
improve debug_local.sh (switch between flavor selection + additional flags for compilation and selective checkpointing)
1 parent d78b0e6 commit 51417d4

File tree

3 files changed

+36
-17
lines changed

3 files changed

+36
-17
lines changed

torchtitan/experiments/transformers_modeling_backend/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,8 @@
3636
"debugperf_large": HFTransformerModelArgs(
3737
titan_dense_args=TitanDenseModelArgs(
3838
dim=1024,
39-
n_layers=12,
40-
n_heads=16,
41-
n_kv_heads=16,
39+
n_layers=24,
40+
n_kv_heads=32,
4241
vocab_size=32000,
4342
rope_theta=500000,
4443
),

torchtitan/experiments/transformers_modeling_backend/tooling_dev/debug_local.sh

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,27 @@
11
#!/usr/bin/bash
22

3+
# Check if flavor is provided
4+
if [ -z "$1" ]; then
5+
echo "Usage: $0 <flavor> [--compile] [--full_ac]"
6+
echo "Example: $0 debugperf_large"
7+
echo "Example: $0 debugperf_large --compile"
8+
exit 1
9+
fi
10+
11+
FLAVOR=$1
12+
COMPILE_FLAG=""
13+
FULL_AC_FLAG=""
14+
15+
# Check for flags
16+
for arg in "$@"; do
17+
if [ "$arg" = "--compile" ]; then
18+
COMPILE_FLAG="--enable_compile"
19+
fi
20+
if [ "$arg" = "--full_ac" ]; then
21+
FULL_AC_FLAG="--enable_full_ac"
22+
fi
23+
done
24+
325
# Shared model configuration for fair comparison
426
VOCAB_SIZE=2048
527
N_LAYERS=6
@@ -20,35 +42,35 @@ model_names=(
2042
for model_name in "${tt_model_names[@]}"; do
2143
rm -rf debug_local_results/${model_name}
2244

23-
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type torchtitan --enable_profiling --profile_freq 5
24-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
25-
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
45+
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor "$FLAVOR" --model_type torchtitan --enable_profiling --profile_freq 5 $COMPILE_FLAG $FULL_AC_FLAG
46+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint --qos high
47+
while [ ! -f debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt)" != "completed" ]; do
2648
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
2749
sleep 1
2850
done
29-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
51+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR} --qos high
3052
echo "================"
3153
done
3254

3355
for model_name in "${model_names[@]}"; do
3456
rm -rf debug_local_results/${model_name}
3557

36-
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" --enable_profiling --profile_freq 5
37-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
38-
while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
58+
python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor "$FLAVOR" --model_type transformers_modeling_backend --hf_assets_path "/fsx/ferdinandmom/ferdinand-hf/huggingface/torchtitan/tests/assets/tokenizer" --enable_profiling --profile_freq 5 $COMPILE_FLAG $FULL_AC_FLAG
59+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint --qos high
60+
while [ ! -f debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt)" != "completed" ]; do
3961
echo "Waiting for seed checkpoint from ${model_name} to complete ..."
4062
sleep 1
4163
done
42-
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large --qos high
64+
python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR} --qos high
4365
echo "================"
4466
done
4567

4668
# for model_name in "${moe_model_names[@]}"; do
4769
# rm -rf debug_local_results/${model_name}
4870

49-
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor debugperf_large
50-
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/debugperf_large/seed_checkpoint --qos high
51-
# while [ ! -f debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/debugperf_large/seed_checkpoint/status.txt)" != "completed" ]; do
71+
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py create_configs --model_name "$model_name" --out_dir debug_local_results --flavor $FLAVOR
72+
# USE_MOE=1 python ./tooling_dev/test_hf_integration.py submit_jobs --inp_dir debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint --qos high
73+
# while [ ! -f debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt ] || [ "$(cat debug_local_results/${model_name}/${FLAVOR}/seed_checkpoint/status.txt)" != "completed" ]; do
5274
# echo "Waiting for seed checkpoint from ${model_name} to complete ..."
5375
# sleep 1
5476
# done

torchtitan/models/llama3/__init__.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,7 @@
3232
),
3333
"debugperf_large": TransformerModelArgs(
3434
dim=1024,
35-
n_layers=12,
36-
n_heads=16,
37-
n_kv_heads=16,
35+
n_layers=24,
3836
vocab_size=32000,
3937
rope_theta=500000,
4038
),

0 commit comments

Comments
 (0)