@@ -101,19 +101,31 @@ def _make_job_command(commit_sha, branch=None):
101101 if LOCAL_VOLUME :
102102 data_setup = """
103103CACHE_DIR=/mnt/pgolf-data/pgolf-cache
104- if [ -f "$CACHE_DIR/.download_complete_sp4096" ]; then
105- echo "Using cached SP4096 data from node-local volume"
104+ if [ -f "$CACHE_DIR/.download_complete" ]; then
105+ echo "Using cached data from node-local volume"
106+ export DATA_PATH=${DATA_PATH:-$CACHE_DIR/datasets/fineweb10B_sp1024}
107+ export TOKENIZER_PATH=${TOKENIZER_PATH:-$CACHE_DIR/tokenizers/fineweb_1024_bpe.model}
106108else
107- python data/cached_challenge_fineweb.py --train-shards 80 --variant sp4096
108- mkdir -p $CACHE_DIR && cp -r data/datasets data/tokenizers $CACHE_DIR/ && touch $CACHE_DIR/.download_complete_sp4096
109+ python data/cached_challenge_fineweb.py --train-shards 80
110+ mkdir -p $CACHE_DIR && cp -r data/datasets data/tokenizers $CACHE_DIR/ && touch $CACHE_DIR/.download_complete
111+ export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp1024}
112+ export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_1024_bpe.model}
109113fi
110114"""
111115 else :
112116 data_setup = """
113- if [ ! -f "data/datasets/.download_complete_sp4096" ]; then
114- python data/cached_challenge_fineweb.py --train-shards 80 --variant sp4096
115- touch data/datasets/.download_complete_sp4096
117+ # Auto-detect vocab size from train_gpt.py (default sp1024, supports sp4096+)
118+ VOCAB=$(grep -oP "VOCAB_SIZE['\" ],\\ s*\\ K[0-9]+" train_gpt.py 2>/dev/null || echo "1024")
119+ [ "$VOCAB" = "" ] && VOCAB=1024
120+ SHARDS=80
121+ [ "$VOCAB" -gt 1024 ] && SHARDS=143
122+ echo "data_setup: vocab=$VOCAB shards=$SHARDS"
123+ if [ ! -f "data/datasets/.download_complete_sp${VOCAB}" ]; then
124+ python data/cached_challenge_fineweb.py --variant sp${VOCAB} --train-shards $SHARDS
125+ touch "data/datasets/.download_complete_sp${VOCAB}"
116126fi
127+ export DATA_PATH=${DATA_PATH:-./data/datasets/fineweb10B_sp${VOCAB}}
128+ export TOKENIZER_PATH=${TOKENIZER_PATH:-./data/tokenizers/fineweb_${VOCAB}_bpe.model}
117129"""
118130
119131 clone_setup = f"""
0 commit comments