Skip to content

Commit

Permalink
fix path in train data script (#275)
Browse files (browse the repository at this point in the history)
  • Loading branch information
hamishivi authored Aug 19, 2024
1 parent 2627b69 commit a9c76a4
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions scripts/data/prepare_train_data.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -67,13 +67,13 @@ echo "Downloading ShareGPT dataset..."
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
echo "Splitting the ShareGPT dataset with 2048 max tokens per conversation..."
- python scripts/split_sharegpt_conversations.py \
+ python scripts/data/split_sharegpt_conversations.py \
--in-files data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json \
--out-file data/raw_train/sharegpt/sharegpt_html_cleaned_and_split_2048.json \
--model-name-or-path oobabooga/llama-tokenizer \
--max-length 2048
echo "Splitting the ShareGPT dataset with 4096 max tokens per conversation..."
- python scripts/split_sharegpt_conversations.py \
+ python scripts/data/split_sharegpt_conversations.py \
--in-files data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json \
--out-file data/raw_train/sharegpt/sharegpt_html_cleaned_and_split_4096.json \
--model-name-or-path oobabooga/llama-tokenizer \
Expand Down

0 comments on commit a9c76a4

Please sign in to comment.