Skip to content

Commit c3f8d58

Browse files
committed
tests : test-tokenizer-0.sh print more info (ggml-org#7402)
1 parent 11474e7 commit c3f8d58

File tree

3 files changed

+8
-3
lines changed

3 files changed

+8
-3
lines changed

convert-hf-to-gguf-update.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ class TOKENIZER_TYPE(IntEnum):
7272
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
7373
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
7474
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
75-
{"name": "stablelm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
75+
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
7676
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
7777
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
7878
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },

convert-hf-to-gguf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
447447
# ref: https://huggingface.co/openai-community/gpt2
448448
res = "gpt-2"
449449
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
450-
# ref: https://huggingface.co/stabilityai/stablelm-2-1_6b
450+
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
451451
res = "stablelm2"
452452
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
453453
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base

tests/test-tokenizer-0.sh

+6-1
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,15 @@ make -j tests/test-tokenizer-0
1717

1818
printf "Testing %s on %s ...\n" $name $input
1919

20+
set -e
21+
22+
printf "Tokenizing using (py) Python AutoTokenizer ...\n"
2023
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
21-
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
2224

25+
printf "Tokenizing using (cpp) llama.cpp ...\n"
2326
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
27+
28+
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
2429
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
2530

2631
diff $input.tok $input.tokcpp > /dev/null 2>&1

0 commit comments

Comments (0)