Skip to content

Commit 58cf8f6

Browse files
author
Lőrinc
committed
Add possessive quantifiers to legacy encodings as well
1 parent db3155c commit 58cf8f6

File tree

2 files changed

+11
-8
lines changed

2 files changed

+11
-8
lines changed

tests/test_encoding.py

+1
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
1212

1313

14+
@pytest.mark.skip(reason="Takes a really long time to finish, but was added to reproduce a crash.")
1415
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
1516
def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
1617
enc = make_enc()

tiktoken_ext/openai_public.py

+10-8
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,11 @@
66
FIM_SUFFIX = "<|fim_suffix|>"
77
ENDOFPROMPT = "<|endofprompt|>"
88

9+
# The pattern in the original GPT-2 release is:
10+
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
11+
# This is equivalent, but executes faster:
12+
_legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++"""
13+
914

1015
def gpt2():
1116
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
@@ -17,10 +22,7 @@ def gpt2():
1722
return {
1823
"name": "gpt2",
1924
"explicit_n_vocab": 50257,
20-
# The pattern in the original GPT-2 release is:
21-
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
22-
# This is equivalent, but executes faster:
23-
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
25+
"pat_str": _legacy_splitter_regex,
2426
"mergeable_ranks": mergeable_ranks,
2527
"special_tokens": {ENDOFTEXT: 50256},
2628
}
@@ -34,7 +36,7 @@ def r50k_base():
3436
return {
3537
"name": "r50k_base",
3638
"explicit_n_vocab": 50257,
37-
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
39+
"pat_str": _legacy_splitter_regex,
3840
"mergeable_ranks": mergeable_ranks,
3941
"special_tokens": {ENDOFTEXT: 50256},
4042
}
@@ -48,7 +50,7 @@ def p50k_base():
4850
return {
4951
"name": "p50k_base",
5052
"explicit_n_vocab": 50281,
51-
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
53+
"pat_str": _legacy_splitter_regex,
5254
"mergeable_ranks": mergeable_ranks,
5355
"special_tokens": {ENDOFTEXT: 50256},
5456
}
@@ -62,7 +64,7 @@ def p50k_edit():
6264
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
6365
return {
6466
"name": "p50k_edit",
65-
"pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
67+
"pat_str": _legacy_splitter_regex,
6668
"mergeable_ranks": mergeable_ranks,
6769
"special_tokens": special_tokens,
6870
}
@@ -82,7 +84,7 @@ def cl100k_base():
8284
}
8385
return {
8486
"name": "cl100k_base",
85-
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
87+
"pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""",
8688
"mergeable_ranks": mergeable_ranks,
8789
"special_tokens": special_tokens,
8890
}

0 commit comments

Comments
 (0)