6
6
FIM_SUFFIX = "<|fim_suffix|>"
7
7
ENDOFPROMPT = "<|endofprompt|>"
8
8
9
+ # The pattern in the original GPT-2 release is:
10
+ # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
11
+ # This is equivalent, but executes faster:
12
+ _legacy_splitter_regex = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++"""
13
+
9
14
10
15
def gpt2 ():
11
16
mergeable_ranks = data_gym_to_mergeable_bpe_ranks (
@@ -17,10 +22,7 @@ def gpt2():
17
22
return {
18
23
"name" : "gpt2" ,
19
24
"explicit_n_vocab" : 50257 ,
20
- # The pattern in the original GPT-2 release is:
21
- # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
22
- # This is equivalent, but executes faster:
23
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
25
+ "pat_str" : _legacy_splitter_regex ,
24
26
"mergeable_ranks" : mergeable_ranks ,
25
27
"special_tokens" : {ENDOFTEXT : 50256 },
26
28
}
@@ -34,7 +36,7 @@ def r50k_base():
34
36
return {
35
37
"name" : "r50k_base" ,
36
38
"explicit_n_vocab" : 50257 ,
37
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
39
+ "pat_str" : _legacy_splitter_regex ,
38
40
"mergeable_ranks" : mergeable_ranks ,
39
41
"special_tokens" : {ENDOFTEXT : 50256 },
40
42
}
@@ -48,7 +50,7 @@ def p50k_base():
48
50
return {
49
51
"name" : "p50k_base" ,
50
52
"explicit_n_vocab" : 50281 ,
51
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
53
+ "pat_str" : _legacy_splitter_regex ,
52
54
"mergeable_ranks" : mergeable_ranks ,
53
55
"special_tokens" : {ENDOFTEXT : 50256 },
54
56
}
@@ -62,7 +64,7 @@ def p50k_edit():
62
64
special_tokens = {ENDOFTEXT : 50256 , FIM_PREFIX : 50281 , FIM_MIDDLE : 50282 , FIM_SUFFIX : 50283 }
63
65
return {
64
66
"name" : "p50k_edit" ,
65
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
67
+ "pat_str" : _legacy_splitter_regex ,
66
68
"mergeable_ranks" : mergeable_ranks ,
67
69
"special_tokens" : special_tokens ,
68
70
}
@@ -82,7 +84,7 @@ def cl100k_base():
82
84
}
83
85
return {
84
86
"name" : "cl100k_base" ,
85
- "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" ,
87
+ "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""" ,
86
88
"mergeable_ranks" : mergeable_ranks ,
87
89
"special_tokens" : special_tokens ,
88
90
}
0 commit comments