@@ -20,7 +20,7 @@ def gpt2():
2020 # The pattern in the original GPT-2 release is:
2121 # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
2222 # This is equivalent, but executes faster:
23- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
23+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++ | ?\p{N}++ | ?[^\s\p{L}\p{N}]++ |\s+(?!\S)|\s+ +""" ,
2424 "mergeable_ranks" : mergeable_ranks ,
2525 "special_tokens" : {ENDOFTEXT : 50256 },
2626 }
@@ -34,7 +34,7 @@ def r50k_base():
3434 return {
3535 "name" : "r50k_base" ,
3636 "explicit_n_vocab" : 50257 ,
37- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
37+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++ | ?\p{N}++ | ?[^\s\p{L}\p{N}]++ |\s+(?!\S)|\s+ +""" ,
3838 "mergeable_ranks" : mergeable_ranks ,
3939 "special_tokens" : {ENDOFTEXT : 50256 },
4040 }
@@ -48,7 +48,7 @@ def p50k_base():
4848 return {
4949 "name" : "p50k_base" ,
5050 "explicit_n_vocab" : 50281 ,
51- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
51+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++ | ?\p{N}++ | ?[^\s\p{L}\p{N}]++ |\s+(?!\S)|\s+ +""" ,
5252 "mergeable_ranks" : mergeable_ranks ,
5353 "special_tokens" : {ENDOFTEXT : 50256 },
5454 }
@@ -62,7 +62,7 @@ def p50k_edit():
6262 special_tokens = {ENDOFTEXT : 50256 , FIM_PREFIX : 50281 , FIM_MIDDLE : 50282 , FIM_SUFFIX : 50283 }
6363 return {
6464 "name" : "p50k_edit" ,
65- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
65+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++ | ?\p{N}++ | ?[^\s\p{L}\p{N}]++ |\s+(?!\S)|\s+ +""" ,
6666 "mergeable_ranks" : mergeable_ranks ,
6767 "special_tokens" : special_tokens ,
6868 }
@@ -82,7 +82,7 @@ def cl100k_base():
8282 }
8383 return {
8484 "name" : "cl100k_base" ,
85- "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" ,
85+ "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++ |\p{N}{1,3}+ | ?[^\s\p{L}\p{N}]++[\r\n]*+ |\s*[\r\n]|\s+(?!\S)|\s+ +""" ,
8686 "mergeable_ranks" : mergeable_ranks ,
8787 "special_tokens" : special_tokens ,
8888 }
0 commit comments