@@ -20,7 +20,7 @@ def gpt2():
2020 # The pattern in the original GPT-2 release is:
2121 # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
2222 # This is equivalent, but executes faster:
23- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
23+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
2424 "mergeable_ranks" : mergeable_ranks ,
2525 "special_tokens" : {ENDOFTEXT : 50256 },
2626 }
@@ -34,7 +34,7 @@ def r50k_base():
3434 return {
3535 "name" : "r50k_base" ,
3636 "explicit_n_vocab" : 50257 ,
37- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
37+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
3838 "mergeable_ranks" : mergeable_ranks ,
3939 "special_tokens" : {ENDOFTEXT : 50256 },
4040 }
@@ -48,7 +48,7 @@ def p50k_base():
4848 return {
4949 "name" : "p50k_base" ,
5050 "explicit_n_vocab" : 50281 ,
51- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
51+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
5252 "mergeable_ranks" : mergeable_ranks ,
5353 "special_tokens" : {ENDOFTEXT : 50256 },
5454 }
@@ -62,7 +62,7 @@ def p50k_edit():
6262 special_tokens = {ENDOFTEXT : 50256 , FIM_PREFIX : 50281 , FIM_MIDDLE : 50282 , FIM_SUFFIX : 50283 }
6363 return {
6464 "name" : "p50k_edit" ,
65- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
65+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
6666 "mergeable_ranks" : mergeable_ranks ,
6767 "special_tokens" : special_tokens ,
6868 }
0 commit comments