@@ -1,20 +1,46 @@
 import os
-from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
+from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe, read_file_cached
 
 ENDOFTEXT = "<|endoftext|>"
 FIM_PREFIX = "<|fim_prefix|>"
 FIM_MIDDLE = "<|fim_middle|>"
 FIM_SUFFIX = "<|fim_suffix|>"
 ENDOFPROMPT = "<|endofprompt|>"
 
-ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net")
+ENCODINGS_HOST = os.getenv("ENCODINGS_HOST", None)
+
+if "ENCODINGS_HOST" in os.environ:
+    ENCODINGS_HOST = os.environ["ENCODINGS_HOST"]
+    IS_HOSTING_ENCODINGS = True
+else:
+    ENCODINGS_HOST = "https://openaipublic.blob.core.windows.net"
+    IS_HOSTING_ENCODINGS = False
+
+VOCAB_BPE_FILE = f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe"
+VOCAB_BPE_HASH = "1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5"
+ENCODER_JSON_FILE = f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json"
+ENCODER_JSON_HASH = "196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783"
+R50K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken"
+R50K_BASE_HASH = "306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930"
+P50K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken"
+P50K_BASE_HASH = "94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069"
+CL100K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken"
+CL100K_BASE_HASH = "223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7"
 
 def gpt2():
+    vocab_bpe_contents = read_file_cached(
+        VOCAB_BPE_FILE,
+        VOCAB_BPE_HASH,
+        IS_HOSTING_ENCODINGS
+    ).decode()
+    encoder_json_contents = read_file_cached(
+        ENCODER_JSON_FILE,
+        ENCODER_JSON_HASH,
+        IS_HOSTING_ENCODINGS
+    )
     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
-        vocab_bpe_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe",
-        encoder_json_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json",
-        vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
-        encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
+        vocab_bpe_contents=vocab_bpe_contents,
+        encoder_json_contents=encoder_json_contents
     )
     return {
         "name": "gpt2",
@@ -29,10 +55,8 @@ def gpt2():
 
 
 def r50k_base():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken",
-        expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
-    )
+    contents = read_file_cached(R50K_BASE_FILE, R50K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
@@ -43,10 +67,8 @@ def r50k_base():
 
 
 def p50k_base():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
-        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
-    )
+    contents = read_file_cached(P50K_BASE_FILE, P50K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
@@ -57,10 +79,8 @@ def p50k_base():
 
 
 def p50k_edit():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
-        expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
-    )
+    contents = read_file_cached(P50K_BASE_FILE, P50K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
     return {
         "name": "p50k_edit",
@@ -71,10 +91,8 @@ def p50k_edit():
 
 
 def cl100k_base():
-    mergeable_ranks = load_tiktoken_bpe(
-        f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken",
-        expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
-    )
+    contents = read_file_cached(CL100K_BASE_FILE, CL100K_BASE_HASH, IS_HOSTING_ENCODINGS)
+    mergeable_ranks = load_tiktoken_bpe(contents)
     special_tokens = {
         ENDOFTEXT: 100257,
         FIM_PREFIX: 100258,
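A minimal usage sketch of the change above, assuming the self-hosted mirror serves the same /encodings and /gpt-2/encodings/main paths and that tiktoken only imports this plugin module once an encoding is first requested (both are assumptions, not guaranteed by this diff; the host URL below is hypothetical):

import os

# Point tiktoken at a self-hosted mirror before any encoding is constructed,
# so the module-level ENCODINGS_HOST / IS_HOSTING_ENCODINGS logic above sees
# the override when this file is imported.
os.environ["ENCODINGS_HOST"] = "https://encodings.example.internal"  # hypothetical host

import tiktoken

# With ENCODINGS_HOST set, IS_HOSTING_ENCODINGS is True and read_file_cached
# should fetch cl100k_base.tiktoken from the mirror rather than from
# openaipublic.blob.core.windows.net.
enc = tiktoken.get_encoding("cl100k_base")
print(enc.encode("hello world"))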