Skip to content

Commit c9edc30

Browse files
committed
Update support for encoding hashes when self-hosting cache files.
1 parent d8ce942 commit c9edc30

File tree

2 files changed

+61
-32
lines changed

2 files changed

+61
-32
lines changed

tiktoken/load.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@ def check_hash(data: bytes, expected_hash: str) -> bool:
3232
return actual_hash == expected_hash
3333

3434

35-
def read_file_cached(blobpath: str, expected_hash: Optional[str] = None) -> bytes:
35+
def read_file_cached(
36+
blobpath: str,
37+
expected_hash: Optional[str] = None,
38+
is_self_hosting: Optional[bool] = False
39+
) -> bytes:
3640
user_specified_cache = True
3741
if "TIKTOKEN_CACHE_DIR" in os.environ:
3842
cache_dir = os.environ["TIKTOKEN_CACHE_DIR"]
@@ -52,9 +56,20 @@ def read_file_cached(blobpath: str, expected_hash: Optional[str] = None) -> byte
5256
if os.path.exists(cache_path):
5357
with open(cache_path, "rb") as f:
5458
data = f.read()
55-
if expected_hash is None or check_hash(data, expected_hash):
59+
if expected_hash is None:
5660
return data
5761

62+
if check_hash(data, expected_hash):
63+
return data
64+
65+
if is_self_hosting:
66+
raise ValueError(
67+
f"Hash mismatch for data from {blobpath} (expected {expected_hash}). "
68+
f"This may indicate change in the `tiktoken` encodings for this version. "
69+
f"Please update the hosted encodings or remove/unset the `ENCODINGS_HOST` "
70+
"to attempt to refresh the cache from the central host (`openaipublic`)."
71+
)
72+
5873
# the cached file does not match the hash, remove it and re-fetch
5974
try:
6075
os.remove(cache_path)
@@ -83,10 +98,8 @@ def read_file_cached(blobpath: str, expected_hash: Optional[str] = None) -> byte
8398

8499

85100
def data_gym_to_mergeable_bpe_ranks(
86-
vocab_bpe_file: str,
87-
encoder_json_file: str,
88-
vocab_bpe_hash: Optional[str] = None,
89-
encoder_json_hash: Optional[str] = None,
101+
vocab_bpe_contents: str,
102+
encoder_json_contents: str,
90103
) -> dict[bytes, int]:
91104
# NB: do not add caching to this function
92105
rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
@@ -101,7 +114,6 @@ def data_gym_to_mergeable_bpe_ranks(
101114
assert len(rank_to_intbyte) == 2**8
102115

103116
# vocab_bpe contains the merges along with associated ranks
104-
vocab_bpe_contents = read_file_cached(vocab_bpe_file, vocab_bpe_hash).decode()
105117
bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split("\n")[1:-1]]
106118

107119
def decode_data_gym(value: str) -> bytes:
@@ -118,7 +130,7 @@ def decode_data_gym(value: str) -> bytes:
118130
# check that the encoder file matches the merges file
119131
# this sanity check is important since tiktoken assumes that ranks are ordered the same
120132
# as merge priority
121-
encoder_json = json.loads(read_file_cached(encoder_json_file, encoder_json_hash))
133+
encoder_json = json.loads(encoder_json_contents)
122134
encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}
123135
# drop these two special tokens if present, since they're not mergeable bpe tokens
124136
encoder_json_loaded.pop(b"<|endoftext|>", None)
@@ -141,10 +153,9 @@ def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> No
141153

142154

143155
def load_tiktoken_bpe(
144-
tiktoken_bpe_file: str, expected_hash: Optional[str] = None
156+
contents:bytes
145157
) -> dict[bytes, int]:
146158
# NB: do not add caching to this function
147-
contents = read_file_cached(tiktoken_bpe_file, expected_hash)
148159
return {
149160
base64.b64decode(token): int(rank)
150161
for token, rank in (line.split() for line in contents.splitlines() if line)

tiktoken_ext/openai_public.py

+40-22
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,46 @@
11
import os
2-
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe
2+
from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe, read_file_cached
33

44
ENDOFTEXT = "<|endoftext|>"
55
FIM_PREFIX = "<|fim_prefix|>"
66
FIM_MIDDLE = "<|fim_middle|>"
77
FIM_SUFFIX = "<|fim_suffix|>"
88
ENDOFPROMPT = "<|endofprompt|>"
99

# Host serving the encoding data files. When `ENCODINGS_HOST` is set in the
# environment the user is self-hosting the cache files, and hash mismatches
# should not be silently repaired by re-fetching (see `read_file_cached`'s
# `is_self_hosting` parameter). The original code first assigned
# `os.getenv("ENCODINGS_HOST", None)` and then unconditionally reassigned
# the name in both branches of an if/else — the first assignment was dead
# code, removed here.
IS_HOSTING_ENCODINGS = "ENCODINGS_HOST" in os.environ
ENCODINGS_HOST = os.environ.get(
    "ENCODINGS_HOST", "https://openaipublic.blob.core.windows.net"
)

# Blob locations and their expected content hashes (hex digests), used to
# validate cached downloads before they are trusted.
VOCAB_BPE_FILE = f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe"
VOCAB_BPE_HASH = "1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5"
ENCODER_JSON_FILE = f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json"
ENCODER_JSON_HASH = "196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783"
R50K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken"
R50K_BASE_HASH = "306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930"
P50K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken"
P50K_BASE_HASH = "94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069"
CL100K_BASE_FILE = f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken"
CL100K_BASE_HASH = "223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7"
1129

1230
def gpt2():
31+
vocab_bpe_contents = read_file_cached(
32+
VOCAB_BPE_FILE,
33+
VOCAB_BPE_HASH,
34+
IS_HOSTING_ENCODINGS
35+
).decode()
36+
encoder_json_contents = read_file_cached(
37+
ENCODER_JSON_FILE,
38+
ENCODER_JSON_HASH,
39+
IS_HOSTING_ENCODINGS
40+
)
1341
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
14-
vocab_bpe_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/vocab.bpe",
15-
encoder_json_file=f"{ENCODINGS_HOST}/gpt-2/encodings/main/encoder.json",
16-
vocab_bpe_hash="1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5",
17-
encoder_json_hash="196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783",
42+
vocab_bpe_contents= vocab_bpe_contents,
43+
encoder_json_contents=encoder_json_contents
1844
)
1945
return {
2046
"name": "gpt2",
@@ -29,10 +55,8 @@ def gpt2():
2955

3056

3157
def r50k_base():
32-
mergeable_ranks = load_tiktoken_bpe(
33-
f"{ENCODINGS_HOST}/encodings/r50k_base.tiktoken",
34-
expected_hash="306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930",
35-
)
58+
contents = read_file_cached(R50K_BASE_FILE, R50K_BASE_HASH, IS_HOSTING_ENCODINGS)
59+
mergeable_ranks = load_tiktoken_bpe(contents)
3660
return {
3761
"name": "r50k_base",
3862
"explicit_n_vocab": 50257,
@@ -43,10 +67,8 @@ def r50k_base():
4367

4468

4569
def p50k_base():
46-
mergeable_ranks = load_tiktoken_bpe(
47-
f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
48-
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
49-
)
70+
contents = read_file_cached(P50K_BASE_FILE, P50K_BASE_HASH, IS_HOSTING_ENCODINGS)
71+
mergeable_ranks = load_tiktoken_bpe(contents)
5072
return {
5173
"name": "p50k_base",
5274
"explicit_n_vocab": 50281,
@@ -57,10 +79,8 @@ def p50k_base():
5779

5880

5981
def p50k_edit():
60-
mergeable_ranks = load_tiktoken_bpe(
61-
f"{ENCODINGS_HOST}/encodings/p50k_base.tiktoken",
62-
expected_hash="94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069",
63-
)
82+
contents = read_file_cached(P50K_BASE_FILE, P50K_BASE_HASH, IS_HOSTING_ENCODINGS)
83+
mergeable_ranks = load_tiktoken_bpe(contents)
6484
special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
6585
return {
6686
"name": "p50k_edit",
@@ -71,10 +91,8 @@ def p50k_edit():
7191

7292

7393
def cl100k_base():
74-
mergeable_ranks = load_tiktoken_bpe(
75-
f"{ENCODINGS_HOST}/encodings/cl100k_base.tiktoken",
76-
expected_hash="223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7",
77-
)
94+
contents = read_file_cached(CL100K_BASE_FILE, CL100K_BASE_HASH, IS_HOSTING_ENCODINGS)
95+
mergeable_ranks = load_tiktoken_bpe(contents)
7896
special_tokens = {
7997
ENDOFTEXT: 100257,
8098
FIM_PREFIX: 100258,

0 commit comments

Comments
 (0)