Skip to content

Commit 40d9b1f

Browse files
committed
Update codebase
1 parent 0f8ec70 commit 40d9b1f

File tree

12 files changed

+57
-13
lines changed

12 files changed

+57
-13
lines changed

.github/workflows/build_wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
# cibuildwheel builds linux wheels inside a manylinux container
1717
# it also takes care of procuring the correct python version for us
1818
os: [ubuntu-latest, windows-latest, macos-latest]
19-
python-version: [39, 310, 311]
19+
python-version: [38, 39, 310, 311]
2020

2121
steps:
2222
- uses: actions/checkout@v3

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Changelog
2+
3+
This is the changelog for the open source version of tiktoken.
4+
5+
## [v0.1.2]
6+
- Avoid use of `blobfile` for public files
7+
- Add support for Python 3.8
8+
- Add py.typed
9+
- Improve the public tests
10+
11+
## [v0.1.1]
12+
- Initial release

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
include *.svg
22
include *.toml
3+
include *.md
34
include Makefile
5+
global-include py.typed
46
recursive-include scripts *.py
57
recursive-include tests *.py
68
recursive-include src *.rs

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
[project]
22
name = "tiktoken"
3-
dependencies = ["blobfile>=2", "regex>=2022.1.18"]
3+
dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
44
dynamic = ["version"]
5-
requires-python = ">=3.9"
5+
requires-python = ">=3.8"
66

77
[build-system]
88
build-backend = "setuptools.build_meta"

scripts/redact.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ def redact_file(path: Path, dry_run: bool) -> None:
99
return
1010

1111
text = path.read_text()
12+
if not text:
13+
return
1214

1315
first_line = text.splitlines()[0]
1416
if "redact" in first_line:

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
public = True
55

66
if public:
7-
version = "0.1.1"
7+
version = "0.1.2"
88

99
setup(
1010
name="tiktoken",
@@ -18,6 +18,7 @@
1818
debug=False,
1919
)
2020
],
21+
package_data={"tiktoken": ["py.typed"]},
2122
packages=["tiktoken", "tiktoken_ext"],
2223
zip_safe=False,
2324
)

tests/test_simple_public.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,18 @@
22

33

44
def test_simple():
5+
# Note that there are more actual tests, they're just not currently public :-)
56
enc = tiktoken.get_encoding("gpt2")
67
assert enc.encode("hello world") == [31373, 995]
78
assert enc.decode([31373, 995]) == "hello world"
9+
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
810

911
enc = tiktoken.get_encoding("cl100k_base")
1012
assert enc.encode("hello world") == [15339, 1917]
1113
assert enc.decode([15339, 1917]) == "hello world"
14+
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
15+
16+
for enc_name in tiktoken.list_encoding_names():
17+
enc = tiktoken.get_encoding(enc_name)
18+
for token in range(10_000):
19+
assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token

tiktoken/core.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import functools
24
from concurrent.futures import ThreadPoolExecutor
35
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union

tiktoken/load.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,21 @@
1+
from __future__ import annotations
2+
13
import base64
24
import hashlib
35
import json
46
import os
57
import uuid
68

79
import blobfile
10+
import requests
11+
12+
13+
def read_file(blobpath: str) -> bytes:
14+
if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
15+
with blobfile.BlobFile(blobpath, "rb") as f:
16+
return f.read()
17+
# avoiding blobfile for public files helps avoid auth issues, like MFA prompts
18+
return requests.get(blobpath).content
819

920

1021
def read_file_cached(blobpath: str) -> bytes:
@@ -17,8 +28,7 @@ def read_file_cached(blobpath: str) -> bytes:
1728

1829
if cache_dir == "":
1930
# disable caching
20-
with blobfile.BlobFile(blobpath, "rb") as f:
21-
return f.read()
31+
return read_file(blobpath)
2232

2333
cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
2434

@@ -27,8 +37,7 @@ def read_file_cached(blobpath: str) -> bytes:
2737
with open(cache_path, "rb") as f:
2838
return f.read()
2939

30-
with blobfile.BlobFile(blobpath, "rb") as f:
31-
contents = f.read()
40+
contents = read_file(blobpath)
3241

3342
os.makedirs(cache_dir, exist_ok=True)
3443
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"

tiktoken/py.typed

Whitespace-only changes.

tiktoken/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import importlib
24
import pkgutil
35
import threading

tiktoken_ext/openai_public.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010
def gpt2():
1111
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
12-
vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe",
13-
encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json",
12+
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
13+
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
1414
)
1515
return {
1616
"name": "gpt2",
@@ -22,7 +22,9 @@ def gpt2():
2222

2323

2424
def r50k_base():
25-
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
25+
mergeable_ranks = load_tiktoken_bpe(
26+
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
27+
)
2628
return {
2729
"name": "r50k_base",
2830
"explicit_n_vocab": 50257,
@@ -33,7 +35,9 @@ def r50k_base():
3335

3436

3537
def p50k_base():
36-
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
38+
mergeable_ranks = load_tiktoken_bpe(
39+
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
40+
)
3741
return {
3842
"name": "p50k_base",
3943
"explicit_n_vocab": 50281,
@@ -44,7 +48,9 @@ def p50k_base():
4448

4549

4650
def cl100k_base():
47-
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
51+
mergeable_ranks = load_tiktoken_bpe(
52+
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
53+
)
4854
special_tokens = {
4955
ENDOFTEXT: 100257,
5056
FIM_PREFIX: 100258,

0 commit comments

Comments (0)