Skip to content

Commit 40d9b1f

Browse files
committed
Update codebase
1 parent 0f8ec70 commit 40d9b1f

File tree

12 files changed

+57
-13
lines changed

12 files changed

+57
-13
lines changed

.github/workflows/build_wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
# cibuildwheel builds linux wheels inside a manylinux container
1717
# it also takes care of procuring the correct python version for us
1818
os: [ubuntu-latest, windows-latest, macos-latest]
19-
python-version: [39, 310, 311]
19+
python-version: [38, 39, 310, 311]
2020

2121
steps:
2222
- uses: actions/checkout@v3

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Changelog
2+
3+
This is the changelog for the open source version of tiktoken.
4+
5+
## [v0.1.2]
6+
- Avoid use of `blobfile` for public files
7+
- Add support for Python 3.8
8+
- Add py.typed
9+
- Improve the public tests
10+
11+
## [v0.1.1]
12+
- Initial release

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
include *.svg
22
include *.toml
3+
include *.md
34
include Makefile
5+
global-include py.typed
46
recursive-include scripts *.py
57
recursive-include tests *.py
68
recursive-include src *.rs

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
[project]
22
name = "tiktoken"
3-
dependencies = ["blobfile>=2", "regex>=2022.1.18"]
3+
dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
44
dynamic = ["version"]
5-
requires-python = ">=3.9"
5+
requires-python = ">=3.8"
66

77
[build-system]
88
build-backend = "setuptools.build_meta"

scripts/redact.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ def redact_file(path: Path, dry_run: bool) -> None:
99
return
1010

1111
text = path.read_text()
12+
if not text:
13+
return
1214

1315
first_line = text.splitlines()[0]
1416
if "redact" in first_line:

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
public = True
55

66
if public:
7-
version = "0.1.1"
7+
version = "0.1.2"
88

99
setup(
1010
name="tiktoken",
@@ -18,6 +18,7 @@
1818
debug=False,
1919
)
2020
],
21+
package_data={"tiktoken": ["py.typed"]},
2122
packages=["tiktoken", "tiktoken_ext"],
2223
zip_safe=False,
2324
)

tests/test_simple_public.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,18 @@
22

33

44
def test_simple():
5+
# Note that there are more actual tests, they're just not currently public :-)
56
enc = tiktoken.get_encoding("gpt2")
67
assert enc.encode("hello world") == [31373, 995]
78
assert enc.decode([31373, 995]) == "hello world"
9+
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
810

911
enc = tiktoken.get_encoding("cl100k_base")
1012
assert enc.encode("hello world") == [15339, 1917]
1113
assert enc.decode([15339, 1917]) == "hello world"
14+
assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
15+
16+
for enc_name in tiktoken.list_encoding_names():
17+
enc = tiktoken.get_encoding(enc_name)
18+
for token in range(10_000):
19+
assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token

tiktoken/core.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import functools
24
from concurrent.futures import ThreadPoolExecutor
35
from typing import AbstractSet, Collection, Literal, NoReturn, Optional, Union

tiktoken/load.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,21 @@
1+
from __future__ import annotations
2+
13
import base64
24
import hashlib
35
import json
46
import os
57
import uuid
68

79
import blobfile
10+
import requests
11+
12+
13+
def read_file(blobpath: str) -> bytes:
14+
if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
15+
with blobfile.BlobFile(blobpath, "rb") as f:
16+
return f.read()
17+
# avoiding blobfile for public files helps avoid auth issues, like MFA prompts
18+
return requests.get(blobpath).content
819

920

1021
def read_file_cached(blobpath: str) -> bytes:
@@ -17,8 +28,7 @@ def read_file_cached(blobpath: str) -> bytes:
1728

1829
if cache_dir == "":
1930
# disable caching
20-
with blobfile.BlobFile(blobpath, "rb") as f:
21-
return f.read()
31+
return read_file(blobpath)
2232

2333
cache_key = hashlib.sha1(blobpath.encode()).hexdigest()
2434

@@ -27,8 +37,7 @@ def read_file_cached(blobpath: str) -> bytes:
2737
with open(cache_path, "rb") as f:
2838
return f.read()
2939

30-
with blobfile.BlobFile(blobpath, "rb") as f:
31-
contents = f.read()
40+
contents = read_file(blobpath)
3241

3342
os.makedirs(cache_dir, exist_ok=True)
3443
tmp_filename = cache_path + "." + str(uuid.uuid4()) + ".tmp"

tiktoken/py.typed

Whitespace-only changes.

tiktoken/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
import importlib
24
import pkgutil
35
import threading

tiktoken_ext/openai_public.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99

1010
def gpt2():
1111
mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
12-
vocab_bpe_file="az://openaipublic/gpt-2/encodings/main/vocab.bpe",
13-
encoder_json_file="az://openaipublic/gpt-2/encodings/main/encoder.json",
12+
vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
13+
encoder_json_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json",
1414
)
1515
return {
1616
"name": "gpt2",
@@ -22,7 +22,9 @@ def gpt2():
2222

2323

2424
def r50k_base():
25-
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/r50k_base.tiktoken")
25+
mergeable_ranks = load_tiktoken_bpe(
26+
"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
27+
)
2628
return {
2729
"name": "r50k_base",
2830
"explicit_n_vocab": 50257,
@@ -33,7 +35,9 @@ def r50k_base():
3335

3436

3537
def p50k_base():
36-
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/p50k_base.tiktoken")
38+
mergeable_ranks = load_tiktoken_bpe(
39+
"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
40+
)
3741
return {
3842
"name": "p50k_base",
3943
"explicit_n_vocab": 50281,
@@ -44,7 +48,9 @@ def p50k_base():
4448

4549

4650
def cl100k_base():
47-
mergeable_ranks = load_tiktoken_bpe("az://openaipublic/encodings/cl100k_base.tiktoken")
51+
mergeable_ranks = load_tiktoken_bpe(
52+
"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
53+
)
4854
special_tokens = {
4955
ENDOFTEXT: 100257,
5056
FIM_PREFIX: 100258,

0 commit comments

Comments (0)