
Commit 3e86200
Bump version, sync codebase
1 parent b2e85f1

File tree: 6 files changed, +45 −22 lines
- CHANGELOG.md
- Cargo.toml
- pyproject.toml
- src/lib.rs
- tests/test_simple_public.py
- tiktoken/load.py

CHANGELOG.md
Lines changed: 8 additions & 0 deletions

@@ -2,6 +2,12 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.3.1]
+- Build aarch64 wheels
+- Make `blobfile` an optional dependency
+
+Thank you to @messense for the environment variable that makes cargo not OOM under emulation!
+
 ## [v0.3.0]
 - Improve performance by 5-20%; thank you to @nistath!
 - Add `gpt-3.5-turbo` models to `encoding_for_model`
@@ -14,6 +20,8 @@ This is the changelog for the open source version of tiktoken.
 - Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
 - Improve portability of caching logic
 
+Thank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8

Cargo.toml
Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.3.0"
+version = "0.3.1"
 edition = "2021"
 rust-version = "1.57.0"

pyproject.toml
Lines changed: 3 additions & 2 deletions

@@ -1,11 +1,12 @@
 [project]
 name = "tiktoken"
-version = "0.3.0"
+version = "0.3.1"
 description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
 readme = "README.md"
 license = {file = "LICENSE"}
 authors = [{name = "Shantanu Jain"}, {email = "[email protected]"}]
-dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
+dependencies = ["regex>=2022.1.18", "requests>=2.26.0"]
+optional-dependencies = {blobfile = ["blobfile>=2"]}
 requires-python = ">=3.8"
 
 [project.urls]
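Since `blobfile` now ships only through the new optional extra, installs that need non-HTTP paths have to opt in, e.g. `pip install tiktoken[blobfile]`. A minimal sketch of how downstream code could probe for the extra at runtime, using only the standard library (the constant name is illustrative, not part of tiktoken):

    import importlib.util

    # True only if the optional `blobfile` extra (or blobfile itself) is installed.
    HAS_BLOBFILE = importlib.util.find_spec("blobfile") is not None

    if not HAS_BLOBFILE:
        print("Local/bucket paths need the extra: pip install 'tiktoken[blobfile]'")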

src/lib.rs
Lines changed: 9 additions & 18 deletions

@@ -21,32 +21,23 @@ fn _byte_pair_merge<T>(
     // The rank of the last item in the vector is not a valid value.
     let mut parts: Vec<(usize, usize)> = (0..piece.len() + 1).map(|i| (i, usize::MAX)).collect();
 
-    // NOTE: using a macro here because a closure fails to get inlined
-    // according to optimization remarks.
-    // A closure also cannot capture a reference to `piece` without
-    // the borrow checker complaining about the mutable borrows during
-    // the assignments later in this code.
-    macro_rules! get_rank {
-        ($start_idx:expr, $skip:expr) => {{
-            let start_idx: usize = $start_idx;
-            let skip: usize = $skip;
+    let get_rank = {
+        #[inline(always)]
+        |parts: &Vec<(usize, usize)>, start_idx: usize, skip: usize| {
             if (start_idx + skip + 2) < parts.len() {
                 ranks
                     .get(&piece[parts[start_idx].0..parts[start_idx + skip + 2].0])
-                    .map(|r| *r)
+                    .copied()
             } else {
                 None
             }
-        }};
-        ($idx:expr) => {{
-            get_rank!($idx, 0)
-        }};
-    }
+        }
+    };
 
     // We look up the ranks once in the beggining and iteratively update
     // them during each merge, which reduces the number of rank lookups.
     for i in 0..parts.len() - 2 {
-        match get_rank!(i) {
+        match get_rank(&parts, i, 0) {
             Some(rank) => {
                 // usize::MAX is a sentinel value and cannot be a valid rank
                 debug_assert!(rank != usize::MAX);
@@ -89,9 +80,9 @@ fn _byte_pair_merge<T>(
             // parts[i] and parts[i-1] before removing, which could thrash
             // the cache. Thus, we update the rank calculation by skipping over
             // parts[i + 1], by invoking `get_rank!` with `skip = 1`.
-            parts[i].1 = get_rank!(i, 1).unwrap_or(usize::MAX);
+            parts[i].1 = get_rank(&parts, i, 1).unwrap_or(usize::MAX);
             if i > 0 {
-                parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX);
+                parts[i - 1].1 = get_rank(&parts, i - 1, 1).unwrap_or(usize::MAX);
             }
 
             parts.remove(i + 1);
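For context, `get_rank` looks up the merge rank of the byte span covering two adjacent parts (or three when `skip = 1`, used just before a part is removed). A rough, unoptimized Python sketch of the same byte-pair merge idea, assuming a plain rank dictionary and ignoring the incremental rank caching the Rust code performs:

    def byte_pair_merge(piece: bytes, ranks: dict[bytes, int]) -> list[bytes]:
        # Start with one part per byte, then repeatedly merge the adjacent pair
        # whose concatenation has the lowest (best) rank in the vocabulary.
        parts = [bytes([b]) for b in piece]
        while len(parts) > 1:
            best = min(
                ((ranks[parts[i] + parts[i + 1]], i)
                 for i in range(len(parts) - 1)
                 if parts[i] + parts[i + 1] in ranks),
                default=None,
            )
            if best is None:
                break
            _, i = best
            parts[i:i + 2] = [parts[i] + parts[i + 1]]
        return parts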

tests/test_simple_public.py
Lines changed: 12 additions & 0 deletions

@@ -1,3 +1,6 @@
+import subprocess
+import sys
+
 import tiktoken
 
 
@@ -28,3 +31,12 @@ def test_encoding_for_model():
     assert enc.name == "p50k_edit"
     enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
     assert enc.name == "cl100k_base"
+
+
+def test_optional_blobfile_dependency():
+    prog = """
+import tiktoken
+import sys
+assert "blobfile" not in sys.modules
+"""
+    subprocess.check_call([sys.executable, "-c", prog])
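The test shells out to a fresh interpreter because the pytest process itself may already have `blobfile` in `sys.modules` (pulled in by another test or plugin), which would make an in-process assertion meaningless. A quick manual version of the same check, assuming tiktoken is installed in the current environment:

    import subprocess
    import sys

    # A clean child interpreter guarantees a clean sys.modules, so importing
    # tiktoken alone must not drag blobfile in.
    prog = "import tiktoken, sys; print('blobfile' in sys.modules)"
    out = subprocess.run([sys.executable, "-c", prog], capture_output=True, text=True)
    print(out.stdout.strip())  # expected: False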

tiktoken/load.py
Lines changed: 12 additions & 1 deletion

@@ -7,12 +7,17 @@
 import tempfile
 import uuid
 
-import blobfile
 import requests
 
 
 def read_file(blobpath: str) -> bytes:
     if not blobpath.startswith("http://") and not blobpath.startswith("https://"):
+        try:
+            import blobfile
+        except ImportError:
+            raise ImportError(
+                "blobfile is not installed. Please install it by running `pip install blobfile`."
+            )
         with blobfile.BlobFile(blobpath, "rb") as f:
             return f.read()
     # avoiding blobfile for public files helps avoid auth issues, like MFA prompts
@@ -93,6 +98,12 @@ def decode_data_gym(value: str) -> bytes:
 
 
 def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
+    try:
+        import blobfile
+    except ImportError:
+        raise ImportError(
+            "blobfile is not installed. Please install it by running `pip install blobfile`."
+        )
     with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
         for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
             f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
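With the import moved inside the functions that actually need it, `import tiktoken` succeeds without `blobfile`, and only non-HTTP paths hit the new error. A small check of that behaviour, assuming `blobfile` is not installed and using a made-up local path:

    from tiktoken.load import read_file

    try:
        read_file("/tmp/some-local-vocab.bpe")  # hypothetical non-HTTP path
    except ImportError as exc:
        print(exc)  # suggests: pip install blobfile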
