Skip to content

Commit ec7c121

Browse files
committed
Bump version, sync codebase
1 parent f5fbc9c commit ec7c121

File tree

7 files changed

+35
-7
lines changed

7 files changed

+35
-7
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.3.0]
6+
- Improve performance by 5-20%; thank you to @nistath!
7+
- Add `gpt-3.5-turbo` models to `encoding_for_model`
8+
- Add prefix matching to `encoding_for_model` to better support future model versions
9+
- Fix a bug in the README instructions on extending tiktoken
10+
- Update the set of available encodings
11+
- Add packaging metadata
12+
513
## [v0.2.0]
614
- Add ``tiktoken.encoding_for_model`` to get the encoding for a specific model
715
- Improve portability of caching logic

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.2.0"
3+
version = "0.3.0"
44
edition = "2021"
55
rust-version = "1.57.0"
66

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Example code using `tiktoken` can be found in the
3030
![image](./perf.svg)
3131

3232
Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from
33-
`tokenizers==0.13.2` and `transformers==4.24.0`.
33+
`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.
3434

3535

3636
## Getting help

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tiktoken"
3-
version = "0.2.0"
3+
version = "0.3.0"
44
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
55
readme = "README.md"
66
license = {file = "LICENSE"}

tests/test_simple_public.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,5 @@ def test_encoding_for_model():
2626
assert enc.name == "p50k_base"
2727
enc = tiktoken.encoding_for_model("text-davinci-edit-001")
2828
assert enc.name == "p50k_edit"
29+
enc = tiktoken.encoding_for_model("gpt-3.5-turbo-0301")
30+
assert enc.name == "cl100k_base"

tiktoken/model.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,15 @@
33
from .core import Encoding
44
from .registry import get_encoding
55

6-
# TODO: this will likely be replaced by an API endpoint
6+
# TODO: these will likely be replaced by an API endpoint
7+
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
8+
# chat
9+
"gpt-3.5-turbo-": "cl100k_base"  # e.g., gpt-3.5-turbo-0301, -0401, etc.
10+
}
11+
712
MODEL_TO_ENCODING: dict[str, str] = {
13+
# chat
14+
"gpt-3.5-turbo": "cl100k_base",
815
# text
916
"text-davinci-003": "p50k_base",
1017
"text-davinci-002": "p50k_base",
@@ -45,11 +52,22 @@
4552

4653

4754
def encoding_for_model(model_name: str) -> Encoding:
48-
try:
55+
"""Returns the encoding used by a model."""
56+
encoding_name = None
57+
if model_name in MODEL_TO_ENCODING:
4958
encoding_name = MODEL_TO_ENCODING[model_name]
50-
except KeyError:
59+
else:
60+
# Check if the model matches a known prefix
61+
# Prefix matching avoids needing library updates for every model version release
62+
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
63+
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
64+
if model_name.startswith(model_prefix):
65+
return get_encoding(model_encoding_name)
66+
67+
if encoding_name is None:
5168
raise KeyError(
5269
f"Could not automatically map {model_name} to a tokeniser. "
5370
"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
5471
) from None
72+
5573
return get_encoding(encoding_name)

tiktoken_ext/openai_public.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,6 @@ def cl100k_base():
8383
"gpt2": gpt2,
8484
"r50k_base": r50k_base,
8585
"p50k_base": p50k_base,
86-
"cl100k_base": cl100k_base,
8786
"p50k_edit": p50k_edit,
87+
"cl100k_base": cl100k_base,
8888
}

0 commit comments

Comments
 (0)