Skip to content

Commit a1a9f16

Browse files
committed
[tiktoken] hello world
0 parents  commit a1a9f16

17 files changed

+1755
-0
lines changed

.gitignore

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# Environments
30+
.env
31+
.venv
32+
33+
# Tools
34+
.mypy_cache
35+
.coverage
36+
htmlcov
37+
38+
# General
39+
.DS_Store
40+
41+
Cargo.lock
42+
target/

Cargo.toml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[package]
2+
name = "tiktoken"
3+
version = "0.1.0"
4+
edition = "2021"
5+
rust-version = "1.57.0"
6+
7+
[lib]
8+
name = "_tiktoken"
9+
crate-type = ["cdylib"]
10+
11+
[dependencies]
12+
pyo3 = { version = "0.17.3", features = ["extension-module"] }
13+
14+
# tiktoken dependencies
15+
fancy-regex = "0.10.0"
16+
regex = "1.7.0"
17+
rustc-hash = "1.1.0"
18+
bstr = "1.0.1"
19+
20+
[profile.release]
21+
incremental = true

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2022 OpenAI, Shantanu Jain
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

MANIFEST.in

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
include *.svg
2+
include *.toml
3+
include Makefile
4+
recursive-include scripts *.py
5+
recursive-include src *.rs

Makefile

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Package name; the rules in this Makefile expand it as $(PROJECT).
PROJECT := tiktoken

# `default` is the first rule in the file, so a bare `make` runs it; it only
# delegates to editable_install.
default: editable_install
.PHONY: default
5+
6+
# Install Rust through the rustup bootstrap script, pinned to toolchain 1.62,
# but only when no `cargo` can be found on PATH.
#
# The probe uses `command -v`, the POSIX shell builtin, instead of `which`:
# `which` is an optional external program, and when it is missing the probe
# fails and the installer runs even though cargo is already present.
.PHONY: install_rust
install_rust:
	command -v cargo >/dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain 1.62
9+
10+
# Remove build products: the Cargo output, the installed package, Python
# bytecode caches, compiled extension modules and the packaging directories.
#
# The sweeps match on the file name with find's -name test and delete through
# -exec rather than piping `find .` into `grep -E` and `xargs rm -rf`:
#   * an unanchored pattern such as '\.so' also matches unrelated paths that
#     merely contain those characters (e.g. ./data.sorted), which would then
#     be removed recursively;
#   * plain xargs splits paths on whitespace, so names containing spaces
#     would be mangled.
.PHONY: clean
clean:
	cargo clean
	pip uninstall -y $(PROJECT)
	find . -type d -name '__pycache__' -prune -exec rm -rf {} +
	find . ! -type d \( -name '*.pyc' -o -name '*.so' \) -exec rm -f {} +
	rm -rf dist/ build/
	rm -rf $(PROJECT).egg-info/
18+
19+
# Reformat the tree in place: cargo fmt for the Rust sources, then black and
# isort (both at line length 100) for the Python sources.
# black and isort are pip-installed on demand when they are not on PATH; the
# probe uses the POSIX builtin `command -v` because `which` is an optional
# external program that may be absent.
.PHONY: format
format:
	@ command -v black >/dev/null || python3 -m pip install black
	@ command -v isort >/dev/null || python3 -m pip install isort
	cargo fmt -- --config group_imports=StdExternalCrate
	black --line-length 100 --skip-magic-trailing-comma --quiet .
	isort --line-length 100 --profile black --quiet .
26+
27+
28+
# Check-only counterpart of `format`: runs the same three formatters with
# their --check flag, using the same options as the `format` rule.
# black and isort are pip-installed on demand when they are not on PATH; the
# probe uses the POSIX builtin `command -v` because `which` is an optional
# external program that may be absent.
.PHONY: format_check
format_check:
	@ command -v black >/dev/null || python3 -m pip install black
	@ command -v isort >/dev/null || python3 -m pip install isort
	cargo fmt --check -- --config group_imports=StdExternalCrate
	black --check --line-length 100 --skip-magic-trailing-comma --quiet .
	isort --check --line-length 100 --profile black --quiet .
35+
36+
# Static analysis: clippy over the Rust code with warnings promoted to errors
# (-D warnings), then flake8 over the Python code.
# flake8 and flake8-bugbear are pip-installed at pinned versions when no
# flake8 is on PATH; the probe uses the POSIX builtin `command -v` because
# `which` is an optional external program that may be absent.
.PHONY: lint
lint:
	cargo clippy --all -- -D warnings
	@ command -v flake8 >/dev/null || python3 -m pip install flake8==5 flake8-bugbear==22.9.11
	flake8 --ignore=E203,E501,W503,E731 --per-file-ignores="$(PROJECT)/__init__.py:F401 setup.py:E402" --exclude=build .
41+
42+
# Install the package in editable (development) mode.
#
# Every `pip install -e .` here passes --no-build-isolation, so the build
# requirements must already be installed in the current environment.
#   * No $(PROJECT).egg-info directory yet: install setuptools, wheel and
#     setuptools-rust first, then do a full editable install. The two steps
#     are joined with `&&` so a failed bootstrap is not hidden by the second
#     command.
#   * egg-info already present: reinstall the package only (--no-deps
#     --ignore-installed).
#
# NOTE(review): the test used to be `[ -f $(PROJECT).egg-info ]` with the
# bootstrap in the `then` branch. The `clean` rule removes
# $(PROJECT).egg-info/ as a directory, so `-f` (regular file) could never
# succeed and the bootstrap branch was unreachable. The branch order here
# assumes the bootstrap is meant for a tree without egg-info -- confirm.
.PHONY: editable_install
editable_install:
	@ if [ ! -d $(PROJECT).egg-info ]; then \
		pip install --disable-pip-version-check --progress-bar=off setuptools wheel setuptools-rust && \
		pip install --disable-pip-version-check --no-build-isolation -e . ; \
	else \
		pip install --disable-pip-version-check --no-deps --no-build-isolation --ignore-installed -e . ; \
	fi

README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# ⏳ tiktoken
2+
3+
tiktoken is a fast tokeniser.
4+
5+
```python
6+
import tiktoken
7+
enc = tiktoken.get_encoding("gpt2")
8+
print(enc.encode("hello world"))
9+
```
10+
11+
The open source version of `tiktoken` can be installed from PyPI:
12+
```
13+
pip install tiktoken
14+
```
15+
16+
The tokeniser API is documented in `tiktoken/core.py`.
17+
18+
19+
## Performance
20+
21+
`tiktoken` is 3-6x faster than Hugging Face's tokeniser:
22+
23+
![image](./perf.svg)
24+
25+
Performance was measured on 1GB of text using the GPT-2 tokeniser, with `GPT2TokenizerFast` from
26+
`tokenizers==0.13.2` and `transformers==4.24.0`.
27+
28+

0 commit comments

Comments
 (0)