Skip to content

Commit 1518590

Browse files
committed
✨ feat(rust): Convert project to a multi-crate workspace
This commit restructures the project from a single crate into a multi-crate Cargo workspace, splitting it into 'rs-tiktoken' (the core Rust library) and 'py-tiktoken' (the Python bindings). Separating the Rust and Python modules makes the codebase's organization clearer and the code easier to maintain. setup.py is also updated to reflect the new directory structure. Refs: openai#24
1 parent f28ce4c commit 1518590

File tree

12 files changed

+158
-30
lines changed

12 files changed

+158
-30
lines changed

.github/workflows/build_wheels.yml

+34-2
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ jobs:
3030
name: dist
3131
path: ./wheelhouse/*.whl
3232

33-
build_wheels_aarch64:
34-
name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64)
33+
build_wheels_aarch64_glibc:
34+
name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64/glibc)
3535
runs-on: ${{ matrix.os }}
3636
strategy:
3737
fail-fast: false
@@ -52,6 +52,38 @@ jobs:
5252
env:
5353
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
5454
CIBW_ARCHS: aarch64
55+
CIBW_SKIP: "*musllinux*"
56+
CIBW_BUILD_VERBOSITY: 3
57+
# https://github.com/rust-lang/cargo/issues/10583
58+
CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true
59+
- uses: actions/upload-artifact@v3
60+
with:
61+
name: dist
62+
path: ./wheelhouse/*.whl
63+
64+
build_wheels_aarch64_musl:
65+
name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64/musl)
66+
runs-on: ${{ matrix.os }}
67+
strategy:
68+
fail-fast: false
69+
matrix:
70+
os: [ubuntu-latest]
71+
python-version: [38, 39, 310, 311]
72+
73+
steps:
74+
- uses: actions/checkout@v3
75+
76+
- name: Setup up QEMU
77+
uses: docker/setup-qemu-action@v2
78+
with:
79+
platforms: arm64
80+
81+
- name: Build wheels
82+
uses: pypa/cibuildwheel@v2.11.3
83+
env:
84+
CIBW_BUILD: "cp${{ matrix.python-version}}-*"
85+
CIBW_ARCHS: aarch64
86+
CIBW_SKIP: "*manylinux*"
5587
CIBW_BUILD_VERBOSITY: 3
5688
# https://github.com/rust-lang/cargo/issues/10583
5789
CIBW_ENVIRONMENT_LINUX: PATH="$PATH:$HOME/.cargo/bin" CARGO_NET_GIT_FETCH_WITH_CLI=true

Cargo.toml

+5-21
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,5 @@
1-
[package]
2-
name = "tiktoken"
3-
version = "0.4.0"
4-
edition = "2021"
5-
rust-version = "1.57.0"
6-
7-
[lib]
8-
name = "_tiktoken"
9-
crate-type = ["cdylib"]
10-
11-
[dependencies]
12-
pyo3 = { version = "0.19.0", features = ["extension-module"] }
13-
14-
# tiktoken dependencies
15-
fancy-regex = "0.11.0"
16-
regex = "1.8.3"
17-
rustc-hash = "1.1.0"
18-
bstr = "1.5.0"
19-
20-
[profile.release]
21-
incremental = true
1+
[workspace]
2+
members = [
3+
"rs-tiktoken",
4+
"py-tiktoken",
5+
]

MANIFEST.in

+4-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ include *.svg
22
include *.toml
33
include *.md
44
include Makefile
5+
include py-tiktoken/*.toml
6+
include rs-tiktoken/*.toml
57
global-include py.typed
68
recursive-include scripts *.py
79
recursive-include tests *.py
8-
recursive-include src *.rs
10+
recursive-include py-tiktoken *.rs
11+
recursive-include rs-tiktoken *.rs

py-tiktoken/Cargo.toml

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
[package]
2+
name = "py-tiktoken"
3+
version = "0.4.0"
4+
edition = "2021"
5+
rust-version = "1.57.0"
6+
7+
[lib]
8+
name = "_tiktoken"
9+
crate-type = ["cdylib"]
10+
11+
[dependencies]
12+
tiktoken = { path = "../rs-tiktoken" }
13+
pyo3 = { version = "0.19.0", features = ["extension-module"] }
14+
15+
# tiktoken dependencies
16+
fancy-regex = "0.11.0"
17+
regex = "1.8.3"
18+
rustc-hash = "1.1.0"
19+
bstr = "1.5.0"
20+
21+
[profile.release]
22+
incremental = true

py-tiktoken/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pub mod tiktoken_py;

src/tiktoken_py.rs renamed to py-tiktoken/src/tiktoken_py.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use pyo3::PyResult;
1010
use pyo3::types::{PyBytes, PyList, PyTuple};
1111
use rustc_hash::FxHashMap as HashMap;
1212

13-
use crate::tiktoken::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};
13+
use tiktoken::core::{byte_pair_encode, CoreBPE, MAX_NUM_THREADS};
1414

1515
#[pyclass]
1616
pub struct PyCoreBPE {
@@ -181,7 +181,7 @@ pub fn _tiktoken(_py: Python, m: &PyModule) -> PyResult<()> {
181181
mod tests {
182182
use rustc_hash::FxHashMap as HashMap;
183183

184-
use crate::tiktoken::byte_pair_split;
184+
use crate::core::byte_pair_split;
185185

186186
#[test]
187187
fn very_simple_test() {

rs-tiktoken/Cargo.toml

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
[package]
2+
name = "tiktoken"
3+
version = "0.4.0"
4+
edition = "2021"
5+
rust-version = "1.57.0"
6+
7+
[dependencies]
8+
fancy-regex = "0.11.0"
9+
regex = "1.8.3"
10+
rustc-hash = "1.1.0"
11+
bstr = "1.5.0"
12+
once_cell = "1.18.0"
13+
14+
[profile.release]
15+
incremental = true

src/tiktoken.rs renamed to rs-tiktoken/src/core.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap<Vec<u8>, usize>) ->
152152

153153
pub struct FakeThreadId(NonZeroU64);
154154

155-
pub fn hash_current_thread() -> usize {
155+
fn hash_current_thread() -> usize {
156156
// It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter
157157
// that works great for our use case of avoiding collisions in our array. Unfortunately,
158158
// it's private. However, there are only so many ways you can layout a u64, so just transmute

rs-tiktoken/src/encoding.rs

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//! WARNING: This code is under active development. Functionality,
2+
//! behavior, and the interface may change in future updates.
3+
4+
use std::collections::HashMap;
5+
use once_cell::sync::Lazy;
6+
use regex::Regex;
7+
8+
9+
pub struct Encoding {
10+
/// The name of the encoding. It should be clear from the name of the encoding
11+
/// what behaviour to expect, in particular, encodings with different special tokens
12+
/// should have different names.
13+
pub name: &'static str,
14+
/// A regex pattern string that is used to split the input text.
15+
pub pat_str: Regex,
16+
/// A dictionary mapping mergeable token bytes to their ranks. The ranks
17+
/// must correspond to merge priority.
18+
pub mergeable_ranks: HashMap<&'static str, u32>,
19+
/// A dictionary mapping special token strings to their token values.
20+
pub special_tokens: HashMap<&'static str, u32>,
21+
/// The number of tokens in the vocabulary. If provided, it is checked
22+
/// that the number of mergeable tokens and special tokens is equal to this number.
23+
pub explicit_n_vocab: Option<u32>,
24+
}
25+
26+
pub static GPT2: Lazy<Encoding> = Lazy::new(|| {
27+
let mergeable_ranks = Default::default();
28+
let special_tokens = [
29+
("<|endoftext|>", 50256)
30+
].iter().cloned().collect();
31+
32+
Encoding{
33+
name: "gpt2",
34+
pat_str: Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+").unwrap(),
35+
mergeable_ranks,
36+
special_tokens,
37+
explicit_n_vocab: Some(50257),
38+
}
39+
});
40+
41+
pub fn get_encoding() {
42+
43+
}
44+
45+
#[cfg(test)]
46+
mod test {
47+
use super::*;
48+
49+
#[test]
50+
fn test_simple() {
51+
// enc = tiktoken.get_encoding("gpt2")
52+
// assert enc.encode("hello world") == [31373, 995]
53+
// assert enc.decode([31373, 995]) == "hello world"
54+
// assert enc.encode("hello <|endoftext|>", allowed_special="all") == [31373, 220, 50256]
55+
//
56+
// enc = tiktoken.get_encoding("cl100k_base")
57+
// assert enc.encode("hello world") == [15339, 1917]
58+
// assert enc.decode([15339, 1917]) == "hello world"
59+
// assert enc.encode("hello <|endoftext|>", allowed_special="all") == [15339, 220, 100257]
60+
//
61+
// for enc_name in tiktoken.list_encoding_names():
62+
// enc = tiktoken.get_encoding(enc_name)
63+
// for token in range(10_000):
64+
// assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
65+
}
66+
}
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
// This check is new and seems buggy (possibly with PyO3 interaction)
2-
pub mod tiktoken_py;
3-
pub mod tiktoken;
2+
pub mod core;
3+
pub mod encoding;
4+
mod model;

rs-tiktoken/src/model.rs

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
//! WARNING: This code is under active development. Functionality,
2+
//! behavior, and the interface may change in future updates.
3+

setup.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
name="tiktoken",
66
rust_extensions=[
77
RustExtension(
8-
"tiktoken._tiktoken",
8+
target="tiktoken._tiktoken",
99
binding=Binding.PyO3,
1010
# Between our use of editable installs and wanting to use Rust for performance sensitive
1111
# code, it makes sense to just always use --release
1212
debug=False,
13+
path="py-tiktoken/Cargo.toml",
1314
)
1415
],
1516
package_data={"tiktoken": ["py.typed"]},

0 commit comments

Comments
 (0)