Commit 7d7cad4

Merge pull request #27 from github/add-input-splitting
Reorganize benchmark to include fairer comparisons
2 parents: ee843cd + 9c53252 (commit 7d7cad4)

20 files changed: +635 −293 lines changed

Cargo.toml (+1)

```diff
@@ -2,6 +2,7 @@
 
 members = [
     "crates/*",
+    "crates/bpe/benchmarks",
 ]
 resolver = "2"
 
```
crates/bpe-openai/Cargo.toml (+2 −1)

```diff
@@ -14,11 +14,12 @@ bench = false
 
 [dependencies]
 bpe = { version = "0.1.0", path = "../bpe" }
+either = "1.13"
+fancy-regex = "0.13"
 rmp-serde = "1"
 serde = { version = "1" }
 
 [dev-dependencies]
-fancy-regex = "0.13"
 tiktoken-rs = { version = "0.5" }
 
 [build-dependencies]
```

crates/bpe-openai/README.md (+1 −5)

```diff
@@ -5,17 +5,13 @@ Serialized BPE instances are generated during build and lazily loaded at runtime
 The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
 For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.
 
-Supported token sets:
+Supported tokenizers:
 
 - r50k
 - p50k
 - cl100k
 - o200k
 
-> **⚠ CAUTION ⚠**
-> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
-> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
-
 ## Usage
 
 Add a dependency by running
```
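The caution block removed above became obsolete with this change: the crate now performs tiktoken-style input splitting itself. As a minimal standalone sketch of what that pre-tokenization step does, here is the cl100k split pattern from the lib.rs diff below applied to an invented sample text:

```rust
use fancy_regex::Regex;

fn main() {
    // The cl100k split pattern, as it appears in the lib.rs change below.
    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
    let re = Regex::new(pat).expect("valid regex");
    // Every regex match becomes one piece; BPE then encodes each piece
    // separately, so no token can span a piece boundary.
    for m in re.find_iter("It's 12345 tokens, isn't it?") {
        println!("{:?}", m.expect("match succeeded").as_str());
    }
}
```

Splitting before encoding is what makes the crate's output line up with tiktoken's, since tiktoken applies the same kind of regex before byte-pair encoding.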

crates/bpe-openai/src/lib.rs (+86 −29)

```diff
@@ -1,42 +1,109 @@
 use std::sync::LazyLock;
 
 use bpe::byte_pair_encoding::BytePairEncoding;
+use either::Either;
+use fancy_regex::Regex;
 
-static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = [
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "\\p{N}{1,3}",
+        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+        "\\s*[\\r\\n]+",
+        "\\s+(?!\\S)",
+        "\\s+",
+    ].join("|");
+    Tokenizer::new(bpe, Some(&pat)).expect("valid regex")
 });
 
 pub use bpe::*;
 
-pub fn r50k() -> &'static BytePairEncoding {
+/// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
+/// The direct methods on this type pre-tokenize the input text and should
+/// produce the same output as the tiktoken tokenizers. The type gives access
+/// to the regex and underlying byte-pair encoding if needed. Note that using
+/// the byte-pair encoding directly does not take the regex into account and
+/// may result in output that differs from tiktoken.
+pub struct Tokenizer {
+    /// The byte-pair encoding for this tokenizer.
+    pub bpe: BytePairEncoding,
+    /// The pattern regex used to split the input.
+    pub pat: Option<Regex>,
+}
+
+impl Tokenizer {
+    #[allow(clippy::result_large_err)]
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
+        let pat = pat.map(fancy_regex::Regex::new).transpose()?;
+        Ok(Self { bpe, pat })
+    }
+
+    pub fn count(&self, text: &str) -> usize {
+        self.split(text)
+            .map(|piece| self.bpe.count(piece.as_bytes()))
+            .sum()
+    }
+
+    pub fn encode(&self, text: &str) -> Vec<u32> {
+        self.split(text)
+            .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
+            .collect()
+    }
+
+    pub fn decode(&self, tokens: &[u32]) -> Option<String> {
+        String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
+    }
+
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
+        match &self.pat {
+            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
+                let m = m.expect("match succeeded");
+                assert_eq!(*start, m.start(), "pattern should match all input text");
+                *start = m.end();
+                Some(m.as_str())
+            })),
+            None => Either::Right(std::iter::once(text)),
+        }
+    }
+}
+
+pub fn r50k() -> &'static Tokenizer {
     &BPE_R50K
 }
 
-pub fn p50k() -> &'static BytePairEncoding {
+pub fn p50k() -> &'static Tokenizer {
     &BPE_P50K
 }
 
-pub fn cl100k() -> &'static BytePairEncoding {
+pub fn cl100k() -> &'static Tokenizer {
     &BPE_CL100K
 }
 
-pub fn o200k() -> &'static BytePairEncoding {
+pub fn o200k() -> &'static Tokenizer {
     &BPE_O200K
 }
 
@@ -48,25 +115,25 @@ mod tests {
 
     #[test]
     fn can_load_r50k() {
-        r50k().count("".as_bytes());
+        r50k().count("");
     }
 
     #[test]
     fn can_load_p50k() {
-        p50k().count("".as_bytes());
+        p50k().count("");
     }
 
     #[test]
     fn can_load_cl100k() {
-        cl100k().count("".as_bytes());
+        cl100k().count("");
     }
 
     #[test]
     fn can_load_o200k() {
-        o200k().count("".as_bytes());
+        o200k().count("");
     }
 
-    /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
+    /// Test demonstrating a case where input splitting makes a difference.
     #[test]
     fn splitting_difference() {
         let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
@@ -78,20 +145,10 @@ mod tests {
             .map(|i| i as u32)
             .collect();
 
-        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);
 
-        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-        let re = fancy_regex::Regex::new(pat).unwrap();
-        println!("{}", re.find_iter(text).count());
-        let with_splitting: Vec<_> = re
-            .find_iter(text)
-            .flat_map(|piece| {
-                BPE_CL100K
-                    .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
-                    .into_iter()
-            })
-            .collect();
+        let with_splitting: Vec<_> = BPE_CL100K.encode(text);
         assert_eq!(with_splitting, expected);
     }
 }
```
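To make the new surface concrete, here is a minimal usage sketch of the `Tokenizer` API introduced above (assuming the crate is consumed under the name `bpe_openai`; the sample text is invented):

```rust
fn main() {
    // The static tokenizer is lazily deserialized on first access via LazyLock.
    let tok = bpe_openai::cl100k();
    let text = "Hello, world!";
    // encode() splits the input with the pre-tokenization regex, then
    // BPE-encodes each piece.
    let tokens = tok.encode(text);
    // count() sums token counts over the same pieces, so it agrees with
    // encode().len().
    assert_eq!(tok.count(text), tokens.len());
    // decode() concatenates the token bytes; None signals invalid UTF-8.
    assert_eq!(tok.decode(&tokens), Some(text.to_string()));
}
```

Accessing `tok.bpe` directly bypasses the regex splitting, which is exactly the divergence the `splitting_difference` test above demonstrates.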

crates/bpe/Cargo.toml (−7)

```diff
@@ -12,12 +12,6 @@ categories = ["algorithms", "data-structures", "encoding", "science"]
 crate-type = ["lib", "staticlib"]
 bench = false
 
-[[bench]]
-name = "performance"
-path = "benches/performance.rs"
-harness = false
-test = false
-
 [features]
 rand = ["dep:rand"]
 tiktoken-rs = ["dep:tiktoken-rs"]
@@ -33,4 +27,3 @@ tiktoken-rs = { version = "0.5", optional = true }
 
 [dev-dependencies]
 bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
-criterion = "0.5"
```