diff --git a/Cargo.toml b/Cargo.toml
index c91a813..7cb6320 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,7 @@
 members = [
     "crates/*",
+    "crates/bpe/benchmarks",
 ]
 resolver = "2"
diff --git a/crates/bpe-openai/Cargo.toml b/crates/bpe-openai/Cargo.toml
index c3929ed..1f9460e 100644
--- a/crates/bpe-openai/Cargo.toml
+++ b/crates/bpe-openai/Cargo.toml
@@ -14,11 +14,12 @@ bench = false
 
 [dependencies]
 bpe = { version = "0.1.0", path = "../bpe" }
+either = "1.13"
+fancy-regex = "0.13"
 rmp-serde = "1"
 serde = { version = "1" }
 
 [dev-dependencies]
-fancy-regex = "0.13"
 tiktoken-rs = { version = "0.5" }
 
 [build-dependencies]
diff --git a/crates/bpe-openai/README.md b/crates/bpe-openai/README.md
index 8604368..0e25976 100644
--- a/crates/bpe-openai/README.md
+++ b/crates/bpe-openai/README.md
@@ -5,17 +5,13 @@ Serialized BPE instances are generated during build and lazily loaded at runtime
 The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
 For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.
 
-Supported token sets:
+Supported tokenizers:
 
 - r50k
 - p50k
 - cl100k
 - o200k
 
-> **⚠ CAUTION ⚠**
-> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
-> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
-
 ## Usage
 
 Add a dependency by running
diff --git a/crates/bpe-openai/src/lib.rs b/crates/bpe-openai/src/lib.rs
index 283f004..66ccebe 100644
--- a/crates/bpe-openai/src/lib.rs
+++ b/crates/bpe-openai/src/lib.rs
@@ -1,42 +1,109 @@
 use std::sync::LazyLock;
 
 use bpe::byte_pair_encoding::BytePairEncoding;
+use either::Either;
+use fancy_regex::Regex;
 
-static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+    Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
+static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
     let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
-    rmp_serde::from_slice(bytes).expect("")
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat = [
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?", + "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?", + "\\p{N}{1,3}", + " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*", + "\\s*[\\r\\n]+", + "\\s+(?!\\S)", + "\\s+", + ].join("|"); + Tokenizer::new(bpe, Some(&pat)).expect("valid regex") }); pub use bpe::*; -pub fn r50k() -> &'static BytePairEncoding { +/// A byte-pair encoding tokenizer that supports a pre-tokenization regex. +/// The direct methods on this type pre-tokenize the input text and should +/// produce the same output as the tiktoken tokenizers. The type gives access +/// to the regex and underlying byte-pair encoding if needed. Note that using +/// the byte-pair encoding directly does not take the regex into account and +/// may result in output that differs from tiktoken. +pub struct Tokenizer { + /// The byte-pair encoding for this tokenizer. + pub bpe: BytePairEncoding, + /// The pattern regex used to split the input. + pub pat: Option, +} + +impl Tokenizer { + #[allow(clippy::result_large_err)] + pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result { + let pat = pat.map(fancy_regex::Regex::new).transpose()?; + Ok(Self { bpe, pat }) + } + + pub fn count(&self, text: &str) -> usize { + self.split(text) + .map(|piece| self.bpe.count(piece.as_bytes())) + .sum() + } + + pub fn encode(&self, text: &str) -> Vec { + self.split(text) + .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes())) + .collect() + } + + pub fn decode(&self, tokens: &[u32]) -> Option { + String::from_utf8(self.bpe.decode_tokens(tokens)).ok() + } + + pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator + 'a { + match &self.pat { + Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| { + let m = m.expect("match succeeded"); + assert_eq!(*start, m.start(), "pattern should match all input text"); + *start = m.end(); + Some(m.as_str()) + })), + None => Either::Right(std::iter::once(text)), + } + } +} + +pub fn r50k() -> &'static Tokenizer { &BPE_R50K } -pub fn p50k() -> &'static BytePairEncoding { +pub fn p50k() -> &'static Tokenizer { &BPE_P50K } -pub fn cl100k() -> &'static BytePairEncoding { +pub fn cl100k() -> &'static Tokenizer { &BPE_CL100K } -pub fn o200k() -> &'static BytePairEncoding { +pub fn o200k() -> &'static Tokenizer { &BPE_O200K } @@ -48,25 +115,25 @@ mod tests { #[test] fn can_load_r50k() { - r50k().count("".as_bytes()); + r50k().count(""); } #[test] fn can_load_p50k() { - p50k().count("".as_bytes()); + p50k().count(""); } #[test] fn can_load_cl100k() { - cl100k().count("".as_bytes()); + cl100k().count(""); } #[test] fn can_load_o200k() { - o200k().count("".as_bytes()); + o200k().count(""); } - /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting. + /// Test demonstrating a case where input splitting makes a difference. 
     #[test]
     fn splitting_difference() {
         let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
@@ -78,20 +145,10 @@ mod tests {
             .map(|i| i as u32)
             .collect();
 
-        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);
 
-        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
-        let re = fancy_regex::Regex::new(pat).unwrap();
-        println!("{}", re.find_iter(text).count());
-        let with_splitting: Vec<_> = re
-            .find_iter(text)
-            .flat_map(|piece| {
-                BPE_CL100K
-                    .encode_via_backtracking(piece.unwrap().as_str().as_bytes())
-                    .into_iter()
-            })
-            .collect();
+        let with_splitting: Vec<_> = BPE_CL100K.encode(text);
         assert_eq!(with_splitting, expected);
     }
 }
diff --git a/crates/bpe/Cargo.toml b/crates/bpe/Cargo.toml
index f48ad10..4177856 100644
--- a/crates/bpe/Cargo.toml
+++ b/crates/bpe/Cargo.toml
@@ -12,12 +12,6 @@ categories = ["algorithms", "data-structures", "encoding", "science"]
 crate-type = ["lib", "staticlib"]
 bench = false
 
-[[bench]]
-name = "performance"
-path = "benches/performance.rs"
-harness = false
-test = false
-
 [features]
 rand = ["dep:rand"]
 tiktoken-rs = ["dep:tiktoken-rs"]
@@ -33,4 +27,3 @@ tiktoken-rs = { version = "0.5", optional = true }
 
 [dev-dependencies]
 bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
-criterion = "0.5"
diff --git a/crates/bpe/README.md b/crates/bpe/README.md
index a43c56c..f8a24e2 100644
--- a/crates/bpe/README.md
+++ b/crates/bpe/README.md
@@ -6,9 +6,8 @@ As a by-product, it can also be used to efficiently encode those chunks if desir
 For chunking the following operations are of interest:
 
 1) Split text after exactly n tokens at a character boundary.
-1) Count tokens for sub-ranges of a text.
-1) Incrementally count tokens while appending text to a chunk.
-1) Determine whether a sub-range of text is below some token limit or not.
+2) Count tokens for sub-ranges of a text.
+3) Incrementally count tokens while appending text to a chunk.
 
 Those operations are surprisingly difficult to implement efficiently for BPE.
 
@@ -25,19 +24,22 @@ BPE counting is unfortunately non-monotonic, i.e. appending more text could resu
 Naive implementations for the other two operations will essentially have similar problems: either performance becomes very bad or counting is imprecise.
 
-This library presents novel algorithms to compute BPE encodings which address those problems. For the standard encoding or counting task, the algorithm will beat the Rust tiktoken implementation by 4x despite tiktoken using heuristics to speed up the encoding, but may lead to "incorrect" results.
+This library presents novel algorithms to compute BPE encodings which address those problems.
+For the standard encoding or counting task, the algorithm is about 10x faster than the Huggingface BPE tokenizer.
+The comparison with the Rust tiktoken implementation is more subtle, because pre-tokenization obscures the performance of the BPE algorithm by keeping BPE inputs small. In typical cases the algorithm performs similarly to tiktoken, but worst-case inputs show that the algorithm scales linearly where tiktoken scales quadratically.
 
 ## Prior Art
 
-There are mostly three strategies for BPE encoding.
+There are mostly two strategies for BPE encoding.
 
 1) Trivial solution. Search brute force for the most frequent pair in the encoded text according to the dictionary and replace those occurrences. This has `O(n^2)` complexity and is therefore not very appealing in production.
 2) Heap based. Set up a heap with the frequencies. This improves the linear search time to a logarithmic factor. If done properly, the overall complexity reduces now to `O(n log n)`.
-3) Split the input into sections of a maximum size first and then process each section individually. This shrinks in theory the complexity to `O(n)` if the section size is small enough. But it will in general produce now different results. In order to produce the "correct" encoding, one would need to choose split points at token boundaries. But without having the text encoded already, this is in general impossible.
+
+Note that many tokenizers split the input into substrings and then process each substring individually. In theory, this shrinks the complexity to `O(n)` if the substring size is small enough, but it will then in general produce different results. In order to produce the "correct" encoding, one would need to choose split points at token boundaries. But without having the text encoded already, this is in general impossible. Input splitting is therefore not a viable strategy for improving encoding performance.
 
 We have implemented a fast heap based solution as baseline. It uses a bitfield to mark token boundaries. This is more memory efficient than using linked lists or other approaches and should also be faster.
 
-Note: the tik-token library uses a combination of 1) and 3) where sections are determined via a set of regular expressions. Unfortunately, this approach leads to encodings which differ from the original BPE algorithm and can therefore not be used as reference implementation for our approach, but it also has quadratic worst case complexity for certain inputs which makes it impractical for production use!
+Note: the tiktoken library uses a combination of 1) and input splitting, where substrings are determined via a set of regular expressions. Unfortunately, this approach leads to encodings which differ from the original BPE algorithm and can therefore not be used as a reference implementation for our approach, but it also has quadratic worst-case complexity for certain inputs, which makes it impractical for production use!
 
 ## Properties of BPE
 
@@ -101,8 +103,8 @@ The solution is to track the encodings of ALL text prefixes. For our example `ab
 - `a` ------> `a`
 - `ab` -----> `ab`
 - `aba` ----> `ab a`
-- `abab` ---> `ab ac`
-- `ababc` --> `ab a cb`
+- `abac` ---> `ab ac`
+- `abacb` --> `ab a cb`
 
 This can be done much more efficiently thanks to Corollary IIa, since now only the last token of every prefix has to be remembered:
 
@@ -110,7 +112,7 @@ This can be done much more efficiently thanks to Corollary IIa, since now only t
 - `a` ------> `a`
 - `ab` -----> `ab`
 - `aba` ----> `a`
 - `abac` ---> `ac`
-- `abacb` --> `bc`
+- `abacb` --> `cb`
 
 In order to reconstruct the full encoding for a specific prefix, one simply starts with the last token of that prefix, shortens the prefix by the extracted token and looks up the token associated with the shortened prefix and so on until the beginning of the text is reached.
 
@@ -129,7 +131,7 @@ We only have to check whether a possible next token is "compatible" with its pre
 In a naive implementation this can be done by simply decoding those two tokens, reencoding them, and testing whether the same two tokens are produced.
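+
+As a rough illustration (a sketch only, not part of the crate's public API; it assumes `decode_tokens` returns the concatenated bytes of the given tokens and `encode_via_backtracking` returns the token ids, as both are used elsewhere in this diff), the naive check could look like this:
+
+```rust
+// Naive compatibility check: decode the candidate pair, re-encode the
+// concatenated bytes, and accept the pair only if BPE reproduces exactly
+// the same two tokens.
+fn is_valid_pair(bpe: &BytePairEncoding, prev: u32, next: u32) -> bool {
+    let bytes = bpe.decode_tokens(&[prev, next]);
+    bpe.encode_via_backtracking(&bytes) == [prev, next]
+}
+```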
 The fastest approach is to precompute all those pairs and then look up whether the candidate is in the valid set.
 Computing this lookup table is computationally quite intensive, since dictionaries contain >100k tokens.
-In case of the cl100k dictionary, already 10 billion possible pairs have to be tested to find the roughly 500 million invalid pairings.
+In the case of the cl100k dictionary, some 10 billion possible pairs have to be tested to find the roughly 500 million valid pairings.
 Also storing those compactly in e.g. a bitfield requires about 1.2GB of RAM.
 
 A more memory efficient approach is to speed up the "reencoding" operation.
@@ -166,7 +168,7 @@ This algorithm consistently outperforms already the tiktoken implementation, but
 
 For the average case, the previous algorithm can be improved further.
 The main observation is that often the greedy heuristic picks already the correct next token.
-In the cases, where it doesn't the algorithm has to somehow backtrack to the next tokenization until it converged to the correct solution.
+In the cases where it doesn't, the algorithm has to somehow backtrack to the next tokenization until it converges to the correct solution.
 
 Our backtracking implementation solves the enumeration problem as follows:
 
@@ -174,17 +176,17 @@ Our backtracking implementation solves the enumeration problem as follows:
 1) If the current tokenization sequence is valid, then append the longest matching token to the right.
 2) Otherwise, replace the right most token with the next longest prefix token.
 3) If there is no such token, then remove that token and go back to step 2.
 
-Finding the longest matching token in step 1) can be once more done with the aho-corsaick algorithm (or just some trie implementation).
+Finding the longest matching token in step 1 can once more be done with the aho-corasick algorithm (or just some trie implementation).
 The next longest prefix token can be precomputed into a simple lookup table (in principle, the information is encoded in the aho-corasick data structure).
 To avoid that the backtracking procedure runs with exponential complexity, a bit field keeps track of all the valid tokenization positions, making the runtime linear in the input length.
 
 In the worst case, this algorithm will perform worse than the previous one, since it has to rescan the input for the longest matching token at potentially every byte position.
-On average it is about ~4 faster, since the short-cuts usually pay off.
+On average it is about 4x faster, since the short-cuts usually pay off.
 
 ## Benchmarks
 
-We ran several benchmarks to compare performance of different encoders and a tiktoken implementation.
-For the tiktoken implementation we used [tiktoken-rs](https://crates.io/crates/tiktoken-rs) library, a wrapper around OpenAI's tiktoken implementation.
+We ran several benchmarks to compare the performance of the different encoders with the tiktoken and Huggingface tokenizers.
+We used [tiktoken-rs](https://crates.io/crates/tiktoken-rs), a wrapper around OpenAI's tiktoken implementation, and Huggingface's [tokenizers](https://crates.io/crates/tokenizers).
 
 Note that tiktoken does not run BPE on the full input text.
 Instead it splits it into large chunks using a regex and runs BPE on the individual chunks.
 We have not tried to see if that approach is compatible with our BPE implementation.
@@ -210,6 +212,7 @@ This benchmark compares several encoders:
 
 - The backtracking encoder uses the backtracking algorithm with memorisation based on top of a string matching automaton.
 - The heap encoder uses a priority heap and a bitmask to represent token positions to implement the traditional BPE algorithm.
 - The table encoder implements the raw dynamic programming algorithm proposed above.
+- The Huggingface BPE tokenizer.
 
 Two additional encoders are included that are faster but deviate from the original BPE encoding strategy:
 
@@ -219,19 +222,18 @@ Two additional encoders are included that are faster but deviate from the origin
 The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
 (All encodings were computed from scratch for each slice.)
 
+Be aware that in this benchmark none of the tokenizers (ours or Huggingface's) pre-tokenize the input as is normally done for o200k.
+It therefore shows the true performance characteristics of the encoding logic itself.
+Unfortunately tiktoken does not allow us to disable pre-tokenization, which is why it is not included.
+Below we have a comparison with pre-tokenization that includes tiktoken as well.
+
 The graph below shows encoding runtime vs slice length.
 All encoders (except the heap encoder) show the expected linear runtime complexity.
-The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken.
-The fully dynamic programming solution and the heap implementation are still quite competitive to TikToken (especially for smaller inputs).
+The fully dynamic programming solution and the heap implementation are still quite competitive with the backtracking encoder.
 If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners.
+The backtracking encoder is about 10x faster than the Huggingface BPE tokenizer.
 
-![encoding runtime comparison](./benches/result/encoding-o200k.svg)
-
-The graph below shows encoding results for input that is particularly challenging for tiktoken.
-The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
-This inhibits tiktoken ability to split the input before applying BPE revealing its quadratic runtime complexity.
-
-![worst-case encoding runtime comparison](./benches/result/worstcase-o200k.svg)
+![encoding runtime comparison](./images/performance-encoding.svg)
 
 ### Incremental encoding
 
@@ -246,7 +248,7 @@ The graph below shows encoding runtime vs slice length.
 The overall runtime of the byte-by-byte incremental encoder for encoding the full text is comparable to the runtime of the backtracking encoder, with only a constant factor overhead.
 Note that this is a huge win for incremental use cases, which would otherwise require retokenization after each append, resulting in a quadratic slowdown.
 
-![appending runtime comparison](./benches/result/appending-o200k.svg)
+![appending runtime comparison](./images/performance-appending.svg)
 
 ### Interval counting
 
@@ -264,10 +266,45 @@ The graph below shows counting runtime vs slice length.
 The runtime of the backtracking encoder grows with the length of the slice.
 The interval encoder typically counts any interval in constant time.
 
-![counting runtime comparison](./benches/result/counting-o200k.svg)
+![counting runtime comparison](./images/performance-counting.svg)
+
+### Comparison with other tokenizers
+
+We compared the encoding performance of our encoder with two popular implementations, tiktoken and the Huggingface tokenizers.
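+
+For orientation, the three implementations are invoked roughly as follows (a condensed sketch of the calls used in the benchmark code later in this diff; tokenizer setup omitted):
+
+```rust
+// `bpe`, `tiktoken`, and `huggingface` stand for the three tokenizers as
+// set up in the benchmark's shared TOKENIZERS table.
+let ours = bpe.encode(text);              // this crate's Tokenizer (with regex splitting)
+let tik = tiktoken.encode_ordinary(text); // tiktoken-rs CoreBPE
+let hf = huggingface                      // Huggingface tokenizers
+    .encode_fast(text, false)
+    .unwrap()
+    .get_ids()
+    .to_vec();
+```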
+
+The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
+(All encodings were computed from scratch for each slice.)
+
+In this benchmark all tokenizers pre-tokenize their input and produce the same tokens and decoded texts as the tiktoken tokenizer.
+An effect of pre-tokenization is that the inputs to the actual BPE logic are typically much smaller than the overall input size, especially for larger inputs.
+It is therefore difficult to judge the performance differences of the BPE logic from this benchmark.
+It does give a good indication of how the algorithms might perform in practice.
+
+The graph below shows encoding runtime vs slice length.
+All encoders show a similar runtime complexity.
+The backtracking encoder and tiktoken have comparable performance, and both are about 3.5–4x faster than the Huggingface encoder.
+
+An interesting observation here is that pre-tokenization slows down encoding quite a bit.
+Compared with the encoding benchmark above, the backtracking encoder without pre-tokenization is almost 4x faster than the one with pre-tokenization in this benchmark.
+This suggests that pre-tokenization is not necessary from a performance perspective, and that it is a good target for further optimization.
+
+![encoding runtime comparison](./images/performance-comparison.svg)
+
+The graph below shows encoding results for input that is particularly challenging for tiktoken.
+The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
+The performance of tiktoken shows a quadratic growth with the input size.
+The Huggingface encoder scales better, but falls increasingly behind our implementation as the input size grows.
+
+![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
 
 ### Running the benchmarks
 
+Benchmarks are located in a separate crate in the `benchmarks` directory.
+
+```sh
+cd benchmarks
+```
+
 Run the benchmark as follows (requires [cargo-criterion](https://crates.io/crates/cargo-criterion) to be installed):
 
 ```sh
@@ -280,5 +317,5 @@ Open the full report which should be located in `target/criterion/reports/index.
Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` installed): ```sh -script/copy-benchmark-results +script/copy-results ``` diff --git a/crates/bpe/benchmarks/.gitignore b/crates/bpe/benchmarks/.gitignore new file mode 100644 index 0000000..2f7896d --- /dev/null +++ b/crates/bpe/benchmarks/.gitignore @@ -0,0 +1 @@ +target/ diff --git a/crates/bpe/benchmarks/Cargo.toml b/crates/bpe/benchmarks/Cargo.toml new file mode 100644 index 0000000..1aedc2a --- /dev/null +++ b/crates/bpe/benchmarks/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "bpe-benchmarks" +edition = "2021" + +[lib] +path = "lib.rs" +test = false + +[[bench]] +name = "performance" +path = "performance.rs" +harness = false +test = false + +[[test]] +name = "equivalence" +path = "equivalence.rs" +test = true + +[dependencies] +bpe = { path = "../../bpe", features = ["rand", "tiktoken-rs"] } +bpe-openai = { path = "../../bpe-openai" } +criterion = "0.5" +rand = "0.8" +tiktoken-rs = "0.5" +tokenizers = { version = "0.20", features = ["http"] } diff --git a/crates/bpe/benchmarks/criterion.toml b/crates/bpe/benchmarks/criterion.toml new file mode 100644 index 0000000..0e43927 --- /dev/null +++ b/crates/bpe/benchmarks/criterion.toml @@ -0,0 +1,18 @@ +# save report in this directory, even if a custom target directory is set +criterion_home = "./target/criterion" + +# The colors table allows users to configure the colors used by the charts +# cargo-criterion generates. +[colors] +# Color-blind friendly color scheme from https://personal.sron.nl/~pault/. +comparison_colors = [ + {r = 51, g = 34, b = 136 }, # indigo + {r = 136, g = 204, b = 238 }, # cyan + {r = 68, g = 170, b = 153 }, # teal + {r = 17, g = 119, b = 51 }, # green + {r = 153, g = 153, b = 51 }, # olive + {r = 221, g = 204, b = 119 }, # sand + {r = 204, g = 102, b = 119 }, # rose + {r = 136, g = 34, b = 85 }, # wine + {r = 170, g = 68, b = 153 }, # purple +] diff --git a/crates/bpe/benchmarks/equivalence.rs b/crates/bpe/benchmarks/equivalence.rs new file mode 100644 index 0000000..54ea918 --- /dev/null +++ b/crates/bpe/benchmarks/equivalence.rs @@ -0,0 +1,90 @@ +use bpe_benchmarks::*; + +#[cfg(test)] +const N: usize = 32; + +#[test] +fn test_encoding_equivalence_without_pretokenization() { + for (_, bpe, _, huggingface) in TOKENIZERS.iter() { + let huggingface = without_pretokenizer(huggingface); + let text = create_test_string(&bpe.bpe, 20000); + let inputs = (0..N) + .map(|_| select_test_bytes(text.as_bytes(), 100)) + .chain(std::iter::once( + "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(), + )); + for input in inputs { + let text = std::str::from_utf8(input).unwrap(); + let out = bpe.bpe.encode_via_backtracking(input); + let huggingface_out: Vec<_> = huggingface + .encode_fast(text, false) + .unwrap() + .get_ids() + .to_vec(); + if huggingface_out != out { + let text = bpe.decode(&out).unwrap(); + let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap(); + if huggingface_text != text { + panic!( + "huggingface tokens and text differ: {:?} != {:?}", + text, huggingface_text + ); + } else { + panic!( + "huggingface tokens differ: {:?} != {:?}", + out, huggingface_out + ); + } + } + } + } +} + +#[test] +fn test_encoding_equivalence_with_pretokenization() { + for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() { + let text = create_test_string(&bpe.bpe, 20000); + let inputs = (0..N) + .map(|_| select_test_bytes(text.as_bytes(), 100)) + .chain(std::iter::once( + "You should see the Greek word 'kosme': 
\"κόσμε\"".as_bytes(), + )); + for input in inputs { + let text = std::str::from_utf8(input).unwrap(); + let out = bpe.encode(text); + let tiktoken_out: Vec<_> = tiktoken.encode_ordinary(text); + let tiktoken_out2: Vec<_> = tiktoken_out.iter().map(|i| *i as u32).collect(); + let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap(); + let huggingface_out: Vec<_> = huggingface + .encode_fast(text, false) + .unwrap() + .get_ids() + .to_vec(); + if tiktoken_out2 != huggingface_out { + let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap(); + if tiktoken_text != huggingface_text { + panic!( + "huggingface tokens and text differ: {:?} != {:?}", + huggingface_text, tiktoken_text + ); + } else { + panic!( + "huggingface tokens differ: {:?} != {:?}", + huggingface_out, tiktoken_out2 + ); + } + } + if tiktoken_out2 != out { + let text = bpe.decode(&out).unwrap(); + if tiktoken_text != text { + panic!( + "bpe tokens and text differ: {:?} != {:?}", + text, tiktoken_text + ); + } else { + panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2); + } + } + } + } +} diff --git a/crates/bpe/benchmarks/lib.rs b/crates/bpe/benchmarks/lib.rs new file mode 100644 index 0000000..161ef25 --- /dev/null +++ b/crates/bpe/benchmarks/lib.rs @@ -0,0 +1,80 @@ +use std::sync::LazyLock; + +use bpe::byte_pair_encoding::BytePairEncoding; +use bpe_openai::Tokenizer; +use rand::{thread_rng, Rng}; +use tiktoken_rs::CoreBPE as TiktokenTokenizer; +use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel; +use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer; + +pub static TOKENIZERS: LazyLock< + [( + &'static str, + &'static Tokenizer, + TiktokenTokenizer, + HuggingfaceTokenizer, + ); 2], +> = LazyLock::new(|| { + [ + ( + "cl100k", + bpe_openai::cl100k(), + tiktoken_rs::cl100k_base().expect("tokenizer available"), + HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).expect("model available"), + ), + ( + "o200k", + bpe_openai::o200k(), + tiktoken_rs::o200k_base().expect("tokenizer available"), + HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).expect("model available"), + ), + ] +}); + +pub fn is_char_boundary(b: u8) -> bool { + // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128 + // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192 + // The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192 + // When interpreting the byte representation as signed integers, then numbers in the range 128..192 + // correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and [192, 256) can + // be tested with a single signed comparison. 
+ b as i8 >= -0x40 // NB: b < 128 || b >= 192 +} + +pub fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String { + use rand::{thread_rng, Rng}; + let mut text = String::new(); + for _ in 0..tokens { + loop { + let i = thread_rng().gen_range(0..bpe.num_tokens()); + let s = bpe.token_bytes(i as u32); + if s.iter().all(|b| is_char_boundary(*b)) { + if let Ok(s) = std::str::from_utf8(s) { + text.push_str(s); + break; + } + } + } + } + text +} + +pub fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] { + let mut start = thread_rng().gen_range(0..input.len() - bytes); + while start > 0 && !is_char_boundary(input[start]) { + start -= 1; + } + let mut end = start + bytes; + while end < input.len() && !is_char_boundary(input[end]) { + end += 1; + } + &input[start..end] +} + +pub fn without_pretokenizer(enc: &HuggingfaceTokenizer) -> HuggingfaceTokenizer { + let mut enc = enc.clone(); + // boolean values taken from Xenova's tokenizer config + let pre_tokenizer = HuggingfaceByteLevel::new(false, false, false); + enc.with_pre_tokenizer(Some(pre_tokenizer)); + enc +} diff --git a/crates/bpe/benches/performance.rs b/crates/bpe/benchmarks/performance.rs similarity index 53% rename from crates/bpe/benches/performance.rs rename to crates/bpe/benchmarks/performance.rs index b4f1acc..8b90f93 100644 --- a/crates/bpe/benches/performance.rs +++ b/crates/bpe/benchmarks/performance.rs @@ -1,42 +1,18 @@ -use std::sync::LazyLock; use std::time::Duration; use bpe::appendable_encoder::AppendableEncoder; -use bpe::byte_pair_encoding::{create_test_bytes, BytePairEncoding}; +use bpe::byte_pair_encoding::create_test_bytes; use bpe::interval_encoding::IntervalEncoding; +use bpe_benchmarks::*; use criterion::{ criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration, }; use rand::{thread_rng, Rng}; -use tiktoken_rs::CoreBPE; - -static TOKENIZERS: LazyLock<[(&'static str, BytePairEncoding, CoreBPE); 2]> = LazyLock::new(|| { - [ - ( - "cl100k", - BytePairEncoding::from_tiktoken( - &tiktoken_rs::cl100k_base_singleton().lock(), - 100256, - Some(17846336922010275747), - ), - tiktoken_rs::cl100k_base().unwrap(), - ), - ( - "o200k", - BytePairEncoding::from_tiktoken( - &tiktoken_rs::o200k_base_singleton().lock(), - 199998, - Some(17846336922010275747), - ), - tiktoken_rs::o200k_base().unwrap(), - ), - ] -}); fn counting_benchmark(c: &mut Criterion) { - for (name, bpe, _) in TOKENIZERS.iter() { - let input = create_test_bytes(bpe, 20000); - let fast = IntervalEncoding::new(bpe, &input); + for (name, bpe, _, _) in TOKENIZERS.iter() { + let input = create_test_bytes(&bpe.bpe, 20000); + let fast = IntervalEncoding::new(&bpe.bpe, &input); let mut group = c.benchmark_group(format!("counting-{name}")); group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); @@ -55,7 +31,7 @@ fn counting_benchmark(c: &mut Criterion) { |b, bytes| { b.iter_batched( || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.count(&input[start..start + bytes]), + |start| bpe.bpe.count(&input[start..start + bytes]), criterion::BatchSize::SmallInput, ) }, @@ -66,8 +42,10 @@ fn counting_benchmark(c: &mut Criterion) { } fn encoding_benchmark(c: &mut Criterion) { - for (name, bpe, tiktoken) in TOKENIZERS.iter() { - let text = create_test_string(bpe, 20000); + for (name, bpe, _, huggingface) in TOKENIZERS.iter() { + let huggingface = without_pretokenizer(huggingface); + + let text = create_test_string(&bpe.bpe, 20000); let input = text.as_bytes(); let mut group = 
c.benchmark_group(format!("encoding-{name}")); @@ -79,61 +57,59 @@ fn encoding_benchmark(c: &mut Criterion) { &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.encode_via_backtracking(&input[start..start + bytes]), + || select_test_bytes(input, *bytes), + |input| bpe.bpe.encode_via_backtracking(input), criterion::BatchSize::SmallInput, ) }, ); group.bench_with_input(BenchmarkId::new("heap", bytes), &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.encode_via_bitfield(&input[start..start + bytes]), + || select_test_bytes(input, *bytes), + |input| bpe.bpe.encode_via_bitfield(input), criterion::BatchSize::SmallInput, ) }); group.bench_with_input(BenchmarkId::new("table", bytes), &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.encode_via_table(&input[start..start + bytes]), + || select_test_bytes(input, *bytes), + |input| bpe.bpe.encode_via_table(input), criterion::BatchSize::SmallInput, ) }); group.bench_with_input(BenchmarkId::new("greedy", bytes), &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.encode_greedy(&input[start..start + bytes]), + || select_test_bytes(input, *bytes), + |input| bpe.bpe.encode_greedy(input), criterion::BatchSize::SmallInput, ) }); group.bench_with_input(BenchmarkId::new("minimal", bytes), &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.encode_minimal(&input[start..start + bytes]), - criterion::BatchSize::SmallInput, - ) - }); - group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { - b.iter_batched( - || loop { - let start = thread_rng().gen_range(0..input.len() - bytes - 1); - if is_char_boundary(input[start]) && is_char_boundary(input[start + bytes]) - { - return start; - } - }, - |start| tiktoken.encode_ordinary(&text[start..start + bytes]), + || select_test_bytes(input, *bytes), + |input| bpe.bpe.encode_minimal(input), criterion::BatchSize::SmallInput, ) }); + group.bench_with_input( + BenchmarkId::new("huggingface", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| huggingface.encode_fast(text, false).unwrap(), + criterion::BatchSize::SmallInput, + ) + }, + ); } group.finish(); } } fn appending_benchmark(c: &mut Criterion) { - for (name, bpe, _) in TOKENIZERS.iter() { - let input = create_test_bytes(bpe, 20000); + for (name, bpe, _, _) in TOKENIZERS.iter() { + let input = create_test_bytes(&bpe.bpe, 20000); let mut group = c.benchmark_group(format!("appending-{name}")); group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); @@ -143,11 +119,11 @@ fn appending_benchmark(c: &mut Criterion) { b.iter_batched( || { ( - thread_rng().gen_range(0..input.len() - bytes), - AppendableEncoder::new(bpe), + AppendableEncoder::new(&bpe.bpe), + select_test_bytes(&input, *bytes), ) }, - |(start, mut enc)| enc.extend(input[start..start + bytes].iter().copied()), + |(mut enc, input)| enc.extend(input.iter().copied()), criterion::BatchSize::SmallInput, ) }); @@ -156,8 +132,8 @@ fn appending_benchmark(c: &mut Criterion) { &bytes, |b, bytes| { b.iter_batched( - || thread_rng().gen_range(0..input.len() - bytes), - |start| bpe.count(&input[start..start + bytes]), + || select_test_bytes(&input, *bytes), + |input| bpe.bpe.count(input), criterion::BatchSize::SmallInput, ) }, @@ 
-167,69 +143,89 @@ fn appending_benchmark(c: &mut Criterion) { } } -fn worstcase_benchmark(c: &mut Criterion) { - for (name, bpe, tiktoken) in TOKENIZERS.iter() { - let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect(); +fn comparison_benchmark(c: &mut Criterion) { + for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() { + let text = create_test_string(&bpe.bpe, 20000); let input = text.as_bytes(); - let mut group = c.benchmark_group(format!("worstcase-{name}")); - for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] { + let mut group = c.benchmark_group(format!("comparison-{name}")); + group.plot_config(PlotConfiguration::default().summary_scale(AxisScale::Logarithmic)); + for bytes in [10, 100, 1000, 10000] { group.throughput(criterion::Throughput::Bytes(bytes as u64)); group.bench_with_input( BenchmarkId::new("backtracking", bytes), &bytes, - |b, bytes| b.iter(|| bpe.encode_via_backtracking(select_test_bytes(input, *bytes))), + |b, bytes| { + b.iter_batched( + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| bpe.encode(text), + criterion::BatchSize::SmallInput, + ) + }, ); group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { b.iter_batched( - || select_test_bytes(input, *bytes), - |input| tiktoken.encode_ordinary(std::str::from_utf8(input).unwrap()), + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| tiktoken.encode_ordinary(text), criterion::BatchSize::SmallInput, ) }); + group.bench_with_input( + BenchmarkId::new("huggingface", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| huggingface.encode_fast(text, false).unwrap(), + criterion::BatchSize::SmallInput, + ) + }, + ); } group.finish(); } } -fn is_char_boundary(b: u8) -> bool { - // Single byte encodings satisfy the bit pattern 0xxxxxxx, i.e. b < 128 - // Continuation bytes satisfy the bit pattern 10xxxxxx, i.e. b < 192 - // The rest are bytes belonging to the first byte of multi byte encodings (11xxxxxx): b >= 192 - // When interpreting the byte representation as signed integers, then numbers in the range 128..192 - // correspond to the smallest representable numbers. I.e. the two ranges [0, 128) and [192, 256) can - // be tested with a single signed comparison. 
- b as i8 >= -0x40 // NB: b < 128 || b >= 192 -} +fn worstcase_comparison_benchmark(c: &mut Criterion) { + for (name, bpe, tiktoken, huggingface) in TOKENIZERS.iter() { + let text: String = ('\0'..char::MAX).filter(|c| !c.is_whitespace()).collect(); + let input = text.as_bytes(); -fn create_test_string(bpe: &BytePairEncoding, tokens: usize) -> String { - use rand::{thread_rng, Rng}; - let mut text = String::new(); - for _ in 0..tokens { - loop { - let i = thread_rng().gen_range(0..bpe.num_tokens()); - let s = bpe.token_bytes(i as u32); - if s.iter().all(|b| is_char_boundary(*b)) { - if let Ok(s) = std::str::from_utf8(s) { - text.push_str(s); - break; - } - } + let mut group = c.benchmark_group(format!("worstcase-{name}")); + for bytes in [10, 100, 1000, 5000, 10000, 25000, 50000, 75000, 100000] { + group.throughput(criterion::Throughput::Bytes(bytes as u64)); + group.bench_with_input( + BenchmarkId::new("backtracking", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| bpe.encode(text), + criterion::BatchSize::SmallInput, + ) + }, + ); + group.bench_with_input(BenchmarkId::new("tiktoken", bytes), &bytes, |b, bytes| { + b.iter_batched( + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| tiktoken.encode_ordinary(text), + criterion::BatchSize::SmallInput, + ) + }); + group.bench_with_input( + BenchmarkId::new("huggingface", bytes), + &bytes, + |b, bytes| { + b.iter_batched( + || std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(), + |text| huggingface.encode_fast(text, false).unwrap(), + criterion::BatchSize::SmallInput, + ) + }, + ); } + group.finish(); } - text -} - -fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] { - let mut start = thread_rng().gen_range(0..input.len() - bytes); - while start > 0 && !is_char_boundary(input[start]) { - start -= 1; - } - let mut end = start + bytes; - while end < input.len() && !is_char_boundary(input[end]) { - end += 1; - } - &input[start..end] } criterion_group!( @@ -238,6 +234,6 @@ criterion_group!( .warm_up_time(Duration::from_millis(500)) .measurement_time(Duration::from_millis(4000)) .nresamples(1000); - targets = counting_benchmark, encoding_benchmark, appending_benchmark, worstcase_benchmark + targets = counting_benchmark, encoding_benchmark, appending_benchmark, comparison_benchmark, worstcase_comparison_benchmark ); criterion_main!(benches); diff --git a/crates/bpe/benchmarks/script/copy-results b/crates/bpe/benchmarks/script/copy-results new file mode 100755 index 0000000..3bf70db --- /dev/null +++ b/crates/bpe/benchmarks/script/copy-results @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -eu + +result_dir="../images" + +mkdir -p "$result_dir" + +for i in counting encoding appending comparison worstcase; do + rsvg-convert --format svg --output "$result_dir/performance-$i.svg" --background-color white "target/criterion/reports/$i-o200k/lines.svg" +done diff --git a/crates/bpe/criterion.toml b/crates/bpe/criterion.toml deleted file mode 100644 index a954003..0000000 --- a/crates/bpe/criterion.toml +++ /dev/null @@ -1,16 +0,0 @@ -# save report in this directory, even if a custom target directory is set -criterion_home = "./target/criterion" - -# The colors table allows users to configure the colors used by the charts -# cargo-criterion generates. -[colors] -# Color-blind friendly color scheme from https://personal.sron.nl/~pault/. 
-comparison_colors = [
-    {r = 102, g = 204, b = 238}, # cyan
-    {r = 204, g = 187, b = 68},  # yellow
-    {r = 238, g = 102, b = 119}, # red
-    {r = 68, g = 119, b = 170},  # blue
-    {r = 170, g = 51, b = 119},  # purple
-    {r = 34, g = 136, b = 51},   # green
-#   {r = 187, g = 187, b = 187}, # grey
-]
diff --git a/crates/bpe/benches/result/appending-o200k.svg b/crates/bpe/images/performance-appending.svg
similarity index 95%
rename from crates/bpe/benches/result/appending-o200k.svg
rename to crates/bpe/images/performance-appending.svg
index 5474718..68b4865 100644
--- a/crates/bpe/benches/result/appending-o200k.svg
+++ b/crates/bpe/images/performance-appending.svg
[SVG chart markup changed; not reproduced here]
diff --git a/crates/bpe/images/performance-comparison.svg b/crates/bpe/images/performance-comparison.svg
new file mode 100644
index 0000000..ec6c3b7
--- /dev/null
+++ b/crates/bpe/images/performance-comparison.svg
[SVG chart markup added; not reproduced here]
diff --git a/crates/bpe/benches/result/counting-o200k.svg b/crates/bpe/images/performance-counting.svg
similarity index 95%
rename from crates/bpe/benches/result/counting-o200k.svg
rename to crates/bpe/images/performance-counting.svg
index 9b93d5f..d3d5296 100644
--- a/crates/bpe/benches/result/counting-o200k.svg
+++ b/crates/bpe/images/performance-counting.svg
[SVG chart markup changed; not reproduced here]
diff --git a/crates/bpe/benches/result/encoding-o200k.svg b/crates/bpe/images/performance-encoding.svg
similarity index 82%
rename from crates/bpe/benches/result/encoding-o200k.svg
rename to crates/bpe/images/performance-encoding.svg
index d0ffc09..ff8ec1a 100644
--- a/crates/bpe/benches/result/encoding-o200k.svg
+++ b/crates/bpe/images/performance-encoding.svg
[SVG chart markup changed; not reproduced here]
diff --git a/crates/bpe/benches/result/worstcase-o200k.svg b/crates/bpe/images/performance-worstcase.svg
similarity index 76%
rename from crates/bpe/benches/result/worstcase-o200k.svg
rename to crates/bpe/images/performance-worstcase.svg
index 7da8fca..03f6d3f 100644
--- a/crates/bpe/benches/result/worstcase-o200k.svg
+++ b/crates/bpe/images/performance-worstcase.svg
[SVG chart markup changed; not reproduced here]
diff --git a/crates/bpe/script/copy-benchmark-results b/crates/bpe/script/copy-benchmark-results
deleted file mode 100755
index ae045ed..0000000
--- a/crates/bpe/script/copy-benchmark-results
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-
-set -eu
-
-result_dir="benches/result"
-
-mkdir -p "$result_dir"
-
-for i in {counting,encoding,appending,worstcase}-o200k; do
-    rsvg-convert --format svg --output "$result_dir/$i.svg" --background-color white "target/criterion/reports/$i/lines.svg"
-done
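For completeness, the new `Tokenizer` API from `crates/bpe-openai/src/lib.rs` fits together as follows (a minimal sketch based on the methods shown above; the assertions encode the intended behavior, assuming `count` agrees with the length of `encode` as the implementations suggest):

```rust
fn main() {
    // The pre-built tokenizers are deserialized lazily on first use.
    let tok = bpe_openai::cl100k();
    let text = "Hello, world!";
    // `encode` and `count` both apply the pre-tokenization regex first,
    // so the reported count should match the length of the encoding.
    let tokens = tok.encode(text);
    assert_eq!(tok.count(text), tokens.len());
    // `decode` maps the token ids back to the original string.
    assert_eq!(tok.decode(&tokens).as_deref(), Some(text));
}
```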