Commit bcb4204

Merge pull request #62 from github/aneubeck/voyage
Add voyage3 model
2 parents: 0adc995 + a22f108

File tree

10 files changed: +230 -66 lines

crates/bpe-openai/Cargo.toml

Lines changed: 6 additions & 5 deletions
@@ -1,6 +1,6 @@
 [package]
 name = "bpe-openai"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 description = "Prebuilt fast byte-pair encoders for OpenAI."
 repository = "https://github.com/github/rust-gems"
@@ -13,18 +13,19 @@ crate-type = ["lib", "staticlib"]
 bench = false

 [dependencies]
-bpe = { version = "0.2.0", path = "../bpe" }
+bpe = { version = "0.2", path = "../bpe" }
 either = "1.13"
 regex-automata = "0.4"
 rmp-serde = "1"
+unicode-normalization = "0.1"

 [dev-dependencies]
-bpe = { version = "0.2.0", path = "../bpe", features = ["rand"] }
+bpe = { version = "0.2", path = "../bpe", features = ["rand"] }
 tiktoken-rs = "0.6"

 [build-dependencies]
-base64 = "0.22.1"
-bpe = { version = "0.2.0", path = "../bpe", features = ["tiktoken"] }
+base64 = "0.22"
+bpe = { version = "0.2", path = "../bpe", features = ["tiktoken"] }
 flate2 = "1.0"
 rmp-serde = "1"
 serde = "1"
crates/bpe-openai/build.rs

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,11 @@ fn main() {
         include_bytes!("data/o200k_base.tiktoken.gz"),
         17846336922010275747,
     );
+    serialize_tiktoken_bpe(
+        "voyage3_base",
+        include_bytes!("data/voyage3_base.tiktoken.gz"),
+        17846336922010275747,
+    );
     println!("cargo::rerun-if-changed=build.rs");
 }

crates/bpe-openai/data/voyage3_base.tiktoken.gz

1.08 MB (binary file not shown)

crates/bpe-openai/src/lib.rs

Lines changed: 63 additions & 19 deletions
@@ -8,6 +8,11 @@ use regex_automata::{
     Anchored, Input,
 };

+pub mod normalizer;
+
+pub use bpe::*;
+pub use normalizer::{Normalizable, NormalizedString};
+
 // Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
 // The look-ahead character is dropped from the match by the Pretokenizer iterator.
 // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
@@ -18,7 +23,7 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
@@ -35,11 +40,19 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });

-pub use bpe::*;
+static BPE_VOYAGE3_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict"));
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], true)
+        .expect("valid regex")
+});

 /// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
 /// The direct methods on this type pre-tokenize the input text and should
@@ -52,6 +65,8 @@ pub struct Tokenizer {
     pub bpe: BytePairEncoding,
     /// The pattern regex used to split the input.
     pub pre: Option<Pretokenizer>,
+    /// Indicates whether the input should be normalized with NFC.
+    nfc: bool,
 }

 pub struct Pretokenizer {
@@ -64,9 +79,9 @@ pub struct Pretokenizer {
 impl Tokenizer {
     /// Build a tokenizer with an optional pretokenization regex pattern.
     #[allow(clippy::result_large_err)]
-    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result<Self, BuildError> {
         let pre = pat.map(Pretokenizer::new).transpose()?;
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }

     /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
@@ -75,34 +90,41 @@ impl Tokenizer {
     pub fn new_lookahead(
         bpe: BytePairEncoding,
         patterns: &[(&str, bool)],
+        nfc: bool,
     ) -> Result<Self, BuildError> {
         let pre = Some(Pretokenizer::new_lookahead(patterns)?);
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }

     /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
     /// before counting.
-    pub fn count(&self, text: &str) -> usize {
-        self.split(text)
+    pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize {
+        let text = self.normalize(text);
+        self.split(text.as_str())
             .map(|piece| self.bpe.count(piece.as_bytes()))
             .sum()
     }

     /// Returns the token count iff the total token count stays below the specified token_limit.
     /// Otherwise, it returns none. This function can be faster than [`Self::count`] when the
     /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
-    pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
-        self.split(text).try_fold(0, |consumed, piece| {
+    ///
+    /// Note: This function assumes that the text is already normalized, so that this function can run
+    /// in roughly O(token_limit) time.
+    pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option<usize> {
+        let res: Option<usize> = self.split(text.as_str()).try_fold(0, |consumed, piece| {
             self.bpe
                 .count_till_limit(piece.as_bytes(), token_limit - consumed)
                 .map(|piece_count| consumed + piece_count)
-        })
+        });
+        res
     }

     /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
     /// encoding.
-    pub fn encode(&self, text: &str) -> Vec<u32> {
-        self.split(text)
+    pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec<u32> {
+        let text: NormalizedString<'_> = self.normalize(text);
+        self.split(text.as_str())
             .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
             .collect()
     }
@@ -114,12 +136,18 @@ impl Tokenizer {

     /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
     /// tokenizer does not have pre-tokenization, the iterator returns the full text.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         match &self.pre {
             Some(pre) => Either::Left(pre.split(text)),
             None => Either::Right(std::iter::once(text)),
         }
     }
+
+    /// Returns the normalized text if the tokenizer requires normalization.
+    /// If the input was already normalized, this function is a noop.
+    pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> {
+        text.normalize(self.nfc)
+    }
 }

 impl Pretokenizer {
@@ -143,7 +171,7 @@ impl Pretokenizer {
     }

     /// Returns an iterator with the text pieces after splitting with the regular expression.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         Splits {
             pat: &self.pat,
             lookahead: &self.lookahead,
@@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer {
     &BPE_O200K_BASE
 }

+pub fn voyage3_base() -> &'static Tokenizer {
+    &BPE_VOYAGE3_BASE
+}
+
 #[cfg(test)]
 mod tests {
     use bpe::byte_pair_encoding::{create_test_string, select_test_string};
@@ -233,9 +265,21 @@ mod tests {

     #[test]
     fn test_count_till_limit() {
-        assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
-        assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3),
+            Some(1)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3),
+            Some(2)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3),
+            Some(3)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3),
+            None
+        );
     }
 }

crates/bpe-openai/src/normalizer.rs

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+use std::borrow::Cow;
+
+use unicode_normalization::UnicodeNormalization;
+
+/// Type which represents a normalized string.
+/// This is to avoid calling normalize multiple times or forgetting to call normalization!
+///
+/// TODO: Annotate the type with the normalization type, once there are more than one.
+pub struct NormalizedString<'a>(Cow<'a, str>);
+
+impl<'a> NormalizedString<'a> {
+    /// Returns the normalized inner str buffer.
+    pub fn as_str(&self) -> &str {
+        &self.0
+    }
+
+    /// This function is unsafe, since the caller must ensure that the correct normalization
+    /// was used. The normalization may vary by tokenizer. This is mostly a backdoor which might
+    /// be handy for certain optimizations or for testing.
+    ///
+    /// # Safety
+    /// This is safe if `s` is in fact correctly normalized already. The caller is
+    /// responsible for ensuring that.
+    pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> {
+        NormalizedString(Cow::Borrowed(s))
+    }
+}
+
+/// Helper trait which converts string types into NormalizedString.
+/// Calling normalize on a NormalizedString is a no-op.
+pub trait Normalizable<'a> {
+    fn normalize(self, nfc: bool) -> NormalizedString<'a>;
+}
+
+impl<'a> Normalizable<'a> for &'a str {
+    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
+        if nfc {
+            NormalizedString(self.nfc().collect())
+        } else {
+            NormalizedString(Cow::Borrowed(self))
+        }
+    }
+}
+
+impl<'a, T> Normalizable<'a> for &'a T
+where
+    T: AsRef<str>,
+{
+    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
+        self.as_ref().normalize(nfc)
+    }
+}
+
+impl<'a> Normalizable<'a> for NormalizedString<'a> {
+    fn normalize(self, _: bool) -> NormalizedString<'a> {
+        self
+    }
+}
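A second sketch, showing how these normalizer types are meant to compose with `count_till_limit`, which now requires an already-normalized input: normalize once via `Tokenizer::normalize`, then reuse the resulting `NormalizedString` (re-normalizing it is a no-op by design). The text and token limit below are made-up examples, not part of the commit.

use bpe_openai::voyage3_base;

fn main() {
    let tok = voyage3_base();

    // Normalize once, then reuse the NormalizedString for repeated limit checks.
    let text = tok.normalize("some documentation text to budget against a context window");
    if let Some(count) = tok.count_till_limit(&text, 16) {
        println!("fits within the limit: {count} tokens");
    } else {
        println!("exceeds 16 tokens");
    }
}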

crates/bpe/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "bpe"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"
 description = "Fast byte-pair encoding implementation."
 repository = "https://github.com/github/rust-gems"

crates/bpe/benchmarks/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ path = "equivalence.rs"
 test = true

 [dependencies]
-bpe = { path = "../../bpe" }
+bpe = { path = "../../bpe", features = ["rand", "tiktoken"] }
 bpe-openai = { path = "../../bpe-openai" }
 criterion = "0.5"
 rand = "0.9"
