@@ -8,6 +8,11 @@ use regex_automata::{
     Anchored, Input,
 };
 
+pub mod normalizer;
+
+pub use bpe::*;
+pub use normalizer::{Normalizable, NormalizedString};
+
 // Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
 // The look-ahead character is dropped from the match by the Pretokenizer iterator.
 // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
@@ -18,7 +23,7 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
 
@@ -35,11 +40,19 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
 
-pub use bpe::*;
+static BPE_VOYAGE3_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict"));
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], true)
+        .expect("valid regex")
+});
 
 /// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
 /// The direct methods on this type pre-tokenize the input text and should
@@ -52,6 +65,8 @@ pub struct Tokenizer {
     pub bpe: BytePairEncoding,
     /// The pattern regex used to split the input.
     pub pre: Option<Pretokenizer>,
+    /// Indicates whether the input should be normalized with NFC.
+    nfc: bool,
 }
 
 pub struct Pretokenizer {
@@ -64,9 +79,9 @@ pub struct Pretokenizer {
 impl Tokenizer {
     /// Build a tokenizer with an optional pretokenization regex pattern.
     #[allow(clippy::result_large_err)]
-    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result<Self, BuildError> {
         let pre = pat.map(Pretokenizer::new).transpose()?;
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }
 
     /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
@@ -75,34 +90,41 @@ impl Tokenizer {
     pub fn new_lookahead(
         bpe: BytePairEncoding,
         patterns: &[(&str, bool)],
+        nfc: bool,
     ) -> Result<Self, BuildError> {
         let pre = Some(Pretokenizer::new_lookahead(patterns)?);
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }
 
     /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
     /// before counting.
-    pub fn count(&self, text: &str) -> usize {
-        self.split(text)
+    pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize {
+        let text = self.normalize(text);
+        self.split(text.as_str())
             .map(|piece| self.bpe.count(piece.as_bytes()))
             .sum()
     }
 
     /// Returns the token count iff the total token count stays below the specified token_limit.
     /// Otherwise, it returns none. This function can be faster than [`Self::count`]` when the
     /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
-    pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
-        self.split(text).try_fold(0, |consumed, piece| {
+    ///
+    /// Note: This function assumes that the text is already normalized, so that this function can run
+    /// in roughly O(token_limit) time.
+    pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option<usize> {
+        let res: Option<usize> = self.split(text.as_str()).try_fold(0, |consumed, piece| {
             self.bpe
                 .count_till_limit(piece.as_bytes(), token_limit - consumed)
                 .map(|piece_count| consumed + piece_count)
-        })
+        });
+        res
     }
 
     /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
     /// encoding.
-    pub fn encode(&self, text: &str) -> Vec<u32> {
-        self.split(text)
+    pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec<u32> {
+        let text: NormalizedString<'_> = self.normalize(text);
+        self.split(text.as_str())
            .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
            .collect()
    }
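
Usage note (not part of the patch): with the signature change above, count_till_limit now takes a pre-normalized &NormalizedString, so callers are expected to normalize once via Tokenizer::normalize and pass the result. A minimal sketch, assuming the crate is consumed as `bpe_openai`; the helper function and sample text are illustrative, not from this diff:

    // Hypothetical helper; only `cl100k_base`, `normalize`, and `count_till_limit`
    // come from the diff above, the rest is assumed for illustration.
    use bpe_openai::cl100k_base;

    fn fits_in_budget(text: &str, budget: usize) -> bool {
        let tok = cl100k_base();
        // Normalize once up front (a no-op when the tokenizer does not require NFC)...
        let normalized = tok.normalize(text);
        // ...then check against the budget in roughly O(budget) time.
        tok.count_till_limit(&normalized, budget).is_some()
    }
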
@@ -114,12 +136,18 @@ impl Tokenizer {
 
     /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
     /// tokenizer does not have pre-tokenization, the iterator returns the full text.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         match &self.pre {
             Some(pre) => Either::Left(pre.split(text)),
             None => Either::Right(std::iter::once(text)),
         }
     }
+
+    /// Returns the normalized text if the tokenizer requires normalization.
+    /// If the input was already normalized, this function is a noop.
+    pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> {
+        text.normalize(self.nfc)
+    }
 }
 
 impl Pretokenizer {
@@ -143,7 +171,7 @@ impl Pretokenizer {
     }
 
     /// Returns an iterator with the text pieces after splitting with the regular expression.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         Splits {
             pat: &self.pat,
             lookahead: &self.lookahead,
@@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer {
     &BPE_O200K_BASE
 }
 
+pub fn voyage3_base() -> &'static Tokenizer {
+    &BPE_VOYAGE3_BASE
+}
+
 #[cfg(test)]
 mod tests {
     use bpe::byte_pair_encoding::{create_test_string, select_test_string};
@@ -233,9 +265,21 @@ mod tests {
 
     #[test]
     fn test_count_till_limit() {
-        assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
-        assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3),
+            Some(1)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3),
+            Some(2)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3),
+            Some(3)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3),
+            None
+        );
     }
 }
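
Taken together, a hedged end-to-end sketch of the API surface this patch introduces. The crate path and inputs are assumptions; that `&str` implements `Normalizable` is inferred from the tests above, and `voyage3_base()` is the tokenizer constructed with nfc = true in the diff:

    // Illustrative only: encode/count accept any Normalizable input and normalize it
    // internally before pre-tokenization and BPE encoding.
    use bpe_openai::{cl100k_base, voyage3_base};

    fn main() {
        let tokens = cl100k_base().encode("Hello, world!");
        assert_eq!(cl100k_base().count("Hello, world!"), tokens.len());

        // voyage3_base applies NFC in normalize() (e.g. composing combining accents),
        // so limit-bounded counting goes through an explicitly normalized string.
        let nfc_text = voyage3_base().normalize("He\u{0301}llo, wo\u{0308}rld!");
        let within = voyage3_base().count_till_limit(&nfc_text, 8);
        println!("{} tokens, within limit: {}", tokens.len(), within.is_some());
    }
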