crates/bpe-openai/README.md (+1, -5)
@@ -5,17 +5,13 @@ Serialized BPE instances are generated during build and lazily loaded at runtime
 The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
 For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.

-Supported token sets:
+Supported tokenizers:

 - r50k
 - p50k
 - cl100k
 - o200k

-> **⚠ CAUTION ⚠**
-> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
-> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
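The README now advertises full tokenizers rather than raw BPE instances. A minimal usage sketch, assuming the crate is imported as `bpe_openai` (per the `crates/bpe-openai` path) and that `count` takes `&str`, as the updated tests further down suggest:

```rust
// Minimal usage sketch; `bpe_openai` as the crate name is an assumption
// based on the package path, and `count(&str)` is taken from the tests below.
fn main() {
    // The tokenizer is deserialized lazily, once per process.
    let tok = bpe_openai::cl100k();
    let n = tok.count("Hello, world!");
    println!("cl100k token count: {n}");
}
```

The remaining hunks route the public API through the new `Tokenizer` type, which bundles the raw byte-pair encoding with the regex-based input splitting that the removed caution note said was missing.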
+                assert_eq!(*start, m.start(), "pattern should match all input text");
+                *start = m.end();
+                Some(m.as_str())
+            })),
+            None => Either::Right(std::iter::once(text)),
+        }
+    }
+}
+
+pub fn r50k() -> &'static Tokenizer {
     &BPE_R50K
 }

-pub fn p50k() -> &'static BytePairEncoding {
+pub fn p50k() -> &'static Tokenizer {
     &BPE_P50K
 }

-pub fn cl100k() -> &'static BytePairEncoding {
+pub fn cl100k() -> &'static Tokenizer {
     &BPE_CL100K
 }

-pub fn o200k() -> &'static BytePairEncoding {
+pub fn o200k() -> &'static Tokenizer {
     &BPE_O200K
 }
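The hunk above is truncated at the top, so for orientation here is a sketch of the structure its tail plausibly belongs to. Only the `scan` closure, the `Either` arms, and the `bpe` field (used later as `BPE_CL100K.bpe`) are visible in this diff; the field name `pat`, the `regex::Regex` type, the import path for `BytePairEncoding`, and the `count` body are assumptions:

```rust
// Sketch only; see the assumptions listed above.
use bpe::byte_pair_encoding::BytePairEncoding;
use either::Either;
use regex::Regex;

pub struct Tokenizer {
    /// The raw byte-pair encoding (the crate re-exports `bpe`).
    pub bpe: BytePairEncoding,
    /// Optional pre-tokenization pattern applied before BPE (assumed name).
    pub pat: Option<Regex>,
}

impl Tokenizer {
    /// Counts tokens by splitting first and summing per-piece counts (assumed body).
    pub fn count(&self, text: &str) -> usize {
        self.split(text).map(|piece| self.bpe.count(piece.as_bytes())).sum()
    }

    /// Yields the regex matches, or the whole input when no pattern is set.
    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
        match &self.pat {
            Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
                // The pattern must tile the input with no gaps.
                assert_eq!(*start, m.start(), "pattern should match all input text");
                *start = m.end();
                Some(m.as_str())
            })),
            None => Either::Right(std::iter::once(text)),
        }
    }
}
```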
@@ -48,25 +115,25 @@ mod tests {

     #[test]
     fn can_load_r50k() {
-        r50k().count("".as_bytes());
+        r50k().count("");
     }

     #[test]
     fn can_load_p50k() {
-        p50k().count("".as_bytes());
+        p50k().count("");
     }

     #[test]
     fn can_load_cl100k() {
-        cl100k().count("".as_bytes());
+        cl100k().count("");
     }

     #[test]
     fn can_load_o200k() {
-        o200k().count("".as_bytes());
+        o200k().count("");
     }

-    /// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
+    /// Test demonstrating a case where input splitting makes a difference.
     #[test]
     fn splitting_difference() {
         let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
@@ -78,20 +145,10 @@ mod tests {
         .map(|i| i as u32)
         .collect();

-        let without_splitting = BPE_CL100K.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);

-        let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
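This last hunk is the point of the change: encoding straight through the raw BPE (now reached via the `bpe` field) yields different tokens than the expected tiktoken output, and the regex literal disappears from the test, presumably because the pattern now lives behind the `Tokenizer`. A small sketch of the same contrast through the public API, assuming the `Tokenizer` layout sketched earlier:

```rust
// Illustrative only; assumes the hypothetical layout sketched above.
fn main() {
    let text = "memset Bahrain"; // fragment of the test input above
    let tok = bpe_openai::cl100k();
    // With pre-tokenization: the pattern splits `text`, each piece is encoded alone.
    let with_split = tok.count(text);
    // Without pre-tokenization: the raw BPE sees the whole byte sequence at once.
    let without_split = tok.bpe.count(text.as_bytes());
    // The results can disagree, which is what `splitting_difference` asserts
    // (via token ids rather than counts).
    println!("with: {with_split}, without: {without_split}");
}
```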