Skip to content

Commit db3155c

Browse files
author
Lőrinc
committed
Add tests for humongous encodings
1 parent 1b9faf2 commit db3155c

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

tests/test_encoding.py

+16
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,22 @@
1111
from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
1212

1313

14+
@pytest.mark.parametrize("make_enc", ENCODING_FACTORIES)
def test_extremely_big_encoding(make_enc: Callable[[], tiktoken.Encoding]):
    """Check that very long, highly repetitive inputs survive encode/decode.

    Each seed string is repeated 1,000,000 times and round-tripped three
    ways: as-is, with a leading space, and with a leading space plus a
    trailing newline.
    """
    encoding = make_enc()
    for seed in ("^", "0", "a", "'s"):  # TODO " ", "\n" are still failing
        print(f"Validating `{seed}`")

        repeated = seed * 1_000_000
        spaced = " " + repeated
        # Same three inputs, in the same order, as the step-by-step version.
        for candidate in (repeated, spaced, spaced + "\n"):
            assert candidate == encoding.decode(encoding.encode(candidate))
28+
29+
1430
def test_simple():
1531
enc = tiktoken.get_encoding("gpt2")
1632
assert enc.encode("hello world") == [31373, 995]

0 commit comments

Comments
 (0)