From 5318bba8ea30402cfab7bcf9241a16010312a6c2 Mon Sep 17 00:00:00 2001
From: CodeBeaverAI
Date: Tue, 18 Feb 2025 20:08:15 +0100
Subject: [PATCH 1/3] test: Add coverage improvement test for tests/test_core.py

---
 tests/test_core.py | 245 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)
 create mode 100644 tests/test_core.py

diff --git a/tests/test_core.py b/tests/test_core.py
new file mode 100644
index 0000000..bab005d
--- /dev/null
+++ b/tests/test_core.py
@@ -0,0 +1,245 @@
+import pickle
+import pytest
+import regex
+import tiktoken.registry as registry
+from tiktoken import (
+    _tiktoken,
+)
+from tiktoken.core import (
+    Encoding,
+    raise_disallowed_special_token,
+)
+from typing import (
+    AbstractSet,
+    Collection,
+    Sequence,
+)
+
+
+class DummyCoreBPE:
+    """Dummy implementation of CoreBPE: every character or byte maps to one token."""
+
+    def __init__(self, mergeable_ranks, special_tokens, pat_str):
+        self.mergeable_ranks = mergeable_ranks
+        self.special_tokens = special_tokens
+        self.pat_str = pat_str
+
+    def encode_ordinary(self, text: str) -> list[int]:
+        return [ord(ch) for ch in text]
+
+    def encode(self, text: str, allowed_special: set) -> list[int]:
+        return [ord(ch) for ch in text]
+
+    def encode_with_unstable(
+        self, text: str, allowed_special: set
+    ) -> tuple[list[int], list[list[int]]]:
+        # Slice rather than index so that empty text does not raise IndexError.
+        return ([ord(ch) for ch in text[:1]], [[ord(ch) for ch in text]])
+
+    def encode_single_token(self, text_or_bytes: bytes) -> int:
+        return sum(text_or_bytes)
+
+    def decode_bytes(self, tokens: Sequence[int]) -> bytes:
+        return bytes(tokens)
+
+    def decode_single_token_bytes(self, token: int) -> bytes:
+        return bytes([token % 256])
+
+    def token_byte_values(self) -> list[bytes]:
+        return list(self.mergeable_ranks.keys())
+
+    def encode_single_piece(self, text_or_bytes: bytes) -> list[int]:
+        return list(text_or_bytes)
+
+    def _encode_bytes(self, text: bytes) -> list[int]:
+        return list(text)
+
+
+@pytest.fixture(autouse=True)
+def use_dummy_core_bpe(monkeypatch):
+    """
+    Replaces _tiktoken.CoreBPE with DummyCoreBPE for the duration of each test.
+    """
+    monkeypatch.setattr(_tiktoken, "CoreBPE", DummyCoreBPE)
+
+
+def test_encoding_dummy_flow():
+    """
+    Test the basic operations of Encoding (encoding, decoding, batch processing, special token handling,
+    and private methods) using a dummy CoreBPE implementation.
+    """
+    mergeable_ranks = {b"a": 1, b"b": 2}
+    special_tokens = {"<|endoftext|>": 999}
+    pat_str = ".+?"
+    enc = Encoding(
+        "dummy",
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks,
+        special_tokens=special_tokens,
+    )
+    assert "dummy" in repr(enc)
+    result = enc.encode_ordinary("ab")
+    assert result == [97, 98]
+    text_with_special = "hello <|endoftext|> world"
+    with pytest.raises(ValueError) as excinfo:
+        enc.encode(text_with_special)
+    # Inspect the error after the block: code following the raising call inside it never runs.
+    assert "<|endoftext|>" in str(excinfo.value)
+    allowed = "all"
+    tokens = enc.encode("<|endoftext|>", allowed_special=allowed)
+    expected = [ord(c) for c in "<|endoftext|>"]
+    assert tokens == expected
+    batch_result = enc.encode_batch(["a", "b"])
+    assert batch_result == [[97], [98]]
+    stable, completions = enc.encode_with_unstable("abc", allowed_special=allowed)
+    assert stable == [ord("a")]
+    assert completions == [[ord("a"), ord("b"), ord("c")]]
+    single_token = enc.encode_single_token("a")
+    assert single_token == 97
+    decoded_bytes = enc.decode_bytes([65, 66])
+    assert decoded_bytes == b"AB"
+    decoded_text = enc.decode([65, 66])
+    assert decoded_text == "AB"
+    token_byte = enc.decode_single_token_bytes(65)
+    assert token_byte == b"A"
+    tokens_bytes = enc.decode_tokens_bytes([65, 66])
+    assert tokens_bytes == [b"A", b"B"]
+    text_out, offsets = enc.decode_with_offsets([65, 66])
+    assert text_out == "AB"
+    assert offsets == [0, 1]
+    decoded_batch = enc.decode_batch([[65, 66]])
+    assert decoded_batch == ["AB"]
+    decoded_bytes_batch = enc.decode_bytes_batch([[65, 66]])
+    assert decoded_bytes_batch == [b"AB"]
+    token_values = enc.token_byte_values()
+    assert set(token_values) == set(mergeable_ranks.keys())
+    state = pickle.dumps(enc)
+    new_enc = pickle.loads(state)
+    assert repr(new_enc) == repr(enc)
+    assert new_enc.encode_ordinary("x") == [120]
+    piece_tokens = enc._encode_single_piece("abc")
+    assert piece_tokens == list(b"abc")
+    bytes_tokens = enc._encode_bytes(b"abc")
+    assert bytes_tokens == list(b"abc")
+
+
+@pytest.fixture(autouse=True)
+def dummy_registry(monkeypatch):
+    """
+    Swaps tiktoken.registry.ENCODINGS and get_encoding for an isolated, test-local dict.
+    monkeypatch restores the originals afterwards, so entries a test registers do not leak.
+    """
+    dummy_encodings = {"dummy": None}
+    monkeypatch.setattr(registry, "ENCODINGS", dummy_encodings)
+    monkeypatch.setattr(registry, "get_encoding", lambda name: dummy_encodings[name])
+
+
+def test_registered_encoding_pickle(monkeypatch):
+    """
+    Test that when an Encoding is registered in tiktoken.registry,
+    the pickling process uses the registry branch of __getstate__ (returning the encoding name)
+    and that the encoding is correctly restored via __setstate__.
+    """
+    # Distinct ids: the special token must not reuse a mergeable rank.
+    mergeable_ranks = {b"x": 0}
+    special_tokens = {"<|endoftext|>": 1}
+    pat_str = ".+?"
+    enc = Encoding(
+        "registered_dummy",
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks,
+        special_tokens=special_tokens,
+        explicit_n_vocab=2,
+    )
+    registry.ENCODINGS["registered_dummy"] = enc
+    state = enc.__getstate__()
+    assert (
+        state == "registered_dummy"
+    ), "Expected __getstate__ to return the encoding name when registered."
+    monkeypatch.setattr(registry, "get_encoding", lambda name: registry.ENCODINGS[name])
+    pickled = pickle.dumps(enc)
+    unpickled = pickle.loads(pickled)
+    assert unpickled.name == enc.name
+    assert unpickled.encode_ordinary("x") == enc.encode_ordinary("x")
+
+
+def test_encoding_explicit_n_vocab_and_setstate():
+    """
+    Test that:
+    1. An Encoding with an explicit_n_vocab that is inconsistent with the provided mergeable ranks
+       and special tokens raises an AssertionError.
+    2. __setstate__ properly re-initializes an Encoding instance when provided with a state dictionary.
+    """
+    mergeable_ranks_err = {b"a": 1}
+    special_tokens_err = {"<|endoftext|>": 5}
+    pat_str = ".+?"
+    with pytest.raises(AssertionError):
+        Encoding(
+            "error_case",
+            pat_str=pat_str,
+            mergeable_ranks=mergeable_ranks_err,
+            special_tokens=special_tokens_err,
+            explicit_n_vocab=2,
+        )
+    mergeable_ranks_valid = {b"a": 0}
+    special_tokens_valid = {"<|endoftext|>": 1}
+    explicit_n_vocab = 2
+    enc = Encoding(
+        "valid_case",
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks_valid,
+        special_tokens=special_tokens_valid,
+        explicit_n_vocab=explicit_n_vocab,
+    )
+    assert enc.n_vocab == explicit_n_vocab
+    new_state = {
+        "name": "state_case",
+        "pat_str": pat_str,
+        "mergeable_ranks": mergeable_ranks_valid,
+        "special_tokens": special_tokens_valid,
+    }
+    enc2 = Encoding(
+        "dummy",
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks_valid,
+        special_tokens=special_tokens_valid,
+    )
+    enc2.__setstate__(new_state)
+    assert enc2.name == "state_case"
+    result = enc2.encode_ordinary("a")
+    assert result == [ord("a")]
+
+
+def test_encode_ordinary_fallback(monkeypatch):
+    """
+    Test that if encode_ordinary initially raises a UnicodeEncodeError,
+    the fallback mechanism (encoding with utf-16 surrogatepass and decode with replace)
+    is correctly applied and returns the expected tokens.
+    """
+    mergeable_ranks = {b"x": ord("x")}
+    special_tokens = {"<|endoftext|>": 999}
+    pat_str = ".+?"
+    enc = Encoding(
+        "fallback_test",
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks,
+        special_tokens=special_tokens,
+    )
+    counter = [0]
+    original_encode = enc._core_bpe.encode_ordinary
+
+    def fake_encode_ordinary(text):
+        if counter[0] == 0:
+            counter[0] += 1
+            raise UnicodeEncodeError("utf-8", text, 0, 1, "forced error")
+        else:
+            return original_encode(text)
+
+    monkeypatch.setattr(enc._core_bpe, "encode_ordinary", fake_encode_ordinary)
+    test_text = "test_text"
+    fallback_text = test_text.encode("utf-16", "surrogatepass").decode(
+        "utf-16", "replace"
+    )
+    expected = [ord(c) for c in fallback_text]
+    result = enc.encode_ordinary(test_text)
+    assert result == expected
From f30bccbb910d62c122d93559bdd61829d43dd3ee Mon Sep 17 00:00:00 2001
From: CodeBeaverAI
Date: Tue, 18 Feb 2025 20:08:17 +0100
Subject: [PATCH 2/3] test: Add coverage improvement test for tests/test_model.py

---
 tests/test_model.py | 69 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 tests/test_model.py

diff --git a/tests/test_model.py b/tests/test_model.py
new file mode 100644
index 0000000..face423
--- /dev/null
+++ b/tests/test_model.py
@@ -0,0 +1,69 @@
+import pytest
+from tiktoken.core import (
+    Encoding,
+)
+from tiktoken.model import (
+    encoding_for_model,
+    encoding_name_for_model,
+)
+
+
+def test_encoding_name_for_model_prefix_and_unknown():
+    """
+    Test encoding_name_for_model for three scenarios:
+    1. Direct mapping: The model name exactly exists in MODEL_TO_ENCODING.
+    2. Prefix mapping: The model name isn't a direct key but starts with a known prefix.
+    3. Unrecognized model: The model name doesn't match any mapping, causing a KeyError.
+    """
+    direct_model = "gpt-4"
+    direct_encoding = encoding_name_for_model(direct_model)
+    assert (
+        direct_encoding == "cl100k_base"
+    ), f"Expected direct mapping for {direct_model} to be 'cl100k_base', got {direct_encoding}"
+    prefix_model = "gpt-3.5-turbo-FAKE"
+    prefix_encoding = encoding_name_for_model(prefix_model)
+    assert (
+        prefix_encoding == "cl100k_base"
+    ), f"Expected prefix mapping for {prefix_model} to be 'cl100k_base', got {prefix_encoding}"
+    with pytest.raises(KeyError) as exc_info:
+        encoding_name_for_model("nonexistent-model")
+    assert "nonexistent-model" in str(exc_info.value)
+
+
+def test_encoding_for_model_returns_encoding_instance():
+    """
+    Test that encoding_for_model returns an instance of Encoding for a valid model name.
+
+    This verifies that the higher-level helper function properly utilizes the underlying
+    get_encoding function to return a valid Encoding instance.
+    """
+    model_name = "gpt2"
+    encoding = encoding_for_model(model_name)
+    assert isinstance(
+        encoding, Encoding
+    ), f"Expected encoding for {model_name} to be an instance of Encoding, got {type(encoding)}"
+
+
+def test_encoding_name_for_model_empty_string():
+    """
+    Test that encoding_name_for_model raises a KeyError when passed an empty model name.
+    The test validates that the error message includes the phrase "Could not automatically map"
+    to indicate that no mapping was found for the given (empty) model name.
+    """
+    with pytest.raises(KeyError) as exc_info:
+        encoding_name_for_model("")
+    error_message = str(exc_info.value)
+    assert "Could not automatically map" in error_message
+
+
+def test_encoding_for_model_invalid_model_raises_key_error():
+    """
+    Test that encoding_for_model raises a KeyError when provided with an unrecognized model name.
+
+    This ensures that the wrapper function properly propagates errors from
+    encoding_name_for_model when no encoding mapping exists.
+    """
+    invalid_model_name = "unknown-model-123"
+    with pytest.raises(KeyError) as exc_info:
+        encoding_for_model(invalid_model_name)
+    assert "unknown-model-123" in str(exc_info.value)
From c6133e8fb0c67ea0e2710985cee7bcdfbd011e2c Mon Sep 17 00:00:00 2001
From: CodeBeaverAI
Date: Tue, 18 Feb 2025 20:08:18 +0100
Subject: [PATCH 3/3] chore: Add codebeaver.yml configuration

---
 codebeaver.yml | 36 ++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 codebeaver.yml

diff --git a/codebeaver.yml b/codebeaver.yml
new file mode 100644
index 0000000..2ce3eba
--- /dev/null
+++ b/codebeaver.yml
@@ -0,0 +1,36 @@
+ignore:
+- '**/target/**'
+- '**/__pycache__'
+- '**/*.pyc'
+- .git/**
+- .github/**
+- .pytest_cache/**
+- '*.egg-info/**'
+- __init__.py
+- '*__init__.py'
+main_service: python
+services:
+  python:
+    environment:
+    - RUSTUP_HOME=/usr/local/rustup
+    - CARGO_HOME=/usr/local/cargo
+    - PATH=/usr/local/cargo/bin:$PATH
+    image: python:3.11
+setup_commands:
+- apt-get update && apt-get install -y curl build-essential pkg-config
+- curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --profile
+  minimal
+# POSIX "." instead of "source", which a plain sh (e.g. dash) does not provide.
+- . "$CARGO_HOME/env"
+- python -m pip install --upgrade pip
+# Quoted so the shell does not parse ">=1.5.2" as an output redirection.
+- pip install setuptools wheel 'setuptools-rust>=1.5.2'
+- pip install pytest hypothesis coverage pytest-json-report
+- pip install -e .
+single_file_test_commands:
+- coverage run -m pytest "$TEST_FILE" --import-mode=append --json-report
+- coverage json
+test_commands:
+- coverage run -m pytest tests --import-mode=append --show-capture=no --json-report
+- coverage combine || true
+- coverage json -i || true