Skip to content

Commit 777ce73

Browse files
committed
Partial sync of codebase
1 parent bb5805d commit 777ce73

File tree

3 files changed: +41 −2 lines changed

tiktoken/core.py

+35-1
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22

33
import functools
44
from concurrent.futures import ThreadPoolExecutor
5-
from typing import AbstractSet, Collection, Literal, NoReturn, Sequence
5+
from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence
66

77
import regex
88

99
from tiktoken import _tiktoken
1010

11+
if TYPE_CHECKING:
12+
import numpy as np
13+
import numpy.typing as npt
14+
1115

1216
class Encoding:
1317
def __init__(
@@ -128,6 +132,32 @@ def encode(
128132
text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
129133
return self._core_bpe.encode(text, allowed_special)
130134

135+
def encode_to_numpy(
136+
self,
137+
text: str,
138+
*,
139+
allowed_special: Literal["all"] | AbstractSet[str] = set(), # noqa: B006
140+
disallowed_special: Literal["all"] | Collection[str] = "all",
141+
) -> npt.NDArray[np.uint32]:
142+
"""Encodes a string into tokens, returning a numpy array.
143+
144+
Avoids the overhead of copying the token buffer into a Python list.
145+
"""
146+
if allowed_special == "all":
147+
allowed_special = self.special_tokens_set
148+
if disallowed_special == "all":
149+
disallowed_special = self.special_tokens_set - allowed_special
150+
if disallowed_special:
151+
if not isinstance(disallowed_special, frozenset):
152+
disallowed_special = frozenset(disallowed_special)
153+
if match := _special_token_regex(disallowed_special).search(text):
154+
raise_disallowed_special_token(match.group())
155+
156+
import numpy as np
157+
158+
buffer = self._core_bpe.encode_to_tiktoken_buffer(text, self.special_tokens_set)
159+
return np.frombuffer(buffer, dtype=np.uint32)
160+
131161
def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
132162
"""Encodes a list of strings into tokens, in parallel, ignoring special tokens.
133163
@@ -332,6 +362,10 @@ def eot_token(self) -> int:
332362
def special_tokens_set(self) -> set[str]:
333363
return set(self._special_tokens.keys())
334364

365+
def is_special_token(self, token: int) -> bool:
366+
assert isinstance(token, int)
367+
return token in self._special_token_values
368+
335369
@property
336370
def n_vocab(self) -> int:
337371
"""For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""

tiktoken/load.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -154,5 +154,5 @@ def load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None = None)
154154
token, rank = line.split()
155155
ret[base64.b64decode(token)] = int(rank)
156156
except Exception as e:
157-
raise ValueError(f"Error parsing line {line} in {tiktoken_bpe_file}") from e
157+
raise ValueError(f"Error parsing line {line!r} in {tiktoken_bpe_file}") from e
158158
return ret

tiktoken/model.py

+5
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,25 @@
66
# TODO: these will likely be replaced by an API endpoint
77
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
88
"o1-": "o200k_base",
9+
"o3-": "o200k_base",
910
# chat
1011
"chatgpt-4o-": "o200k_base",
1112
"gpt-4o-": "o200k_base", # e.g., gpt-4o-2024-05-13
1213
"gpt-4-": "cl100k_base", # e.g., gpt-4-0314, etc., plus gpt-4-32k
1314
"gpt-3.5-turbo-": "cl100k_base", # e.g, gpt-3.5-turbo-0301, -0401, etc.
1415
"gpt-35-turbo-": "cl100k_base", # Azure deployment name
1516
# fine-tuned
17+
"ft:gpt-4o": "o200k_base",
1618
"ft:gpt-4": "cl100k_base",
1719
"ft:gpt-3.5-turbo": "cl100k_base",
1820
"ft:davinci-002": "cl100k_base",
1921
"ft:babbage-002": "cl100k_base",
2022
}
2123

2224
MODEL_TO_ENCODING: dict[str, str] = {
25+
# reasoning
26+
"o1": "o200k_base",
27+
"o3": "o200k_base",
2328
# chat
2429
"gpt-4o": "o200k_base",
2530
"gpt-4": "cl100k_base",

0 commit comments

Comments (0)