Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions scripts/lib/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,18 @@
# Characters-per-token ratios for the heuristic estimate.
# NOTE(review): presumably only consulted when tiktoken is unavailable — confirm.
CHARS_PER_TOKEN = 4 # fallback ratio for ASCII text (~4 characters per token)
CJK_CHARS_PER_TOKEN = 1.5 # CJK characters average ~1.5 chars/token

# CJK unified ideographs + common ranges
_CJK_RE = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\u3000-\u303f\uff00-\uffef]')
# Matches a single character that should be counted at the CJK rate:
# CJK ideographs, Japanese kana, Korean Hangul (precomposed syllables and
# standalone jamo), plus the related punctuation and width-variant blocks.
# Ranges are listed in code-point order, one Unicode block per line.
_CJK_RE = re.compile(
    r'['
    r'\u1100-\u11ff'  # Hangul Jamo
    r'\u3000-\u303f'  # CJK Symbols and Punctuation
    r'\u3040-\u309f'  # Hiragana
    r'\u30a0-\u30ff'  # Katakana
    r'\u3130-\u318f'  # Hangul Compatibility Jamo
    r'\u31f0-\u31ff'  # Katakana Phonetic Extensions
    r'\u3400-\u4dbf'  # CJK Unified Ideographs Extension A
    r'\u4e00-\u9fff'  # CJK Unified Ideographs
    r'\uac00-\ud7af'  # Hangul Syllables
    r'\uf900-\ufaff'  # CJK Compatibility Ideographs
    r'\uff00-\uffef'  # Halfwidth and Fullwidth Forms
    r']'
)


def _heuristic_tokens(text: str) -> int:
Expand Down
22 changes: 22 additions & 0 deletions tests/test_lib_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,28 @@ def test_chinese(self):
result = estimate_tokens("你好世界这是一段中文")
assert result > 0

def test_korean(self):
    """Korean (Hangul) text yields a positive token estimate."""
    assert estimate_tokens("안녕하세요 세계") > 0

def test_japanese_hiragana(self):
    """Japanese text mixing hiragana and kanji yields a positive token estimate."""
    assert estimate_tokens("こんにちは世界") > 0

def test_japanese_katakana(self):
    """Katakana-only text yields a positive token estimate."""
    assert estimate_tokens("カタカナテスト") > 0

def test_cjk_heuristic_covers_all_scripts(self):
    """Korean and Japanese should get CJK token rates, not ASCII rates."""
    if using_tiktoken():
        pytest.skip("heuristic path not active when tiktoken is installed")
    # Each sample is a short run of one script. At the CJK rate
    # (1.5 chars/token) five or more characters give at least 3 tokens;
    # at the ASCII rate (4 chars/token) the same text would come out at
    # only ~1-2 tokens, so `>= 3` distinguishes the two paths.
    samples = {
        "hangul": "안녕하세요",
        "hiragana": "こんにちは",
        "katakana": "カタカナテスト",
    }
    for script, text in samples.items():
        # The script name is the assertion message so a failure says which
        # range is missing from the heuristic.
        assert estimate_tokens(text) >= 3, script

def test_mixed_language(self):
result = estimate_tokens("Hello 你好 World 世界")
assert result > 0
Expand Down