aeromomo · haosenwang1018 · Feb 24, 2026
diff --git a/scripts/lib/tokens.py b/scripts/lib/tokens.py
@@ -29,8 +29,22 @@
 CHARS_PER_TOKEN = 4  # fallback for ASCII text
 CJK_CHARS_PER_TOKEN = 1.5  # CJK characters average ~1.5 chars/token
 
-# CJK unified ideographs + common ranges
-_CJK_RE = re.compile(r'[\u4e00-\u9fff\u3400-\u4dbf\u3000-\u303f\uff00-\uffef]')
+# Matches one CJK-family character: CJK ideographs, Japanese kana, Korean
+# Hangul (precomposed syllables and standalone/conjoining jamo), and related
+# symbol blocks. Ranges are in code-point order, one Unicode block per line.
+_CJK_RE = re.compile(
+    r'['
+    r'\u1100-\u11ff'    # Hangul Jamo (conjoining; appears in NFD-normalized text)
+    r'\u3000-\u303f'    # CJK Symbols and Punctuation
+    r'\u3040-\u309f'    # Hiragana
+    r'\u30a0-\u30ff'    # Katakana
+    r'\u3130-\u318f'    # Hangul Compatibility Jamo (standalone jamo)
+    r'\u3400-\u4dbf'    # CJK Unified Ideographs Extension A
+    r'\u4e00-\u9fff'    # CJK Unified Ideographs
+    r'\uac00-\ud7af'    # Hangul Syllables
+    r'\uff00-\uffef'    # Halfwidth and Fullwidth Forms
+    r']'
+)
 
 
 def _heuristic_tokens(text: str) -> int:

diff --git a/tests/test_lib_tokens.py b/tests/test_lib_tokens.py
@@ -28,6 +28,28 @@ def test_chinese(self):
         result = estimate_tokens("你好世界这是一段中文")
         assert result > 0
 
+    def test_korean(self):
+        """Hangul text containing a space yields a positive token estimate."""
+        assert estimate_tokens("안녕하세요 세계") > 0
+
+    def test_japanese_hiragana(self):
+        """Hiragana followed by kanji yields a positive token estimate."""
+        assert estimate_tokens("こんにちは世界") > 0
+
+    def test_japanese_katakana(self):
+        """Katakana-only text yields a positive token estimate."""
+        assert estimate_tokens("カタカナテスト") > 0
+
+    def test_cjk_heuristic_covers_all_scripts(self):
+        """Hangul and kana should get the CJK token rate, not the ASCII rate."""
+        if using_tiktoken():
+            pytest.skip("heuristic path not active when tiktoken is installed")
+        # At CJK_CHARS_PER_TOKEN (1.5) five characters come to ~3 tokens; at the
+        # ASCII CHARS_PER_TOKEN (4) even seven characters come to at most 2.
+        assert estimate_tokens("안녕하세요") >= 3  # Hangul syllables
+        assert estimate_tokens("こんにちは") >= 3  # Hiragana
+        assert estimate_tokens("カタカナテスト") >= 3  # Katakana
+
     def test_mixed_language(self):
         result = estimate_tokens("Hello 你好 World 世界")
         assert result > 0