diff --git a/omlx/admin/i18n/en.json b/omlx/admin/i18n/en.json index 8b3b79a5..26e4fec3 100644 --- a/omlx/admin/i18n/en.json +++ b/omlx/admin/i18n/en.json @@ -402,6 +402,11 @@ "modal.model_settings.turboquant_kv": "TurboQuant KV Cache", "modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Lower bits = more compression, higher bits = better quality. Supports 2 to 8 bits.", "modal.model_settings.turboquant_kv_bits_label": "Bits per channel", + "modal.model_settings.planarquant_kv": "PlanarQuant3 KV Cache", + "modal.model_settings.planarquant_kv_hint": "Compress KV cache with 2D Givens rotation + 3-bit Lloyd-Max quantization. Mutually exclusive with TurboQuant.", + "modal.model_settings.planarquant_kv_bits_label": "Quantization bits", + "modal.model_settings.planarquant_quantize_v": "Quantize V", + "modal.model_settings.planarquant_quantize_v_hint": "Also quantize V state (13x packed vs 4x for K-only).", "modal.model_settings.index_cache": "Index Cache", "modal.model_settings.index_cache_hint": "Skip redundant indexer computation in DSA layers (DeepSeek V3/GLM-5).", "modal.model_settings.index_cache_freq": "Frequency (every Nth layer keeps indexer)", diff --git a/omlx/admin/i18n/ja.json b/omlx/admin/i18n/ja.json index d2f10f22..10162cc9 100644 --- a/omlx/admin/i18n/ja.json +++ b/omlx/admin/i18n/ja.json @@ -402,6 +402,11 @@ "modal.model_settings.turboquant_kv": "TurboQuant KV Cache", "modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. 
Reduces memory ~60-75% with near-lossless quality for long context.", "modal.model_settings.turboquant_kv_bits_label": "Bits per channel", + "modal.model_settings.planarquant_kv": "PlanarQuant3 KVキャッシュ", + "modal.model_settings.planarquant_kv_hint": "2D Givens回転 + 3ビットLloyd-Max量子化でKVキャッシュを圧縮。TurboQuantとは相互排他。", + "modal.model_settings.planarquant_kv_bits_label": "量子化ビット", + "modal.model_settings.planarquant_quantize_v": "V量子化", + "modal.model_settings.planarquant_quantize_v_hint": "V状態も量子化します(Kのみ4xに対しパック13x)。", "modal.model_settings.index_cache": "インデックスキャッシュ", "modal.model_settings.index_cache_hint": "DSAレイヤーで冗長なインデクサー計算をスキップします(DeepSeek V3/GLM-5)。", "modal.model_settings.index_cache_freq": "頻度(N番目のレイヤーごとにインデクサーを保持)", diff --git a/omlx/admin/i18n/ko.json b/omlx/admin/i18n/ko.json index f4326ee5..426d36bd 100644 --- a/omlx/admin/i18n/ko.json +++ b/omlx/admin/i18n/ko.json @@ -402,6 +402,11 @@ "modal.model_settings.turboquant_kv": "TurboQuant KV Cache", "modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Reduces memory ~60-75% with near-lossless quality for long context.", "modal.model_settings.turboquant_kv_bits_label": "Bits per channel", + "modal.model_settings.planarquant_kv": "PlanarQuant3 KV 캐시", + "modal.model_settings.planarquant_kv_hint": "2D Givens 회전 + 3비트 Lloyd-Max 양자화로 KV 캐시를 압축합니다. 
TurboQuant와 상호 배타적입니다.", + "modal.model_settings.planarquant_kv_bits_label": "양자화 비트", + "modal.model_settings.planarquant_quantize_v": "V 양자화", + "modal.model_settings.planarquant_quantize_v_hint": "V 상태도 양자화합니다 (K 전용 4x 대비 패킹 13x).", "modal.model_settings.index_cache": "인덱스 캐시", "modal.model_settings.index_cache_hint": "DSA 레이어에서 중복 인덱서 연산을 건너뜁니다 (DeepSeek V3/GLM-5).", "modal.model_settings.index_cache_freq": "빈도 (N번째 레이어마다 인덱서 유지)", diff --git a/omlx/admin/i18n/zh-TW.json b/omlx/admin/i18n/zh-TW.json index 7911283f..74840f49 100644 --- a/omlx/admin/i18n/zh-TW.json +++ b/omlx/admin/i18n/zh-TW.json @@ -402,6 +402,11 @@ "modal.model_settings.turboquant_kv": "TurboQuant KV Cache", "modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Reduces memory ~60-75% with near-lossless quality for long context.", "modal.model_settings.turboquant_kv_bits_label": "Bits per channel", + "modal.model_settings.planarquant_kv": "PlanarQuant3 KV 快取", + "modal.model_settings.planarquant_kv_hint": "使用2D Givens旋轉 + 3位元Lloyd-Max量化壓縮KV快取。與TurboQuant互斥。", + "modal.model_settings.planarquant_kv_bits_label": "量化位元", + "modal.model_settings.planarquant_quantize_v": "量化V", + "modal.model_settings.planarquant_quantize_v_hint": "同時量化V狀態(打包13x,僅K為4x)。", "modal.model_settings.index_cache": "索引快取", "modal.model_settings.index_cache_hint": "跳過DSA層中冗餘的索引器運算(DeepSeek V3/GLM-5)。", "modal.model_settings.index_cache_freq": "頻率(每N層保留索引器)", diff --git a/omlx/admin/i18n/zh.json b/omlx/admin/i18n/zh.json index 66356782..670a917c 100644 --- a/omlx/admin/i18n/zh.json +++ b/omlx/admin/i18n/zh.json @@ -402,6 +402,11 @@ "modal.model_settings.turboquant_kv": "TurboQuant KV Cache", "modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. 
Reduces memory ~60-75% with near-lossless quality for long context.", "modal.model_settings.turboquant_kv_bits_label": "Bits per channel", + "modal.model_settings.planarquant_kv": "PlanarQuant3 KV 缓存", + "modal.model_settings.planarquant_kv_hint": "使用2D Givens旋转 + 3位Lloyd-Max量化压缩KV缓存。与TurboQuant互斥。", + "modal.model_settings.planarquant_kv_bits_label": "量化位数", + "modal.model_settings.planarquant_quantize_v": "量化V", + "modal.model_settings.planarquant_quantize_v_hint": "同时量化V状态(打包13x,仅K为4x)。", "modal.model_settings.index_cache": "索引缓存", "modal.model_settings.index_cache_hint": "跳过DSA层中的冗余索引器计算(DeepSeek V3/GLM-5)。", "modal.model_settings.index_cache_freq": "频率(每N层保留索引器)", diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py index e697c3aa..75839847 100644 --- a/omlx/admin/routes.py +++ b/omlx/admin/routes.py @@ -113,6 +113,10 @@ class ModelSettingsRequest(BaseModel): # TurboQuant KV cache (mlx-vlm backend) turboquant_kv_enabled: Optional[bool] = None turboquant_kv_bits: Optional[float] = None + # PlanarQuant3 KV cache + planarquant_kv_enabled: Optional[bool] = None + planarquant_kv_bits: Optional[int] = None + planarquant_quantize_v: Optional[bool] = None # SpecPrefill (experimental) specprefill_enabled: Optional[bool] = None specprefill_draft_model: Optional[str] = None @@ -1375,6 +1379,10 @@ async def list_models(is_admin: bool = Depends(require_admin)): "index_cache_freq": settings.index_cache_freq, "turboquant_kv_enabled": settings.turboquant_kv_enabled, "turboquant_kv_bits": settings.turboquant_kv_bits, + "turboquant_skip_last": settings.turboquant_skip_last, + "planarquant_kv_enabled": settings.planarquant_kv_enabled, + "planarquant_kv_bits": settings.planarquant_kv_bits, + "planarquant_quantize_v": settings.planarquant_quantize_v, "specprefill_enabled": settings.specprefill_enabled, "specprefill_draft_model": settings.specprefill_draft_model, "specprefill_keep_pct": settings.specprefill_keep_pct, @@ -1592,6 +1600,21 @@ async def update_model_settings( 
current_settings.turboquant_kv_enabled = request.turboquant_kv_enabled or False if "turboquant_kv_bits" in sent: current_settings.turboquant_kv_bits = request.turboquant_kv_bits or 4 + # PlanarQuant3 KV cache settings + if "planarquant_kv_enabled" in sent: + current_settings.planarquant_kv_enabled = request.planarquant_kv_enabled or False + if "planarquant_kv_bits" in sent: + current_settings.planarquant_kv_bits = int(request.planarquant_kv_bits or 3) + if "planarquant_quantize_v" in sent: + current_settings.planarquant_quantize_v = ( + True if request.planarquant_quantize_v is None else bool(request.planarquant_quantize_v) + ) + # Mutual exclusion: PQ and TQ patch the same attention dispatch path + if current_settings.planarquant_kv_enabled and current_settings.turboquant_kv_enabled: + logger.warning( + "PlanarQuant3 and TurboQuant are mutually exclusive; disabling TurboQuant." + ) + current_settings.turboquant_kv_enabled = False # SpecPrefill settings if "specprefill_enabled" in sent: current_settings.specprefill_enabled = request.specprefill_enabled or False diff --git a/omlx/admin/static/js/dashboard.js b/omlx/admin/static/js/dashboard.js index 956927e9..a2917db9 100644 --- a/omlx/admin/static/js/dashboard.js +++ b/omlx/admin/static/js/dashboard.js @@ -945,6 +945,9 @@ index_cache_freq: settings.index_cache_freq || null, turboquant_kv_enabled: settings.turboquant_kv_enabled || false, turboquant_kv_bits: settings.turboquant_kv_bits || 4, + planarquant_kv_enabled: settings.planarquant_kv_enabled || false, + planarquant_kv_bits: settings.planarquant_kv_bits || 3, + planarquant_quantize_v: settings.planarquant_quantize_v !== false, specprefill_enabled: settings.specprefill_enabled || false, specprefill_draft_model: settings.specprefill_draft_model || '', specprefill_keep_pct: settings.specprefill_keep_pct ? String(settings.specprefill_keep_pct) : '0.2', @@ -1019,6 +1022,11 @@ turboquant_kv_bits: this.modelSettings.turboquant_kv_enabled ? 
(parseFloat(this.modelSettings.turboquant_kv_bits) || 4) : 4, + planarquant_kv_enabled: this.modelSettings.planarquant_kv_enabled, + planarquant_kv_bits: this.modelSettings.planarquant_kv_enabled + ? (parseInt(this.modelSettings.planarquant_kv_bits) || 3) + : 3, + planarquant_quantize_v: this.modelSettings.planarquant_quantize_v !== false, specprefill_enabled: this.modelSettings.specprefill_enabled, specprefill_draft_model: this.modelSettings.specprefill_draft_model || null, specprefill_keep_pct: this.modelSettings.specprefill_enabled diff --git a/omlx/admin/templates/dashboard/_modal_model_settings.html b/omlx/admin/templates/dashboard/_modal_model_settings.html index da4273ad..243a054a 100644 --- a/omlx/admin/templates/dashboard/_modal_model_settings.html +++ b/omlx/admin/templates/dashboard/_modal_model_settings.html @@ -348,7 +348,7 @@

{{ t('modal.model_settings.turboquant_kv') }}

{{ t('modal.model_settings.turboquant_kv_hint') }}

- + +
+
+ + +
+
+
+ {{ t('modal.model_settings.planarquant_quantize_v') }} +

{{ t('modal.model_settings.planarquant_quantize_v_hint') }}

+
+ +
+
+ +