Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions omlx/admin/i18n/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@
"modal.model_settings.turboquant_kv": "TurboQuant KV Cache",
"modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Lower bits = more compression, higher bits = better quality. Supports 2 to 8 bits.",
"modal.model_settings.turboquant_kv_bits_label": "Bits per channel",
"modal.model_settings.planarquant_kv": "PlanarQuant3 KV Cache",
"modal.model_settings.planarquant_kv_hint": "Compress KV cache with 2D Givens rotation + 3-bit Lloyd-Max quantization. Mutually exclusive with TurboQuant.",
"modal.model_settings.planarquant_kv_bits_label": "Quantization bits",
"modal.model_settings.planarquant_quantize_v": "Quantize V",
"modal.model_settings.planarquant_quantize_v_hint": "Also quantize the V state (13x packed, versus 4x when only K is quantized).",
"modal.model_settings.index_cache": "Index Cache",
"modal.model_settings.index_cache_hint": "Skip redundant indexer computation in DSA layers (DeepSeek V3/GLM-5).",
"modal.model_settings.index_cache_freq": "Frequency (every Nth layer keeps indexer)",
Expand Down
5 changes: 5 additions & 0 deletions omlx/admin/i18n/ja.json
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@
"modal.model_settings.turboquant_kv": "TurboQuant KV Cache",
"modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Reduces memory ~60-75% with near-lossless quality for long context.",
"modal.model_settings.turboquant_kv_bits_label": "Bits per channel",
"modal.model_settings.planarquant_kv": "PlanarQuant3 KVキャッシュ",
"modal.model_settings.planarquant_kv_hint": "2D Givens回転 + 3ビットLloyd-Max量子化でKVキャッシュを圧縮。TurboQuantとは相互排他。",
"modal.model_settings.planarquant_kv_bits_label": "量子化ビット",
"modal.model_settings.planarquant_quantize_v": "V量子化",
"modal.model_settings.planarquant_quantize_v_hint": "V状態も量子化します(Kのみ4xに対しパック13x)。",
"modal.model_settings.index_cache": "インデックスキャッシュ",
"modal.model_settings.index_cache_hint": "DSAレイヤーで冗長なインデクサー計算をスキップします(DeepSeek V3/GLM-5)。",
"modal.model_settings.index_cache_freq": "頻度(N番目のレイヤーごとにインデクサーを保持)",
Expand Down
5 changes: 5 additions & 0 deletions omlx/admin/i18n/ko.json
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@
"modal.model_settings.turboquant_kv": "TurboQuant KV Cache",
"modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Reduces memory ~60-75% with near-lossless quality for long context.",
"modal.model_settings.turboquant_kv_bits_label": "Bits per channel",
"modal.model_settings.planarquant_kv": "PlanarQuant3 KV 캐시",
"modal.model_settings.planarquant_kv_hint": "2D Givens 회전 + 3비트 Lloyd-Max 양자화로 KV 캐시를 압축합니다. TurboQuant와 상호 배타적입니다.",
"modal.model_settings.planarquant_kv_bits_label": "양자화 비트",
"modal.model_settings.planarquant_quantize_v": "V 양자화",
"modal.model_settings.planarquant_quantize_v_hint": "V 상태도 양자화합니다 (K 전용 4x 대비 패킹 13x).",
"modal.model_settings.index_cache": "인덱스 캐시",
"modal.model_settings.index_cache_hint": "DSA 레이어에서 중복 인덱서 연산을 건너뜁니다 (DeepSeek V3/GLM-5).",
"modal.model_settings.index_cache_freq": "빈도 (N번째 레이어마다 인덱서 유지)",
Expand Down
5 changes: 5 additions & 0 deletions omlx/admin/i18n/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@
"modal.model_settings.turboquant_kv": "TurboQuant KV Cache",
"modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Reduces memory ~60-75% with near-lossless quality for long context.",
"modal.model_settings.turboquant_kv_bits_label": "Bits per channel",
"modal.model_settings.planarquant_kv": "PlanarQuant3 KV 快取",
"modal.model_settings.planarquant_kv_hint": "使用2D Givens旋轉 + 3位元Lloyd-Max量化壓縮KV快取。與TurboQuant互斥。",
"modal.model_settings.planarquant_kv_bits_label": "量化位元",
"modal.model_settings.planarquant_quantize_v": "量化V",
"modal.model_settings.planarquant_quantize_v_hint": "同時量化V狀態(打包13x,僅K為4x)。",
"modal.model_settings.index_cache": "索引快取",
"modal.model_settings.index_cache_hint": "跳過DSA層中冗餘的索引器運算(DeepSeek V3/GLM-5)。",
"modal.model_settings.index_cache_freq": "頻率(每N層保留索引器)",
Expand Down
5 changes: 5 additions & 0 deletions omlx/admin/i18n/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@
"modal.model_settings.turboquant_kv": "TurboQuant KV Cache",
"modal.model_settings.turboquant_kv_hint": "Compress KV cache using vector quantization. Reduces memory ~60-75% with near-lossless quality for long context.",
"modal.model_settings.turboquant_kv_bits_label": "Bits per channel",
"modal.model_settings.planarquant_kv": "PlanarQuant3 KV 缓存",
"modal.model_settings.planarquant_kv_hint": "使用2D Givens旋转 + 3位Lloyd-Max量化压缩KV缓存。与TurboQuant互斥。",
"modal.model_settings.planarquant_kv_bits_label": "量化位数",
"modal.model_settings.planarquant_quantize_v": "量化V",
"modal.model_settings.planarquant_quantize_v_hint": "同时量化V状态(打包13x,仅K为4x)。",
"modal.model_settings.index_cache": "索引缓存",
"modal.model_settings.index_cache_hint": "跳过DSA层中的冗余索引器计算(DeepSeek V3/GLM-5)。",
"modal.model_settings.index_cache_freq": "频率(每N层保留索引器)",
Expand Down
23 changes: 23 additions & 0 deletions omlx/admin/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ class ModelSettingsRequest(BaseModel):
# TurboQuant KV cache (mlx-vlm backend)
turboquant_kv_enabled: Optional[bool] = None
turboquant_kv_bits: Optional[float] = None
# PlanarQuant3 KV cache
planarquant_kv_enabled: Optional[bool] = None
planarquant_kv_bits: Optional[int] = None
planarquant_quantize_v: Optional[bool] = None
# SpecPrefill (experimental)
specprefill_enabled: Optional[bool] = None
specprefill_draft_model: Optional[str] = None
Expand Down Expand Up @@ -1375,6 +1379,10 @@ async def list_models(is_admin: bool = Depends(require_admin)):
"index_cache_freq": settings.index_cache_freq,
"turboquant_kv_enabled": settings.turboquant_kv_enabled,
"turboquant_kv_bits": settings.turboquant_kv_bits,
"turboquant_skip_last": settings.turboquant_skip_last,
"planarquant_kv_enabled": settings.planarquant_kv_enabled,
"planarquant_kv_bits": settings.planarquant_kv_bits,
"planarquant_quantize_v": settings.planarquant_quantize_v,
"specprefill_enabled": settings.specprefill_enabled,
"specprefill_draft_model": settings.specprefill_draft_model,
"specprefill_keep_pct": settings.specprefill_keep_pct,
Expand Down Expand Up @@ -1592,6 +1600,21 @@ async def update_model_settings(
current_settings.turboquant_kv_enabled = request.turboquant_kv_enabled or False
if "turboquant_kv_bits" in sent:
current_settings.turboquant_kv_bits = request.turboquant_kv_bits or 4
# PlanarQuant3 KV cache settings
if "planarquant_kv_enabled" in sent:
current_settings.planarquant_kv_enabled = request.planarquant_kv_enabled or False
if "planarquant_kv_bits" in sent:
current_settings.planarquant_kv_bits = int(request.planarquant_kv_bits or 3)
if "planarquant_quantize_v" in sent:
current_settings.planarquant_quantize_v = (
True if request.planarquant_quantize_v is None else bool(request.planarquant_quantize_v)
)
# Mutual exclusion: PQ and TQ patch the same attention dispatch path
if current_settings.planarquant_kv_enabled and current_settings.turboquant_kv_enabled:
logger.warning(
"PlanarQuant3 and TurboQuant are mutually exclusive; disabling TurboQuant."
)
current_settings.turboquant_kv_enabled = False
# SpecPrefill settings
if "specprefill_enabled" in sent:
current_settings.specprefill_enabled = request.specprefill_enabled or False
Expand Down
8 changes: 8 additions & 0 deletions omlx/admin/static/js/dashboard.js
Original file line number Diff line number Diff line change
Expand Up @@ -945,6 +945,9 @@
index_cache_freq: settings.index_cache_freq || null,
turboquant_kv_enabled: settings.turboquant_kv_enabled || false,
turboquant_kv_bits: settings.turboquant_kv_bits || 4,
planarquant_kv_enabled: settings.planarquant_kv_enabled || false,
planarquant_kv_bits: settings.planarquant_kv_bits || 3,
planarquant_quantize_v: settings.planarquant_quantize_v !== false,
specprefill_enabled: settings.specprefill_enabled || false,
specprefill_draft_model: settings.specprefill_draft_model || '',
specprefill_keep_pct: settings.specprefill_keep_pct ? String(settings.specprefill_keep_pct) : '0.2',
Expand Down Expand Up @@ -1019,6 +1022,11 @@
turboquant_kv_bits: this.modelSettings.turboquant_kv_enabled
? (parseFloat(this.modelSettings.turboquant_kv_bits) || 4)
: 4,
planarquant_kv_enabled: this.modelSettings.planarquant_kv_enabled,
planarquant_kv_bits: this.modelSettings.planarquant_kv_enabled
? (parseInt(this.modelSettings.planarquant_kv_bits) || 3)
: 3,
planarquant_quantize_v: this.modelSettings.planarquant_quantize_v !== false,
specprefill_enabled: this.modelSettings.specprefill_enabled,
specprefill_draft_model: this.modelSettings.specprefill_draft_model || null,
specprefill_keep_pct: this.modelSettings.specprefill_enabled
Expand Down
39 changes: 38 additions & 1 deletion omlx/admin/templates/dashboard/_modal_model_settings.html
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ <h4 class="text-xs font-bold uppercase tracking-widest text-neutral-400 mb-3">{{
<span class="text-sm font-medium text-neutral-700">{{ t('modal.model_settings.turboquant_kv') }}</span>
<p class="text-xs text-neutral-500 mt-0.5">{{ t('modal.model_settings.turboquant_kv_hint') }}</p>
</div>
<button type="button" @click="modelSettings.turboquant_kv_enabled = !modelSettings.turboquant_kv_enabled"
<button type="button" @click="modelSettings.turboquant_kv_enabled = !modelSettings.turboquant_kv_enabled; if (modelSettings.turboquant_kv_enabled) modelSettings.planarquant_kv_enabled = false;"
:class="modelSettings.turboquant_kv_enabled ? 'bg-black' : 'bg-neutral-200'"
class="relative flex-shrink-0 w-11 h-6 mt-0.5 rounded-full transition-colors duration-300 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-black">
<span :class="modelSettings.turboquant_kv_enabled ? 'translate-x-5' : 'translate-x-0'"
Expand All @@ -370,6 +370,43 @@ <h4 class="text-xs font-bold uppercase tracking-widest text-neutral-400 mb-3">{{
</div>
</div>

<!-- PlanarQuant3 KV Cache -->
<!-- Settings card: a master enable toggle, plus a bits selector and a "Quantize V" toggle that are only shown while the master toggle is on. -->
<div class="p-4 bg-neutral-50 rounded-xl space-y-3 mb-3">
<!-- Header row: localized title and hint on the left, enable switch on the right. -->
<div class="flex items-start justify-between gap-3">
<div class="min-w-0">
<span class="text-sm font-medium text-neutral-700">{{ t('modal.model_settings.planarquant_kv') }}</span>
<p class="text-xs text-neutral-500 mt-0.5">{{ t('modal.model_settings.planarquant_kv_hint') }}</p>
</div>
<!-- Enable switch: flips planarquant_kv_enabled and, when it ends up on, forces turboquant_kv_enabled off so both are never enabled together in the form state. -->
<button type="button" @click="modelSettings.planarquant_kv_enabled = !modelSettings.planarquant_kv_enabled; if (modelSettings.planarquant_kv_enabled) modelSettings.turboquant_kv_enabled = false;"
:class="modelSettings.planarquant_kv_enabled ? 'bg-black' : 'bg-neutral-200'"
class="relative flex-shrink-0 w-11 h-6 mt-0.5 rounded-full transition-colors duration-300 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-black">
<span :class="modelSettings.planarquant_kv_enabled ? 'translate-x-5' : 'translate-x-0'"
class="block w-5 h-5 bg-white rounded-full shadow-sm transform transition-transform duration-300 absolute top-0.5 left-0.5"></span>
</button>
</div>
<!-- Sub-settings: rendered only while planarquant_kv_enabled is truthy. -->
<div x-show="modelSettings.planarquant_kv_enabled" x-transition class="pt-1 space-y-3">
<div>
<label class="block text-xs font-bold uppercase tracking-wider text-neutral-500 mb-2">{{ t('modal.model_settings.planarquant_kv_bits_label') }}</label>
<!-- Bits selector: bound as a number to planarquant_kv_bits; 3-bit is the only option offered here. -->
<select x-model.number="modelSettings.planarquant_kv_bits"
class="w-full px-4 py-2.5 border border-neutral-200 rounded-xl text-sm focus:ring-2 focus:ring-neutral-900 focus:border-transparent transition-all bg-white">
<option :value="3">3-bit</option>
</select>
</div>
<!-- "Quantize V" row: localized label and hint on the left, switch on the right. -->
<div class="flex items-center justify-between gap-3">
<div class="min-w-0">
<span class="text-sm font-medium text-neutral-700">{{ t('modal.model_settings.planarquant_quantize_v') }}</span>
<p class="text-xs text-neutral-500 mt-0.5">{{ t('modal.model_settings.planarquant_quantize_v_hint') }}</p>
</div>
<!-- Switch: simply inverts planarquant_quantize_v; no other setting is touched. -->
<button type="button" @click="modelSettings.planarquant_quantize_v = !modelSettings.planarquant_quantize_v"
:class="modelSettings.planarquant_quantize_v ? 'bg-black' : 'bg-neutral-200'"
class="relative flex-shrink-0 w-11 h-6 mt-0.5 rounded-full transition-colors duration-300 focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-black">
<span :class="modelSettings.planarquant_quantize_v ? 'translate-x-5' : 'translate-x-0'"
class="block w-5 h-5 bg-white rounded-full shadow-sm transform transition-transform duration-300 absolute top-0.5 left-0.5"></span>
</button>
</div>
</div>
</div>

<!-- IndexCache (DSA models only) -->
<template x-if="DSA_MODEL_TYPES.has(selectedModel?.config_model_type || '')">
<div class="p-4 bg-neutral-50 rounded-xl space-y-3 mb-3">
Expand Down
33 changes: 33 additions & 0 deletions omlx/cache/planarquant/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: Apache-2.0
"""PlanarQuant3 KV cache — 2D Givens rotation + 3-bit Lloyd-Max quantization.

Port of the llama.cpp fork feature/planarquant-kv-cache branch to MLX.
Upstream reference: https://github.com/scrya-com/rotorquant (MIT)
Bit-exact source: https://github.com/johndpope/llama-cpp-turboquant
"""

from .constants import (
PLANAR_BITS,
PLANAR_CENTROIDS_3BIT,
PLANAR_COS_64,
PLANAR_D,
PLANAR_PAIRS,
PLANAR_SIN_64,
centroids_mx,
cos_sin_mx,
)
from .reference import dequantize_block, quantize_block, roundtrip

# Public API of this package: exactly the names imported from the
# .constants and .reference submodules at the top of this file.
__all__ = [
# Imported from .constants
"PLANAR_D",
"PLANAR_PAIRS",
"PLANAR_BITS",
"PLANAR_CENTROIDS_3BIT",
"PLANAR_COS_64",
"PLANAR_SIN_64",
"centroids_mx",
"cos_sin_mx",
# Imported from .reference
"quantize_block",
"dequantize_block",
"roundtrip",
]
Loading