Skip to content

Commit 2e39d71

Browse files
committed
fix: GPTQ performance for large MoE models
- chunked error compensation: process columns in blocks of 8 with cross-chunk matmul instead of per-column broadcast to all remaining. reduces memory traffic ~5x for expert tensors - expert sub-batching: split 256+ experts into batches of 32 for GPTQ. reduces peak memory ~8x (3GB+ -> ~400MB per projection) while producing identical results. batch size configurable via admin UI - budget plan fallback: when target bpw is unreachable due to sensitivity tier limits, boost non-expert tensors toward 8-bit using remaining budget. fixes oQ2 on large MoE models stopping at 2.68 instead of reaching target 2.8
1 parent 39f1ee2 commit 2e39d71

10 files changed

Lines changed: 231 additions & 59 deletions

File tree

omlx/admin/i18n/en.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "Calibration Dataset",
198198
"models.oq.calib_samples": "Calibration Samples",
199199
"models.oq.calib_seq_length": "Sequence Length",
200+
"models.oq.expert_batch_size": "GPTQ MoE Batch",
201+
"models.oq.expert_batch_none": "off — not recommended, high memory",
202+
"models.oq.expert_batch_default": "default",
203+
"models.oq.expert_batch_slow": "maybe slower, low memory",
204+
"models.oq.expert_batch_desc": "Splits MoE expert weights into smaller batches during GPTQ. Lower values reduce memory usage. Only affects MoE models (64+ experts).",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/ja.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "キャリブレーションデータセット",
198198
"models.oq.calib_samples": "キャリブレーションサンプル",
199199
"models.oq.calib_seq_length": "シーケンス長",
200+
"models.oq.expert_batch_size": "GPTQ MoE バッチ",
201+
"models.oq.expert_batch_none": "オフ — 非推奨、メモリ使用量大",
202+
"models.oq.expert_batch_default": "デフォルト",
203+
"models.oq.expert_batch_slow": "遅くなる可能性あり、メモリ使用量小",
204+
"models.oq.expert_batch_desc": "GPTQ 実行時に MoE expert の重みを小さなバッチに分割して処理します。値が小さいほどメモリ使用量が減ります。MoE モデル(64+ experts)のみに適用されます。",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/ko.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "캘리브레이션 데이터셋",
198198
"models.oq.calib_samples": "캘리브레이션 샘플",
199199
"models.oq.calib_seq_length": "시퀀스 길이",
200+
"models.oq.expert_batch_size": "GPTQ MoE 배치",
201+
"models.oq.expert_batch_none": "비활성화 — 비추천, 메모리 많이 사용",
202+
"models.oq.expert_batch_default": "기본",
203+
"models.oq.expert_batch_slow": "더 느릴 수 있음, 메모리 적게 사용",
204+
"models.oq.expert_batch_desc": "GPTQ 시 MoE expert 가중치를 작은 배치로 분할 처리합니다. 값이 작을수록 메모리 사용량이 줄어듭니다. MoE 모델(64+ experts)에만 적용됩니다.",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/zh-TW.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "校準資料集",
198198
"models.oq.calib_samples": "校準樣本數",
199199
"models.oq.calib_seq_length": "序列長度",
200+
"models.oq.expert_batch_size": "GPTQ MoE 批次",
201+
"models.oq.expert_batch_none": "關閉 — 不建議,記憶體用量高",
202+
"models.oq.expert_batch_default": "預設",
203+
"models.oq.expert_batch_slow": "可能較慢,記憶體用量低",
204+
"models.oq.expert_batch_desc": "在 GPTQ 過程中將 MoE expert 權重拆分為較小的批次處理。數值越低,記憶體用量越少。僅影響 MoE 模型(64+ experts)。",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/zh.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "校准数据集",
198198
"models.oq.calib_samples": "校准样本数",
199199
"models.oq.calib_seq_length": "序列长度",
200+
"models.oq.expert_batch_size": "GPTQ MoE 批次",
201+
"models.oq.expert_batch_none": "关闭 — 不推荐,内存占用高",
202+
"models.oq.expert_batch_default": "默认",
203+
"models.oq.expert_batch_slow": "可能更慢,内存占用低",
204+
"models.oq.expert_batch_desc": "在 GPTQ 过程中将 MoE expert 权重拆分为更小的批次处理。数值越低,内存占用越少。仅影响 MoE 模型(64+ experts)。",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/oq_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class QuantTask:
7878
clip_batch_size: int = 1024
7979
sensitivity_model_path: str = ""
8080
text_only: bool = False
81+
expert_batch_size: int = 32
8182

8283
def to_dict(self) -> dict:
8384
"""Serialize task to JSON-compatible dict."""
@@ -253,6 +254,7 @@ async def start_quantization(
253254
clip_batch_size: int = 1024,
254255
sensitivity_model_path: str = "",
255256
text_only: bool = False,
257+
expert_batch_size: int = 32,
256258
) -> QuantTask:
257259
"""Start a quantization job.
258260
@@ -324,6 +326,7 @@ async def start_quantization(
324326
clip_batch_size=clip_batch_size,
325327
sensitivity_model_path=sensitivity_model_path,
326328
text_only=text_only,
329+
expert_batch_size=expert_batch_size,
327330
)
328331
self._tasks[task_id] = task
329332

@@ -485,6 +488,7 @@ def _progress_cb(phase: str, pct: float) -> None:
485488
None, # target_bpw
486489
None, # hard_cap_bpw
487490
task.sensitivity_model_path,
491+
expert_batch_size=task.expert_batch_size,
488492
)
489493
else:
490494
# Tensor-by-tensor (low memory)

omlx/admin/routes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ class OQStartRequest(BaseModel):
212212
clip_batch_size: int = 1024
213213
sensitivity_model_path: str = ""
214214
text_only: bool = False
215+
expert_batch_size: int = 32
215216

216217

217218
class HFUploadRequest(BaseModel):
@@ -3660,6 +3661,7 @@ async def start_oq_quantization(
36603661
clip_batch_size=request.clip_batch_size,
36613662
sensitivity_model_path=request.sensitivity_model_path,
36623663
text_only=request.text_only,
3664+
expert_batch_size=request.expert_batch_size,
36633665
)
36643666
return {"success": True, "task": task.to_dict()}
36653667
except ValueError as e:

omlx/admin/static/js/dashboard.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@
269269
oqCalibDataset: 'code_multilingual',
270270
oqClipBatchSize: 1024,
271271
oqSensitivityModelPath: '',
272+
oqExpertBatchSize: 32,
272273

273274
// oQ Uploader state
274275
uploadHfToken: localStorage.getItem('omlx-hf-upload-token') || '',
@@ -2685,6 +2686,7 @@
26852686
clip_batch_size: this.oqClipBatchSize,
26862687
sensitivity_model_path: this.oqSensitivityModelPath,
26872688
text_only: this.oqTextOnly,
2689+
expert_batch_size: this.oqExpertBatchSize,
26882690
}),
26892691
});
26902692
const data = await response.json().catch(() => ({}));

omlx/admin/templates/dashboard/_models.html

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,6 +1178,21 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('models.oq.h
11781178
<option value="1024">1024</option>
11791179
</select>
11801180
</div>
1181+
<!-- Expert GPTQ Batch (clip ON only) -->
1182+
<div x-show="oqEnableClip" x-cloak class="space-y-1">
1183+
<div class="flex items-center justify-between">
1184+
<span class="text-sm text-neutral-600">{{ t('models.oq.expert_batch_size') }}</span>
1185+
<select x-model.number="oqExpertBatchSize"
1186+
class="px-3 py-1.5 bg-white border border-neutral-300 rounded-lg text-sm text-neutral-900 focus:outline-none focus:ring-2 focus:ring-neutral-900 focus:border-transparent">
1187+
<option value="0">{{ t('models.oq.expert_batch_none') }}</option>
1188+
<option value="32" selected>32 ({{ t('models.oq.expert_batch_default') }})</option>
1189+
<option value="16">16 ({{ t('models.oq.expert_batch_slow') }})</option>
1190+
</select>
1191+
</div>
1192+
<p class="text-[11px] text-neutral-400 leading-relaxed">
1193+
{{ t('models.oq.expert_batch_desc') }}
1194+
</p>
1195+
</div>
11811196
</div>
11821197
</div>
11831198
<!-- Error -->

0 commit comments

Comments
 (0)