Skip to content

Commit 2e39d71

Browse files
committed
fix: GPTQ performance for large MoE models
- chunked error compensation: process columns in blocks of 8 with cross-chunk matmul instead of per-column broadcast to all remaining. reduces memory traffic ~5x for expert tensors - expert sub-batching: split 256+ experts into batches of 32 for GPTQ. reduces peak memory ~8x (3GB+ -> ~400MB per projection) while producing identical results. batch size configurable via admin UI - budget plan fallback: when target bpw is unreachable due to sensitivity tier limits, boost non-expert tensors toward 8-bit using remaining budget. fixes oQ2 on large MoE models stopping at 2.68 instead of reaching target 2.8
1 parent 39f1ee2 commit 2e39d71

10 files changed

Lines changed: 231 additions & 59 deletions

File tree

omlx/admin/i18n/en.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "Calibration Dataset",
198198
"models.oq.calib_samples": "Calibration Samples",
199199
"models.oq.calib_seq_length": "Sequence Length",
200+
"models.oq.expert_batch_size": "GPTQ MoE Batch",
201+
"models.oq.expert_batch_none": "off — not recommended, high memory",
202+
"models.oq.expert_batch_default": "default",
203+
"models.oq.expert_batch_slow": "maybe slower, low memory",
204+
"models.oq.expert_batch_desc": "Splits MoE expert weights into smaller batches during GPTQ. Lower values reduce memory usage. Only affects MoE models (64+ experts).",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/ja.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "キャリブレーションデータセット",
198198
"models.oq.calib_samples": "キャリブレーションサンプル",
199199
"models.oq.calib_seq_length": "シーケンス長",
200+
"models.oq.expert_batch_size": "GPTQ MoE バッチ",
201+
"models.oq.expert_batch_none": "オフ — 非推奨、メモリ使用量大",
202+
"models.oq.expert_batch_default": "デフォルト",
203+
"models.oq.expert_batch_slow": "遅くなる可能性あり、メモリ使用量小",
204+
"models.oq.expert_batch_desc": "GPTQ 実行時に MoE expert の重みを小さなバッチに分割して処理します。値が小さいほどメモリ使用量が減ります。MoE モデル(64+ experts)のみに適用されます。",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/ko.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "캘리브레이션 데이터셋",
198198
"models.oq.calib_samples": "캘리브레이션 샘플",
199199
"models.oq.calib_seq_length": "시퀀스 길이",
200+
"models.oq.expert_batch_size": "GPTQ MoE 배치",
201+
"models.oq.expert_batch_none": "비활성화 — 비추천, 메모리 많이 사용",
202+
"models.oq.expert_batch_default": "기본",
203+
"models.oq.expert_batch_slow": "더 느릴 수 있음, 메모리 적게 사용",
204+
"models.oq.expert_batch_desc": "GPTQ 시 MoE expert 가중치를 작은 배치로 분할 처리합니다. 값이 작을수록 메모리 사용량이 줄어듭니다. MoE 모델(64+ experts)에만 적용됩니다.",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/zh-TW.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "校準資料集",
198198
"models.oq.calib_samples": "校準樣本數",
199199
"models.oq.calib_seq_length": "序列長度",
200+
"models.oq.expert_batch_size": "GPTQ MoE 批次",
201+
"models.oq.expert_batch_none": "關閉 — 不建議,記憶體用量高",
202+
"models.oq.expert_batch_default": "預設",
203+
"models.oq.expert_batch_slow": "可能較慢,記憶體用量低",
204+
"models.oq.expert_batch_desc": "在 GPTQ 過程中將 MoE expert 權重拆分為較小的批次處理。數值越低,記憶體用量越少。僅影響 MoE 模型(64+ experts)。",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/i18n/zh.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@
197197
"models.oq.calib_dataset": "校准数据集",
198198
"models.oq.calib_samples": "校准样本数",
199199
"models.oq.calib_seq_length": "序列长度",
200+
"models.oq.expert_batch_size": "GPTQ MoE 批次",
201+
"models.oq.expert_batch_none": "关闭 — 不推荐,内存占用高",
202+
"models.oq.expert_batch_default": "默认",
203+
"models.oq.expert_batch_slow": "可能更慢,内存占用低",
204+
"models.oq.expert_batch_desc": "在 GPTQ 过程中将 MoE expert 权重拆分为更小的批次处理。数值越低,内存占用越少。仅影响 MoE 模型(64+ experts)。",
200205
"models.uploader.section_label": "Hub Upload",
201206
"models.uploader.heading": "Upload oQ Models",
202207
"models.uploader.description": "Upload your locally quantized oQ models to HuggingFace Hub.",

omlx/admin/oq_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ class QuantTask:
7878
clip_batch_size: int = 1024
7979
sensitivity_model_path: str = ""
8080
text_only: bool = False
81+
expert_batch_size: int = 32
8182

8283
def to_dict(self) -> dict:
8384
"""Serialize task to JSON-compatible dict."""
@@ -253,6 +254,7 @@ async def start_quantization(
253254
clip_batch_size: int = 1024,
254255
sensitivity_model_path: str = "",
255256
text_only: bool = False,
257+
expert_batch_size: int = 32,
256258
) -> QuantTask:
257259
"""Start a quantization job.
258260
@@ -324,6 +326,7 @@ async def start_quantization(
324326
clip_batch_size=clip_batch_size,
325327
sensitivity_model_path=sensitivity_model_path,
326328
text_only=text_only,
329+
expert_batch_size=expert_batch_size,
327330
)
328331
self._tasks[task_id] = task
329332

@@ -485,6 +488,7 @@ def _progress_cb(phase: str, pct: float) -> None:
485488
None, # target_bpw
486489
None, # hard_cap_bpw
487490
task.sensitivity_model_path,
491+
expert_batch_size=task.expert_batch_size,
488492
)
489493
else:
490494
# Tensor-by-tensor (low memory)

omlx/admin/routes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ class OQStartRequest(BaseModel):
212212
clip_batch_size: int = 1024
213213
sensitivity_model_path: str = ""
214214
text_only: bool = False
215+
expert_batch_size: int = 32
215216

216217

217218
class HFUploadRequest(BaseModel):
@@ -3660,6 +3661,7 @@ async def start_oq_quantization(
36603661
clip_batch_size=request.clip_batch_size,
36613662
sensitivity_model_path=request.sensitivity_model_path,
36623663
text_only=request.text_only,
3664+
expert_batch_size=request.expert_batch_size,
36633665
)
36643666
return {"success": True, "task": task.to_dict()}
36653667
except ValueError as e:

omlx/admin/static/js/dashboard.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@
269269
oqCalibDataset: 'code_multilingual',
270270
oqClipBatchSize: 1024,
271271
oqSensitivityModelPath: '',
272+
oqExpertBatchSize: 32,
272273

273274
// oQ Uploader state
274275
uploadHfToken: localStorage.getItem('omlx-hf-upload-token') || '',
@@ -2685,6 +2686,7 @@
26852686
clip_batch_size: this.oqClipBatchSize,
26862687
sensitivity_model_path: this.oqSensitivityModelPath,
26872688
text_only: this.oqTextOnly,
2689+
expert_batch_size: this.oqExpertBatchSize,
26882690
}),
26892691
});
26902692
const data = await response.json().catch(() => ({}));

omlx/admin/templates/dashboard/_models.html

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,6 +1178,21 @@ <h3 class="text-2xl font-bold tracking-tight text-neutral-900">{{ t('models.oq.h
11781178
<option value="1024">1024</option>
11791179
</select>
11801180
</div>
1181+
<!-- Expert GPTQ Batch (clip ON only) -->
1182+
<div x-show="oqEnableClip" x-cloak class="space-y-1">
1183+
<div class="flex items-center justify-between">
1184+
<span class="text-sm text-neutral-600">{{ t('models.oq.expert_batch_size') }}</span>
1185+
<select x-model.number="oqExpertBatchSize"
1186+
class="px-3 py-1.5 bg-white border border-neutral-300 rounded-lg text-sm text-neutral-900 focus:outline-none focus:ring-2 focus:ring-neutral-900 focus:border-transparent">
1187+
<option value="0">{{ t('models.oq.expert_batch_none') }}</option>
1188+
<option value="32" selected>32 ({{ t('models.oq.expert_batch_default') }})</option>
1189+
<option value="16">16 ({{ t('models.oq.expert_batch_slow') }})</option>
1190+
</select>
1191+
</div>
1192+
<p class="text-[11px] text-neutral-400 leading-relaxed">
1193+
{{ t('models.oq.expert_batch_desc') }}
1194+
</p>
1195+
</div>
11811196
</div>
11821197
</div>
11831198
<!-- Error -->

0 commit comments

Comments
 (0)