From add79c662cc7a88a27b8400e9df048166e6a0f66 Mon Sep 17 00:00:00 2001 From: Yifan Jiang <19356972+yifjiang@users.noreply.github.com> Date: Sun, 14 Jun 2026 11:37:31 -0700 Subject: [PATCH] [Quantization] Support NVFP4 for inline-swiglu fused MoE experts (MiniMax-M3) MiniMaxM3VLExperts is a standard transformers 5.x fused-experts container (3-D gate_up_proj/down_proj + num_experts) but applies SwiGLU inline and has no act_fn submodule, so _is_fused_experts_module returned False -> the experts were never wrapped -> nvfp4_experts_only enabled zero expert quantizers and export raised NotImplementedError("...experts type 'MiniMaxM3VLExperts'..."). Drop the act_fn requirement from the detector. _QuantFusedExperts only intercepts F.linear and never reads act_fn, and _export_fused_experts is weight-only, so no export change is needed once detection wraps the experts. Models needing custom forwards (Llama4, GptOss, DBRX, Qwen3-VL-MoE) remain excluded earlier via their explicit registrations. Flip the now-incorrect test_module_missing_act_fn test and add an inline-SwiGLU synthetic experts detection + calibration test. Add a CHANGELOG entry and a MiniMax M3 row to the llm_ptq support matrix. Validated end-to-end on GB200: 14,592 expert weight quantizers enabled, 260 GB NVFP4 checkpoint, wikitext-2 perplexity 5.083 -> 5.420 (+6.6%). 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Yifan Jiang <19356972+yifjiang@users.noreply.github.com> --- CHANGELOG.rst | 1 + examples/llm_ptq/README.md | 4 +- .../torch/quantization/plugins/huggingface.py | 13 ++- .../plugins/test_fused_experts.py | 96 ++++++++++++++++++- 4 files changed, 107 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ad0d4acdfac..8ad1aaf5780 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Changelog **New Features** - Add the ``day0-release`` agent skill (``.agents/skills/day0-release/``), a deterministic end-to-end driver that chains the PTQ → evaluation → comparison skills (the evaluation stage deploys the checkpoint itself) with an enforced gate after each stage and returns a publish decision (ACCEPT / REGRESSION / ANOMALOUS / INFEASIBLE). Ships three GPU-free, unit-tested gate scripts (``gate_ptq.py``, ``gate_run.py``, ``gate_compare.py``) that validate checkpoint coverage, evaluation-run completeness, and baseline-vs-candidate accuracy threshold. v1 reports and stops on regression; the recipe-search loop is deferred. +- Add NVFP4 quantization support for MiniMax-M3 (``minimax_m3_vl``, a ~428B MoE VLM). Its routed-experts container ``MiniMaxM3VLExperts`` follows the standard transformers 5.x fused-experts pattern (3-D ``gate_up_proj``/``down_proj`` + ``num_experts``) but applies SwiGLU inline rather than via an ``act_fn`` submodule; ``_is_fused_experts_module`` no longer requires ``act_fn`` (``_QuantFusedExperts`` never reads it), so these experts are wrapped as ``_QuantFusedExperts`` and calibrate/export through the existing fused path. Quantize with ``--qformat nvfp4_experts_only``; load via transformers >=5.12 native support (no ``trust_remote_code``). 
- Add **streaming** speculative-decoding training (EAGLE3 / DFlash): the draft trains on base-model hidden states produced on the fly by a co-located ``vllm serve`` (no disk dump), moved trainer-side over NIXL RDMA, scaling to multi-node (dedicated serve replicas + DDP trainers). New launcher examples for NVFP4 Kimi-K2.5 / K2.6 on GB200/aarch64 under ``tools/launcher/examples/moonshotai/``. 0.45 (2026-06-xx) diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 64ef6deaa01..a7473b5f7ad 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -114,6 +114,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http | GLM-4.7<sup>8</sup> | ✅ | - | - | - | ✅ | | Kimi K2 | - | - | - | - | ✅ | | MiniMax M2.1 | - | - | - | - | ✅ | +| MiniMax M3<sup>11</sup> | - | - | - | - | ✅ | | GPT-OSS<sup>10</sup> | - | - | - | - | ✅ | | T5 | ✅ | ✅ | ✅ | ✅ | - | | Whisper<sup>9</sup> | ✅ | ❌ | ❌ | ❌ | - | @@ -130,7 +131,8 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *<sup>7.</sup>[PTQ for DeepSeek](../deepseek/README.md)* \ > *<sup>8.</sup>GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* \ > *<sup>9.</sup>Running Whisper model with transformers>=5.0 requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. ffmpeg).* \ -> *<sup>10.</sup>GPT-OSS ships with native MXFP4 weights; NVFP4 export is produced via the closed-form `--cast_mxfp4_to_nvfp4` cast (see [MXFP4 → NVFP4 cast](#mxfp4--nvfp4-cast-for-gpt-oss)).* +> *<sup>10.</sup>GPT-OSS ships with native MXFP4 weights; NVFP4 export is produced via the closed-form `--cast_mxfp4_to_nvfp4` cast (see [MXFP4 → NVFP4 cast](#mxfp4--nvfp4-cast-for-gpt-oss)).* \ +> *<sup>11.</sup>MiniMax M3 (`minimax_m3_vl`) requires transformers >=5.12 (native support); load without `trust_remote_code`. 
The recommended recipe is `nvfp4_experts_only` (routed experts are quantized to NVFP4; attention, dense layers, shared experts, vision tower, router/gate, embeddings, and lm_head are kept at higher precision).* > *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead. For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only`, `nvfp4_experts_only`, or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP/expert layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.* diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 631226dd090..73f4a45fd08 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -1442,17 +1442,22 @@ def _is_fused_experts_module(module): """Check if a module is a fused MoE expert container compatible with _QuantFusedExperts. Detects the standardized HuggingFace transformers 5.0+ fused expert pattern: - ``gate_up_proj`` (3-D parameter), ``down_proj`` (3-D parameter), ``num_experts``, - and ``act_fn``. Matches ``MixtralExperts``, ``Qwen2MoeExperts``, + ``gate_up_proj`` (3-D parameter), ``down_proj`` (3-D parameter), and + ``num_experts``. Matches ``MixtralExperts``, ``Qwen2MoeExperts``, ``Qwen3MoeExperts``, ``Qwen3_5MoeExperts``, ``DeepseekV3NaiveMoe``, - ``JambaExperts``, ``OlmoeExperts``, etc. + ``JambaExperts``, ``OlmoeExperts``, ``MiniMaxM3VLExperts``, etc. + + ``act_fn`` is intentionally NOT required: some fused-expert containers (e.g. 
+ ``MiniMaxM3VLExperts``) apply their gating activation inline rather than via an + ``act_fn`` submodule. ``_QuantFusedExperts`` never reads ``act_fn`` (it only + intercepts ``F.linear``), so the activation form is irrelevant to detection. Returns ``False`` for non-standard layouts (DBRX, GptOss, GraniteMoE, Llama4TextExperts) which have their own explicit registrations. """ if not hasattr(module, "gate_up_proj") or not hasattr(module, "down_proj"): return False - if not hasattr(module, "num_experts") or not hasattr(module, "act_fn"): + if not hasattr(module, "num_experts"): return False gate_up = getattr(module, "gate_up_proj") down = getattr(module, "down_proj") diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index ce23f7a51d5..aa242c7b001 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -84,6 +84,37 @@ def forward(self, hidden_states, top_k_index, top_k_weights): return final_hidden_states +class _SyntheticFusedExpertsInlineSwiglu(_SyntheticFusedExperts): + """Fused experts that apply SwiGLU inline (no ``act_fn`` submodule), mimicking + transformers' ``MiniMaxM3VLExperts``. 
Verifies detection/quantization do not require ``act_fn``.""" + + def __init__(self): + super().__init__() + del self.act_fn # gating activation is applied inline in forward, not via a submodule + + def forward(self, hidden_states, top_k_index, top_k_weights): + final_hidden_states = torch.zeros_like(hidden_states) + with torch.no_grad(): + expert_mask = F.one_hot(top_k_index, num_classes=self.num_experts).permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit: + expert_idx = expert_idx[0] + if expert_idx == self.num_experts: + continue + top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) + current_state = hidden_states[token_idx] + gate, up = F.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1) + current_hidden_states = F.silu(gate) * up # inline swiglu (no self.act_fn) + current_hidden_states = F.linear(current_hidden_states, self.down_proj[expert_idx]) + current_hidden_states = ( + current_hidden_states * top_k_weights[token_idx, top_k_pos, None] + ) + final_hidden_states.index_add_( + 0, token_idx, current_hidden_states.to(final_hidden_states.dtype) + ) + return final_hidden_states + + class _SyntheticTopKRouter(nn.Module): def __init__(self): super().__init__() @@ -145,12 +176,23 @@ def test_module_with_2d_gate_up_not_detected(self): module.act_fn = nn.SiLU() assert _is_fused_experts_module(module) is False - def test_module_missing_act_fn_not_detected(self): + def test_module_missing_act_fn_still_detected(self): + """``act_fn`` is optional: e.g. ``MiniMaxM3VLExperts`` applies swiglu inline. + + ``_QuantFusedExperts`` only intercepts ``F.linear`` and never reads ``act_fn``, + so the structural detector must not require it. 
+ """ module = nn.Module() module.gate_up_proj = nn.Parameter(torch.randn(4, 16, 8)) module.down_proj = nn.Parameter(torch.randn(4, 8, 16)) module.num_experts = 4 - assert _is_fused_experts_module(module) is False + assert _is_fused_experts_module(module) is True + + def test_inline_swiglu_fused_experts_detected(self): + """Fused experts applying swiglu inline (no ``act_fn`` submodule) are detected.""" + module = _SyntheticFusedExpertsInlineSwiglu() + assert not hasattr(module, "act_fn") + assert _is_fused_experts_module(module) is True def test_sparse_moe_block_not_detected_as_fused(self): block = _SyntheticSparseMoeBlock() @@ -652,6 +694,56 @@ def forward_loop(m): self._cleanup_registry(expert_type) + def test_inline_swiglu_experts_calibrate(self): + """No-``act_fn`` (inline swiglu) fused experts convert and calibrate like ``act_fn`` ones. + + Regression for ``MiniMaxM3VLExperts``: detection used to require ``act_fn``, so these + experts were never wrapped and no quantizers were inserted. 
+ """ + model = _TinyMoEModel() + model.moe.experts = _SyntheticFusedExpertsInlineSwiglu() + expert_type = type(model.moe.experts) + self._cleanup_registry(expert_type) + + quant_cfg = { + "quant_cfg": [ + {"quantizer_name": "*", "enable": False}, + { + "quantizer_name": "*gate_up_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*down_proj_input_quantizer", + "cfg": {"num_bits": 8, "axis": None}, + }, + { + "quantizer_name": "*gate_up_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + { + "quantizer_name": "*down_proj_weight_quantizer", + "cfg": {"num_bits": 8, "axis": 0}, + }, + ], + "algorithm": "max", + } + + def forward_loop(m): + torch.manual_seed(0) + for _ in range(2): + m(torch.randn(1, 4, HIDDEN_DIM)) + + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + + experts = model.moe.experts + assert experts.gate_up_proj_input_quantizer.amax is not None + assert experts.down_proj_input_quantizer.amax is not None + for idx in range(NUM_EXPERTS): + assert experts.gate_up_proj_weight_quantizers[idx].amax is not None + assert experts.down_proj_weight_quantizers[idx].amax is not None + + self._cleanup_registry(expert_type) + def test_local_hessian_refines_per_expert_weights(self): """local_hessian captures each expert's routed activations and refines its weight amax.""" model = _TinyMoEModel()