Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Changelog
**New Features**

- Add the ``day0-release`` agent skill (``.agents/skills/day0-release/``), a deterministic end-to-end driver that chains the PTQ → evaluation → comparison skills (the evaluation stage deploys the checkpoint itself) with an enforced gate after each stage and returns a publish decision (ACCEPT / REGRESSION / ANOMALOUS / INFEASIBLE). Ships three GPU-free, unit-tested gate scripts (``gate_ptq.py``, ``gate_run.py``, ``gate_compare.py``) that validate checkpoint coverage, evaluation-run completeness, and baseline-vs-candidate accuracy threshold. v1 reports and stops on regression; the recipe-search loop is deferred.
- Add NVFP4 quantization support for MiniMax-M3 (``minimax_m3_vl``, a ~428B MoE VLM). Its routed-experts container ``MiniMaxM3VLExperts`` follows the standard transformers 5.x fused-experts pattern (3-D ``gate_up_proj``/``down_proj`` + ``num_experts``) but applies SwiGLU inline rather than via an ``act_fn`` submodule; ``_is_fused_experts_module`` no longer requires ``act_fn`` (``_QuantFusedExperts`` never reads it), so these experts are wrapped as ``_QuantFusedExperts`` and calibrate/export through the existing fused path. Quantize with ``--qformat nvfp4_experts_only``; load via transformers >=5.12 native support (no ``trust_remote_code``).
- Add **streaming** speculative-decoding training (EAGLE3 / DFlash): the draft trains on base-model hidden states produced on the fly by a co-located ``vllm serve`` (no disk dump), moved trainer-side over NIXL RDMA, scaling to multi-node (dedicated serve replicas + DDP trainers). New launcher examples for NVFP4 Kimi-K2.5 / K2.6 on GB200/aarch64 under ``tools/launcher/examples/moonshotai/``.

0.45 (2026-06-xx)
Expand Down
4 changes: 3 additions & 1 deletion examples/llm_ptq/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
| GLM-4.7<sup>8</sup> | ✅ | - | - | - | ✅ |
| Kimi K2 | - | - | - | - | ✅ |
| MiniMax M2.1 | - | - | - | - | ✅ |
| MiniMax M3<sup>11</sup> | - | - | - | - | ✅ |
| GPT-OSS<sup>10</sup> | - | - | - | - | ✅ |
| T5 | ✅ | ✅ | ✅ | ✅ | - |
| Whisper<sup>9</sup> | ✅ | ❌ | ❌ | ❌ | - |
Expand All @@ -130,7 +131,8 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
> *<sup>7.</sup>[PTQ for DeepSeek](../deepseek/README.md)* \
> *<sup>8.</sup>GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* \
> *<sup>9.</sup>Running Whisper model with transformers>=5.0 requires [torchcodec](https://github.com/meta-pytorch/torchcodec?tab=readme-ov-file#installing-cuda-enabled-torchcodec) and other system packages (e.g. ffmpeg).* \
> *<sup>10.</sup>GPT-OSS ships with native MXFP4 weights; NVFP4 export is produced via the closed-form `--cast_mxfp4_to_nvfp4` cast (see [MXFP4 → NVFP4 cast](#mxfp4--nvfp4-cast-for-gpt-oss)).*
> *<sup>10.</sup>GPT-OSS ships with native MXFP4 weights; NVFP4 export is produced via the closed-form `--cast_mxfp4_to_nvfp4` cast (see [MXFP4 → NVFP4 cast](#mxfp4--nvfp4-cast-for-gpt-oss)).* \
> *<sup>11.</sup>MiniMax M3 (`minimax_m3_vl`) requires transformers >=5.12, which supports it natively; load it without `trust_remote_code`. The recommended recipe is `nvfp4_experts_only`, which quantizes the routed experts to NVFP4 and keeps attention, dense layers, shared experts, the vision tower, router/gate, embeddings, and lm_head in higher precision.*

> *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss, and the loss is usually more significant when the base model is small. If the accuracy after PTQ does not meet the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using [QAT](./../llm_qat/README.md) instead. For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only`, `nvfp4_experts_only`, or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP/expert layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.*

Expand Down
13 changes: 9 additions & 4 deletions modelopt/torch/quantization/plugins/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1442,17 +1442,22 @@ def _is_fused_experts_module(module):
"""Check if a module is a fused MoE expert container compatible with _QuantFusedExperts.

Detects the standardized HuggingFace transformers 5.0+ fused expert pattern:
``gate_up_proj`` (3-D parameter), ``down_proj`` (3-D parameter), ``num_experts``,
and ``act_fn``. Matches ``MixtralExperts``, ``Qwen2MoeExperts``,
``gate_up_proj`` (3-D parameter), ``down_proj`` (3-D parameter), and
``num_experts``. Matches ``MixtralExperts``, ``Qwen2MoeExperts``,
``Qwen3MoeExperts``, ``Qwen3_5MoeExperts``, ``DeepseekV3NaiveMoe``,
``JambaExperts``, ``OlmoeExperts``, etc.
``JambaExperts``, ``OlmoeExperts``, ``MiniMaxM3VLExperts``, etc.

``act_fn`` is intentionally NOT required: some fused-expert containers (e.g.
``MiniMaxM3VLExperts``) apply their gating activation inline rather than via an
``act_fn`` submodule. ``_QuantFusedExperts`` never reads ``act_fn`` (it only
intercepts ``F.linear``), so the activation form is irrelevant to detection.

Returns ``False`` for non-standard layouts (DBRX, GptOss, GraniteMoE,
Llama4TextExperts) which have their own explicit registrations.
"""
if not hasattr(module, "gate_up_proj") or not hasattr(module, "down_proj"):
return False
if not hasattr(module, "num_experts") or not hasattr(module, "act_fn"):
if not hasattr(module, "num_experts"):
return False
gate_up = getattr(module, "gate_up_proj")
down = getattr(module, "down_proj")
Expand Down
96 changes: 94 additions & 2 deletions tests/unit/torch/quantization/plugins/test_fused_experts.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,37 @@ def forward(self, hidden_states, top_k_index, top_k_weights):
return final_hidden_states


class _SyntheticFusedExpertsInlineSwiglu(_SyntheticFusedExperts):
    """Fused-experts container that computes SwiGLU inline instead of via ``act_fn``.

    Mimics transformers' ``MiniMaxM3VLExperts`` so tests can verify that detection and
    quantization do not depend on an ``act_fn`` submodule being present.
    """

    def __init__(self):
        super().__init__()
        # ``forward`` applies the gating activation itself, so remove the ``act_fn``
        # attribute left behind by the parent initializer.
        del self.act_fn

    def forward(self, hidden_states, top_k_index, top_k_weights):
        """Run each routed token through its selected experts and sum the weighted outputs."""
        output = torch.zeros_like(hidden_states)

        # Routing bookkeeping: find which experts were selected by at least one token.
        with torch.no_grad():
            one_hot = F.one_hot(top_k_index, num_classes=self.num_experts)
            expert_mask = one_hot.permute(2, 1, 0)
            per_expert_hits = expert_mask.sum(dim=(-1, -2))
            expert_hit = torch.greater(per_expert_hits, 0).nonzero()

        for hit in expert_hit:
            expert_idx = hit[0]
            if expert_idx == self.num_experts:
                continue
            top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
            routed_tokens = hidden_states[token_idx]

            # Inline SwiGLU: silu(gate) * up, with no ``self.act_fn`` involved.
            gate_up = F.linear(routed_tokens, self.gate_up_proj[expert_idx])
            gate, up = gate_up.chunk(2, dim=-1)
            activated = F.silu(gate) * up

            expert_out = F.linear(activated, self.down_proj[expert_idx])
            expert_out = expert_out * top_k_weights[token_idx, top_k_pos, None]
            output.index_add_(0, token_idx, expert_out.to(output.dtype))
        return output


class _SyntheticTopKRouter(nn.Module):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -145,12 +176,23 @@ def test_module_with_2d_gate_up_not_detected(self):
module.act_fn = nn.SiLU()
assert _is_fused_experts_module(module) is False

def test_module_missing_act_fn_not_detected(self):
def test_module_missing_act_fn_still_detected(self):
"""``act_fn`` is optional: e.g. ``MiniMaxM3VLExperts`` applies swiglu inline.

``_QuantFusedExperts`` only intercepts ``F.linear`` and never reads ``act_fn``,
so the structural detector must not require it.
"""
module = nn.Module()
module.gate_up_proj = nn.Parameter(torch.randn(4, 16, 8))
module.down_proj = nn.Parameter(torch.randn(4, 8, 16))
module.num_experts = 4
assert _is_fused_experts_module(module) is False
assert _is_fused_experts_module(module) is True

def test_inline_swiglu_fused_experts_detected(self):
    """The structural detector accepts fused experts that have no ``act_fn`` attribute."""
    experts = _SyntheticFusedExpertsInlineSwiglu()
    # Precondition: the synthetic container really lacks ``act_fn``.
    assert not hasattr(experts, "act_fn")
    assert _is_fused_experts_module(experts) is True

def test_sparse_moe_block_not_detected_as_fused(self):
block = _SyntheticSparseMoeBlock()
Expand Down Expand Up @@ -652,6 +694,56 @@ def forward_loop(m):

self._cleanup_registry(expert_type)

def test_inline_swiglu_experts_calibrate(self):
    """Inline-swiglu fused experts (no ``act_fn``) are converted and calibrated.

    Regression for ``MiniMaxM3VLExperts``: detection used to require ``act_fn``, so these
    experts were never wrapped and no quantizers were inserted.
    """
    model = _TinyMoEModel()
    model.moe.experts = _SyntheticFusedExpertsInlineSwiglu()
    expert_type = type(model.moe.experts)
    self._cleanup_registry(expert_type)

    # (quantizer name pattern, axis) for every 8-bit quantizer enabled in this test.
    enabled_quantizers = (
        ("*gate_up_proj_input_quantizer", None),
        ("*down_proj_input_quantizer", None),
        ("*gate_up_proj_weight_quantizer", 0),
        ("*down_proj_weight_quantizer", 0),
    )
    # Disable everything first, then switch on only the expert quantizers above.
    rules = [{"quantizer_name": "*", "enable": False}]
    rules.extend(
        {"quantizer_name": pattern, "cfg": {"num_bits": 8, "axis": axis}}
        for pattern, axis in enabled_quantizers
    )
    quant_cfg = {"quant_cfg": rules, "algorithm": "max"}

    def forward_loop(m):
        torch.manual_seed(0)
        for _ in range(2):
            m(torch.randn(1, 4, HIDDEN_DIM))

    mtq.quantize(model, quant_cfg, forward_loop=forward_loop)

    # Calibration must have populated amax on the shared input quantizers ...
    quantized = model.moe.experts
    assert quantized.gate_up_proj_input_quantizer.amax is not None
    assert quantized.down_proj_input_quantizer.amax is not None
    # ... and on every per-expert weight quantizer.
    for expert_id in range(NUM_EXPERTS):
        assert quantized.gate_up_proj_weight_quantizers[expert_id].amax is not None
        assert quantized.down_proj_weight_quantizers[expert_id].amax is not None

    self._cleanup_registry(expert_type)

def test_local_hessian_refines_per_expert_weights(self):
"""local_hessian captures each expert's routed activations and refines its weight amax."""
model = _TinyMoEModel()
Expand Down