
Commit a5a6e39

resmooth
Signed-off-by: weimingc <[email protected]>
1 parent 6dd1b87 commit a5a6e39

2 files changed: 193 additions, 42 deletions

modelopt/torch/export/quant_utils.py

Lines changed: 68 additions & 41 deletions
@@ -478,6 +478,8 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
 
     if input_quantizer is not None and hasattr(input_quantizer, "_pre_quant_scale"):
         return QUANTIZATION_NVFP4_AWQ
+    if getattr(layer, "fused_with_prequant", False):
+        return QUANTIZATION_NVFP4_AWQ
     assert input_quantizer is not None, (
         f"input_quantizer is None for {quantizer_attr_names}"
     )
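For context, a minimal sketch of why the new branch is needed (the _FakeLayer / _FakeQuantizer stand-ins below are hypothetical, not classes from the library): after pattern_fuse_prequant folds a layer's pre_quant_scale into an upstream weight, the input quantizer no longer carries _pre_quant_scale, so detection falls back to the fused_with_prequant flag set during fusion.

# Hypothetical stand-ins to illustrate the detection order; not the library's classes.
class _FakeQuantizer:
    pass


class _FakeLayer:
    def __init__(self, fused: bool):
        self.input_quantizer = _FakeQuantizer()
        if fused:
            # pattern_fuse_prequant removes _pre_quant_scale and sets this flag instead
            self.fused_with_prequant = True
        else:
            self.input_quantizer._pre_quant_scale = 1.0


def _looks_like_awq(layer) -> bool:
    # Mirrors the updated branch: a scale on the input quantizer OR the fused flag.
    if hasattr(layer.input_quantizer, "_pre_quant_scale"):
        return True
    return getattr(layer, "fused_with_prequant", False)


assert _looks_like_awq(_FakeLayer(fused=False))
assert _looks_like_awq(_FakeLayer(fused=True))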
@@ -937,7 +939,7 @@ def all_items_same(item_list):
 
 
 # TODO: make this more general instead of rule based
-def pattern_fuse_prequant(model: torch.nn.Module):
+def pattern_fuse_prequant(model: torch.nn.Module, fuse_mismatch_dim=False):
     """Fuse pre_quant_scale to the linear weights.
 
     For example, we can fuse the pre_quant_scale of o_proj to the output_dimension of v_proj, such that
@@ -951,10 +953,29 @@ def pattern_fuse_prequant(model: torch.nn.Module):
     the pre_quant_scale is averaged across the repeated head groups and then the
     o_proj's pre_quant_scale is updated to maintain mathematical equivalence.
 
+    Args:
+        model: The model to fuse pre_quant_scale into.
+        fuse_mismatch_dim: If True, fuse the pre_quant_scale even if the dimensions of the
+            pre_quant_scale and the linear weight do not match. This is useful for GQA/MQA
+            models but may lead to an accuracy drop.
+
     Note:
         This is an experimental feature, and it might mess up the quantization errors
         of fused linear modules.
     """
+    # For MoE models, first resmooth the w1 and w3 in the experts to get the averaged pre_quant_scale
+    for _, module in model.named_modules():
+        if (
+            hasattr(module, "experts")
+            and "Qwen3MoeSparseMoeBlock".lower() in type(module).__name__.lower()
+        ):
+            linear_list = []
+            linear_list.extend([getattr(expert, "up_proj") for expert in module.experts])
+            linear_list.extend([getattr(expert, "gate_proj") for expert in module.experts])
+            preprocess_linear_fusion(linear_list, resmooth_only=True)
+
+    # import pdb; pdb.set_trace()
+    # Fuse pre_quant_scale to the linear weights
     for _, module in model.named_modules():
         for module_map in PQS_FUSE_MODULE_MAPPING:
             target_module_list = module_map[0]
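The resmoothing pass added above gives every expert's gate_proj and up_proj a single shared pre_quant_scale before any fusion happens. Below is a minimal numerical sketch of the idea with toy tensors; it re-implements the concept and is not the library's preprocess_linear_fusion: pick one common scale (here the mean of the calibrated scales) and fold old_scale / new_scale into each weight, so every linear produces the same output on the commonly smoothed input.

import torch

torch.manual_seed(0)
x = torch.randn(8)                                # shared input to the expert linears
weights = [torch.randn(16, 8) for _ in range(2)]  # toy "gate_proj" / "up_proj" weights
scales = [torch.rand(8) + 0.5 for _ in range(2)]  # per-linear AWQ pre_quant_scales

reference = [w @ (s * x) for w, s in zip(weights, scales)]

# Resmooth: one averaged pre_quant_scale for all, ratio folded into the weights.
shared_scale = torch.stack(scales).mean(dim=0)
resmoothed = [w * (s / shared_scale) for w, s in zip(weights, scales)]

for w, ref in zip(resmoothed, reference):
    assert torch.allclose(w @ (shared_scale * x), ref, atol=1e-5)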
@@ -967,52 +988,58 @@ def pattern_fuse_prequant(model: torch.nn.Module):
                 ):
                     pre_quant_scale = linear_pqs_from.input_quantizer._pre_quant_scale
 
-                    # for GQA/MQA models, we apply averaging to the pre_quant_scale
-                    if pre_quant_scale.numel() != linear_fuse_into.weight.shape[0]:
-                        if "attention" not in type(module).__name__.lower():
-                            continue
-                        else:
-                            config = module.config
-                            num_kv_heads = config.num_key_value_heads
-                            kv_head_dim = linear_fuse_into.weight.shape[0] // num_kv_heads
-                            n_rep = pre_quant_scale.numel() // num_kv_heads // kv_head_dim
-
-                            # Reshape: (num_kv_heads, n_rep, kv_head_dim)
-                            averaged_scale = pre_quant_scale.view(
-                                num_kv_heads, n_rep, kv_head_dim
-                            ).mean(dim=1)
-
-                            # To update o_proj, we need to repeat back to the original shape
-                            repeated_scale = (
-                                averaged_scale.unsqueeze(1)
-                                .expand(num_kv_heads, n_rep, kv_head_dim)
-                                .reshape(-1)
+                    # for GQA/MQA models, we apply averaging to the pre_quant_scale for shared head groups
+                    if pre_quant_scale.numel() != linear_fuse_into.weight.shape[-2]:
+                        if (
+                            not fuse_mismatch_dim
+                            or "attention" not in type(module).__name__.lower()
+                        ):
+                            warn(
+                                f"Skipping pattern fuse prequant for {type(module).__name__}: "
+                                f"pqs dim {pre_quant_scale.numel()} != out_ch dim {linear_fuse_into.weight.shape[-2]}"
                             )
+                            continue
+                        config = module.config
+                        num_kv_heads = config.num_key_value_heads
+                        kv_head_dim = linear_fuse_into.weight.shape[0] // num_kv_heads
+                        n_rep = pre_quant_scale.numel() // num_kv_heads // kv_head_dim
+
+                        # Reshape: (num_kv_heads, n_rep, kv_head_dim)
+                        averaged_scale = pre_quant_scale.view(
+                            num_kv_heads, n_rep, kv_head_dim
+                        ).mean(dim=1)
+
+                        # To update o_proj, we need to repeat back to the original shape
+                        repeated_scale = (
+                            averaged_scale.unsqueeze(1)
+                            .expand(num_kv_heads, n_rep, kv_head_dim)
+                            .reshape(-1)
+                        )
 
-                            def _update_pre_quant_scale(module, new_pre_quant_scale):
-                                old_pre_quant_scale = module.input_quantizer._pre_quant_scale
-                                module.weight = nn.Parameter(
-                                    module.weight
-                                    * old_pre_quant_scale.to(
-                                        dtype=module.weight.dtype, device=module.weight.device
-                                    )
-                                    / new_pre_quant_scale.to(
-                                        dtype=module.weight.dtype, device=module.weight.device
-                                    )
+                        def _update_pre_quant_scale(module, new_pre_quant_scale):
+                            old_pre_quant_scale = module.input_quantizer._pre_quant_scale
+                            module.weight = nn.Parameter(
+                                module.weight
+                                * old_pre_quant_scale.to(
+                                    dtype=module.weight.dtype, device=module.weight.device
+                                )
+                                / new_pre_quant_scale.to(
+                                    dtype=module.weight.dtype, device=module.weight.device
                                 )
-                                module.input_quantizer.pre_quant_scale = new_pre_quant_scale
+                            )
+                            module.input_quantizer.pre_quant_scale = new_pre_quant_scale
 
-                                # Redo weights collection
-                                module.weight_quantizer.reset_amax()
-                                enable_stats_collection(module.weight_quantizer)
-                                module.weight_quantizer(module.weight)
-                                finish_stats_collection(module.weight_quantizer)
+                            # Redo weights collection
+                            module.weight_quantizer.reset_amax()
+                            enable_stats_collection(module.weight_quantizer)
+                            module.weight_quantizer(module.weight)
+                            finish_stats_collection(module.weight_quantizer)
 
-                            # Update o_proj's pre_quant_scale
-                            _update_pre_quant_scale(linear_pqs_from, repeated_scale)
+                        # Update o_proj's pre_quant_scale
+                        _update_pre_quant_scale(linear_pqs_from, repeated_scale)
 
-                            # Use averaged scale (flattened) for v_proj fusion
-                            pre_quant_scale = averaged_scale.reshape(-1)
+                        # Use averaged scale (flattened) for v_proj fusion
+                        pre_quant_scale = averaged_scale.reshape(-1)
 
                     # Fuse the pre_quant_scale to v_proj weight
                     linear_fuse_into.weight = torch.nn.Parameter(
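The GQA/MQA branch rewritten above rests on a small identity, sketched below with toy sizes (hypothetical tensors, not code from the repo): averaging o_proj's pre_quant_scale within each group of repeated kv heads, re-expanding it for o_proj, and folding old_scale / new_scale into o_proj's weight leaves o_proj's output unchanged, while the compact averaged scale is what then gets fused into v_proj's output channels.

import torch

torch.manual_seed(0)
num_kv_heads, n_rep, kv_head_dim = 2, 3, 4
hidden = num_kv_heads * n_rep * kv_head_dim        # o_proj input dim (24)

o_weight = torch.randn(8, hidden)                  # toy o_proj weight: hidden -> 8
scale = torch.rand(hidden) + 0.5                   # original per-channel pre_quant_scale
x = torch.randn(hidden)                            # attention output feeding o_proj

# Average within each (kv_head, channel) group, then repeat back for o_proj.
averaged_scale = scale.view(num_kv_heads, n_rep, kv_head_dim).mean(dim=1)
repeated_scale = (
    averaged_scale.unsqueeze(1).expand(num_kv_heads, n_rep, kv_head_dim).reshape(-1)
)

# _update_pre_quant_scale folds old/new into the weight, so o_proj stays equivalent.
o_weight_updated = o_weight * (scale / repeated_scale)

before = o_weight @ (scale * x)
after = o_weight_updated @ (repeated_scale * x)
assert torch.allclose(before, after, atol=1e-5)

# averaged_scale.reshape(-1) matches v_proj's output channels and is what gets
# multiplied into v_proj's weight rows.
assert averaged_scale.reshape(-1).numel() == num_kv_heads * kv_head_dim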

tests/gpu/torch/export/test_quant_utils.py

Lines changed: 125 additions & 1 deletion
@@ -74,7 +74,7 @@ def test_pattern_fuse_prequant(quant_config, attention_kv_heads_pair):
     ]
 
     # Apply fusion
-    pattern_fuse_prequant(model)
+    pattern_fuse_prequant(model, fuse_mismatch_dim=True)
 
     # Check if pre_quant_scale and fused_with_prequant flag are removed correctly
     for target_module_name in traget_module_name_list:
@@ -97,3 +97,127 @@ def test_pattern_fuse_prequant(quant_config, attention_kv_heads_pair):
     assert torch.allclose(
         output_before_fuse.logits, output_after_fuse.logits, rtol=1e-1, atol=5e-1
     ), "Output should be the same before and after fusion"
+
+
+# TODO: add test for Qwen3MoeSparseMoeBlock MLP fusion
+
+
+@pytest.mark.parametrize(
+    "quant_config",
+    [
+        mtq.INT4_AWQ_CFG,
+        mtq.NVFP4_AWQ_LITE_CFG,
+    ],
+)
+def test_pattern_fuse_prequant_moe(quant_config):
+    """Test pattern_fuse_prequant on the Qwen3 MoE sparse MLP."""
+    pytest.importorskip("transformers", minversion="4.46.0")
+    from transformers import Qwen3MoeConfig, Qwen3MoeForCausalLM
+
+    # Create a tiny Qwen3 MoE model for testing
+    config = Qwen3MoeConfig(
+        hidden_size=128,
+        intermediate_size=256,
+        moe_intermediate_size=256,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=4,
+        num_experts=4,
+        num_experts_per_tok=2,
+        max_position_embeddings=128,
+        vocab_size=256,
+        shared_expert_intermediate_size=256,
+    )
+    model = Qwen3MoeForCausalLM(config).to("cuda")
+
+    # Quantize the model
+    dummy_input = torch.randint(0, 256, (1, 16), device="cuda")
+    mtq.quantize(model, quant_config, lambda m: m(dummy_input))
+
+    # Collect MoE expert modules to verify (down_proj should be fused)
+    moe_down_proj_modules = []
+    moe_gate_proj_modules = []
+    moe_up_proj_modules = []
+    for name, module in model.named_modules():
+        if "mlp" in name and "experts" in name:
+            if "gate_proj" in name and not any(x in name for x in ["weight", "quantizer"]):
+                moe_gate_proj_modules.append((name, module))
+            elif "down_proj" in name and not any(x in name for x in ["weight", "quantizer"]):
+                moe_down_proj_modules.append((name, module))
+            elif "up_proj" in name and not any(x in name for x in ["weight", "quantizer"]):
+                moe_up_proj_modules.append((name, module))
+
+    # Verify experts have pre_quant_scale before fusion
+    for name, module in moe_gate_proj_modules:
+        if hasattr(module, "input_quantizer"):
+            assert hasattr(module.input_quantizer, "_pre_quant_scale"), (
+                f"{name}: gate_proj should have pre_quant_scale before fusion"
+            )
+
+    for name, module in moe_up_proj_modules:
+        if hasattr(module, "input_quantizer"):
+            assert hasattr(module.input_quantizer, "_pre_quant_scale"), (
+                f"{name}: up_proj should have pre_quant_scale before fusion"
+            )
+
+    for name, module in moe_down_proj_modules:
+        if hasattr(module, "input_quantizer"):
+            assert hasattr(module.input_quantizer, "_pre_quant_scale"), (
+                f"{name}: down_proj should have pre_quant_scale before fusion"
+            )
+
+    # Run forward pass before fusion
+    model.eval()
+    with torch.no_grad():
+        output_before_fuse = model(dummy_input)
+
+    # Apply fusion (fuse_mismatch_dim is only needed for GQA/MQA attention, not for MLP)
+    pattern_fuse_prequant(model)
+
+    # Check if down_proj's pre_quant_scale was removed and fused into up_proj
+    for name, module in moe_down_proj_modules:
+        if hasattr(module, "input_quantizer"):
+            # Verify pre_quant_scale was removed from down_proj
+            assert not hasattr(module.input_quantizer, "_pre_quant_scale"), (
+                f"{name}: down_proj pre_quant_scale should be removed after fusion"
+            )
+            # Verify the fused_with_prequant flag was set
+            assert hasattr(module, "fused_with_prequant") and module.fused_with_prequant, (
+                f"{name}: down_proj should have fused_with_prequant flag set"
+            )
+
+    # Verify that gate_proj and up_proj still have pre_quant_scale and are resmoothed
+    for name, module in model.named_modules():
+        if "Qwen3MoeSparseMoeBlock".lower() in type(module).__name__.lower():
+            first_gate_scale = getattr(
+                getattr(module, "experts")[0], "gate_proj"
+            ).input_quantizer._pre_quant_scale
+            first_up_scale = getattr(
+                getattr(module, "experts")[0], "up_proj"
+            ).input_quantizer._pre_quant_scale
+
+            # gate_proj and up_proj should have the same scale after resmoothing
+            assert torch.allclose(first_gate_scale, first_up_scale), (
+                "gate_proj and up_proj should have the same pre_quant_scale after resmoothing"
+            )
+
+            # All experts should have the same gate_proj and up_proj scales
+            for i, expert in enumerate(getattr(module, "experts")):
+                gate_scale = getattr(expert, "gate_proj").input_quantizer._pre_quant_scale
+                up_scale = getattr(expert, "up_proj").input_quantizer._pre_quant_scale
+
+                assert torch.allclose(gate_scale, first_gate_scale), (
+                    f"Expert {i} gate_proj scale should match expert 0"
+                )
+                assert torch.allclose(up_scale, first_up_scale), (
+                    f"Expert {i} up_proj scale should match expert 0"
+                )
+
+    # Verify output is close to the original output
+    with torch.no_grad():
+        output_after_fuse = model(dummy_input)
+
+    # There will be some difference due to quantization errors after pre_quant_scale fusion
+    assert torch.allclose(
+        output_before_fuse.logits, output_after_fuse.logits, rtol=1e-1, atol=5e-1
+    ), "Output should be similar before and after Qwen3 MoE fusion"
