From ddfbfd2f348a01cfd411bf9f79e8476bb50f440c Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Tue, 14 Apr 2026 14:04:20 +0800
Subject: [PATCH 1/7] add LpNormalization

---
 .../normalization_input_generator.py          | 48 +++++++++++--------
 tests/unit/analyze/core/test_qdq.py           |  1 +
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py
index a14d1107e..72b3c4382 100644
--- a/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py
@@ -421,27 +421,33 @@ def get_qdq_config(self) -> dict[str, QDQParameterConfig] | None:
 # ============================================================================
 # LpNormalization - NOT IMPLEMENTED in ONNXRuntime
 # ============================================================================
-#
-# NOTE: LpNormalization(22) exists in the ONNX spec but is NOT IMPLEMENTED
-# in ONNXRuntime as of the current version. The validation fails with:
-# "NOT_IMPLEMENTED: Could not find an implementation for LpNormalization(22)"
-#
-# Uncomment and use the implementation below when runtime support is added:
-#
-# @register_runtime_checker_op
-# class LpNormalizationInputGenerator(NormalizationInputGenerator):
-#     """Input generator for LpNormalization operator."""
-#     op_name = "LpNormalization"
-#     def get_finite_attribute_sets(self) -> dict[str, list]:
-#         return {"p": [1, 2]}
-#     def get_input_and_infinite_attribute_combinations(self) -> list[dict[str, InputConstraint]]:
-#         combinations = []
-#         for shape in self.get_common_data_shapes():
-#             if len(shape) < 3:
-#                 continue
-#             # TODO: add axis
-#             combinations.append({"input": InputShapeConstraint(shape)})
-#         return combinations
+
+
+@register_runtime_checker_op
+class LpNormalizationInputGenerator(NormalizationInputGenerator):
+    """Input generator for LpNormalization operator."""
+
+    op_name = "LpNormalization"
+
+    def get_finite_attribute_sets(self) -> dict[str, list]:
+        """Return finite attribute values for LpNormalization."""
+        return {"p": [1, 2]}
+
+    def get_input_and_infinite_attribute_combinations(self) -> list[dict[str, InputConstraint]]:
+        """Return input combinations for LpNormalization."""
+        combinations = []
+        for shape in self.get_common_data_shapes():
+            if len(shape) < 3:
+                continue
+            # TODO: add axis
+            combinations.append({"input": InputShapeConstraint(shape)})
+        return combinations
+
+    def get_qdq_config(self) -> dict[str, QDQParameterConfig] | None:
+        """Return QDQ configuration for LpNormalization operator inputs."""
+        return {
+            self.op_input_names[0]: QDQParameterConfig(support_activation=True),
+        }
 
 
 # ============================================================================
diff --git a/tests/unit/analyze/core/test_qdq.py b/tests/unit/analyze/core/test_qdq.py
index 14f9e4152..6e53e4275 100644
--- a/tests/unit/analyze/core/test_qdq.py
+++ b/tests/unit/analyze/core/test_qdq.py
@@ -1106,6 +1106,7 @@ class TestIterQDQCombinations:
             ("GlobalAveragePool", 3 * 4),  # 12
             ("InstanceNormalization", 3 * 16),  # 48
             ("LayerNormalization", 5 * 2 * 2 * 16),  # 320
+            ("LpNormalization", 3 * 2 * 4),  # 24: 3 shapes (>=3D) x 2 p-values x 4 act types
             ("MatMul", 36 * (16 * 2 - 4 + 4)),  # 1152: +4/shape for B=INT4
             (
                 "MaxPool",

From 9da9c43608aa8487c013d41d320ffbe6d388d0a7 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Tue, 14 Apr 2026 14:30:09 +0800
Subject: [PATCH 2/7] add axis candidates

---
 .../pattern/op_input_gen/normalization_input_generator.py    | 5 +++--
 tests/unit/analyze/core/test_qdq.py                          | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py
index 72b3c4382..53c0440da 100644
--- a/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/normalization_input_generator.py
@@ -439,8 +439,9 @@ def get_input_and_infinite_attribute_combinations(self) -> list[dict[str, InputC
         for shape in self.get_common_data_shapes():
             if len(shape) < 3:
                 continue
-            # TODO: add axis
-            combinations.append({"input": InputShapeConstraint(shape)})
+            combinations.extend(
+                {"input": InputShapeConstraint(shape), "axis": axis} for axis in [0, 1, -1, 2]
+            )
         return combinations
 
     def get_qdq_config(self) -> dict[str, QDQParameterConfig] | None:
diff --git a/tests/unit/analyze/core/test_qdq.py b/tests/unit/analyze/core/test_qdq.py
index 6e53e4275..fe68f2b19 100644
--- a/tests/unit/analyze/core/test_qdq.py
+++ b/tests/unit/analyze/core/test_qdq.py
@@ -1106,7 +1106,7 @@ class TestIterQDQCombinations:
             ("GlobalAveragePool", 3 * 4),  # 12
             ("InstanceNormalization", 3 * 16),  # 48
             ("LayerNormalization", 5 * 2 * 2 * 16),  # 320
-            ("LpNormalization", 3 * 2 * 4),  # 24: 3 shapes (>=3D) x 2 p-values x 4 act types
+            ("LpNormalization", 3 * 2 * 4 * 4),  # 96: 3 shapes (>=3D) x 2 p x 4 axis x 4 act types
             ("MatMul", 36 * (16 * 2 - 4 + 4)),  # 1152: +4/shape for B=INT4
             (
                 "MaxPool",

From 9c263cda92d3df2431e7be8152b338379d6dc905 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Tue, 14 Apr 2026 16:18:01 +0800
Subject: [PATCH 3/7] add GatherBlockQuantized

---
 .../modelkit/pattern/op_input_gen/__init__.py |   1 +
 .../op_input_gen/indexing_input_generator.py  | 155 ++++++++++++++++++
 tests/unit/analyze/core/test_qdq.py           |  27 +++
 3 files changed, 183 insertions(+)

diff --git a/src/winml/modelkit/pattern/op_input_gen/__init__.py b/src/winml/modelkit/pattern/op_input_gen/__init__.py
index 0ae982453..7de347d8d 100644
--- a/src/winml/modelkit/pattern/op_input_gen/__init__.py
+++ b/src/winml/modelkit/pattern/op_input_gen/__init__.py
@@ -10,6 +10,7 @@
 from .flatten_input_generator import FlattenInputGenerator
 from .global_pooling_input_generator import *
 from .indexing_input_generator import (
+    GatherBlockQuantizedInputGenerator,
     GatherInputGenerator,
     ScatterNDInputGenerator,
     SplitInputGenerator,
diff --git a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
index 045cf678d..f37fb268d 100644
--- a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
@@ -7,15 +7,18 @@
 This module contains input generators for operators that perform indexing
 and shape manipulation operations:
 - Gather: Gathers entries along an axis using indices
+- GatherBlockQuantized: Fused gather + block-wise dequantize (com.microsoft)
 - ScatterND: Scatters updates into a copy of data at specified indices
 - Unsqueeze: Inserts single-dimensional entries to shape
 - Split: Splits a tensor into multiple outputs
 """
 
+import math
 from typing import Any
 
 import numpy as np
 
+from ...onnx import SupportedONNXType
 from .op_input_gen import (
     InputConstraint,
     InputShapeConstraint,
@@ -858,3 +861,155 @@ def get_qdq_config(self):
             "input": QDQParameterConfig(support_activation=True),
             "split": QDQParameterConfig(support_non_qdq=True),
         }
+
+
+@register_runtime_checker_op
+class GatherBlockQuantizedInputGenerator(OpInputGenerator):
+    """Input generator for com.microsoft::GatherBlockQuantized operator.
+
+    GatherBlockQuantized is a fused gather + block-wise dequantize operator.
+    It gathers rows from a quantized weight tensor and dequantizes them on the fly.
+
+    Inputs:
+    - data (T1): Block-wise quantized weight (INT4/UINT4/UINT8), always a constant initializer
+    - indices (Tind): Gather indices (INT32/INT64), the runtime input
+    - scales (T2): Dequantization scales (FLOAT/FLOAT16), always a constant initializer
+    - zero_points (T1, optional): Dequantization zero points, always a constant initializer
+
+    Attributes:
+    - bits: 4 for INT4/UINT4 data, 8 for UINT8 data
+    - block_size: Quantization block size (power of 2, >= 16)
+    - gather_axis: Axis to gather on (UINT8 requires gather_axis=0)
+    - quantize_axis: Axis along which data was quantized (must differ from gather_axis)
+
+    Output (T2): Dequantized gathered tensor.
+
+    Since this op is already a fused dequantize+gather, it does not use external
+    QDQ wrapping (no get_qdq_config). The type combinations (T1/T2/Tind) and the
+    coupling between bits and T1 are enumerated explicitly in iter().
+
+    Coverage:
+    - T1: INT4 (bits=4), UINT4 (bits=4), UINT8 (bits=8)
+    - T2: FLOAT, FLOAT16
+    - Tind: INT32, INT64
+    - block_size: 16, 32
+    - gather_axis: 0, 1 for INT4/UINT4; 0 only for UINT8 (spec constraint)
+    - zero_points: present / absent (doubles the count)
+
+    Count: 2 INT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp  = 32
+         + 2 UINT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 32
+         + 1 UINT8 gather_axis x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 16
+         = 80
+    """
+
+    op_name = "GatherBlockQuantized"
+    expand_optionals = False  # zero_points presence is enumerated explicitly in iter()
+
+    def get_finite_attribute_sets(self) -> dict[str, list]:
+        """Not used: attribute enumeration is handled in iter() to couple bits with T1 type."""
+        return {}
+
+    def get_input_and_infinite_attribute_combinations(self) -> list[dict]:
+        """Not used: combinations are enumerated directly in iter()."""
+        return []
+
+    def _iter_constant_combinations(self, kwargs: dict) -> Any:
+        """Yield one constant map: data/scales/zero_points are weights; indices is runtime."""
+        is_constant_map = {
+            k: k != "indices" for k, v in kwargs.items() if self._is_input_key(k) and v is not None
+        }
+        yield is_constant_map
+
+    def iter(self) -> Any:
+        """Enumerate all valid (T1, bits, T2, Tind, shape, axis, block_size, zp) combos."""
+        import ml_dtypes
+
+        # One representative 2-D embedding-style data shape
+        data_shape = (32, 64)
+
+        block_sizes = [16, 32]
+        t2_types = [
+            (SupportedONNXType.FLOAT.np_type, SupportedONNXType.FLOAT.annotation),
+            (SupportedONNXType.FLOAT16.np_type, SupportedONNXType.FLOAT16.annotation),
+        ]
+        tind_types = [
+            (np.int32, SupportedONNXType.INT32.annotation),
+            (np.int64, SupportedONNXType.INT64.annotation),
+        ]
+        # (np_dtype, annotation, bits, valid_gather_axes)
+        t1_configs = [
+            (np.dtype(ml_dtypes.int4), SupportedONNXType.INT4.annotation, 4, [0, 1]),
+            (np.dtype(ml_dtypes.uint4), SupportedONNXType.UINT4.annotation, 4, [0, 1]),
+            (np.dtype(np.uint8), SupportedONNXType.UINT8.annotation, 8, [0]),
+        ]
+        rng = np.random.default_rng(42)
+        indices_shape = (2, 4)
+
+        for t1_dtype, t1_annotation, bits, gather_axes in t1_configs:
+            for gather_axis in gather_axes:
+                quantize_axis = 1 - gather_axis  # 2-D: the other axis
+                for block_size in block_sizes:
+                    sc_dims: list[int] = list(data_shape)
+                    sc_dims[quantize_axis] = math.ceil(sc_dims[quantize_axis] / block_size)
+                    sc_shape = tuple(sc_dims)
+                    axis_size = data_shape[gather_axis]
+
+                    for t2_dtype, t2_annotation in t2_types:
+                        for tind_dtype, tind_annotation in tind_types:
+                            for zero_points_present in [False, True]:
+                                data_val = rng.integers(
+                                    0, 7, size=data_shape, dtype=np.int8
+                                ).astype(t1_dtype)
+                                scales_val = rng.random(sc_shape).astype(t2_dtype)
+                                indices_val = rng.integers(
+                                    0, axis_size, size=indices_shape, dtype=tind_dtype
+                                )
+
+                                kwargs: dict[str, Any] = {
+                                    "data": data_val,
+                                    "indices": indices_val,
+                                    "scales": scales_val,
+                                    "bits": bits,
+                                    "block_size": block_size,
+                                    "gather_axis": gather_axis,
+                                    "quantize_axis": quantize_axis,
+                                }
+                                if zero_points_present:
+                                    kwargs["zero_points"] = np.zeros(sc_shape, dtype=t1_dtype)
+
+                                type_vars = {
+                                    f"T1_{self.op_name}": t1_annotation,
+                                    f"Tind_{self.op_name}": tind_annotation,
+                                    f"T2_{self.op_name}": t2_annotation,
+                                }
+                                attrs = {
+                                    k: v for k, v in kwargs.items() if k in self.op_attribute_names
+                                }
+                                input_constraints = {
+                                    k: {"type": "shape", "shape": list(v.shape)}
+                                    for k, v in kwargs.items()
+                                    if self._is_input_key(k) and v is not None
+                                }
+                                tags = {
+                                    self.type_vars_key: type_vars,
+                                    "input_constraints": input_constraints,
+                                    "attrs": attrs,
+                                }
+                                yield self.filter_kwargs_by_opset(kwargs), tags
+
+    def derive_properties(self, properties: dict) -> dict:
+        """Derive filter properties from node inputs and attributes."""
+        item = properties.copy()
+        item["data_dim"] = len(item.get("data_shape", ()))
+        item["indices_dim"] = len(item.get("indices_shape", ()))
+        return item
+
+    def get_infinite_property_names(self) -> list[str]:
+        """Return names of properties with infinite possible values."""
+        return [
+            "data_shape",
+            "indices_shape",
+            "attr_gather_axis",
+            "attr_quantize_axis",
+            "attr_block_size",
+        ]
diff --git a/tests/unit/analyze/core/test_qdq.py b/tests/unit/analyze/core/test_qdq.py
index fe68f2b19..eeae70266 100644
--- a/tests/unit/analyze/core/test_qdq.py
+++ b/tests/unit/analyze/core/test_qdq.py
@@ -1170,3 +1170,30 @@ def test_qdq_total_count(self, op_name: str, expected_count: int) -> None:
 
         # For rerun, could track in https://github.com/gim-home/ModelKit/issues/278
         assert count == expected_count, "If changes, either bug or need to rerun"
+
+
+class TestIterCOMSOpsModels:
+    """Tests for com.microsoft domain ops that use explicit type enumeration (no QDQ wrapping)."""
+
+    @pytest.mark.parametrize(
+        "op_name,expected_count",
+        [
+            # 2 INT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp  = 32
+            # 2 UINT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 32
+            # 1 UINT8 gather_axis x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 16
+            ("GatherBlockQuantized", 80),
+        ],
+    )
+    def test_com_microsoft_op_model_count(self, op_name: str, expected_count: int) -> None:
+        """Test total model count for com.microsoft ops (no QDQ wrapping)."""
+        from winml.modelkit.pattern.op_input_gen import get_runtime_checker_op
+
+        schema = ONNXDomain.COM_MICROSOFT.get_op_schema(op_name, 1)
+        generator = get_runtime_checker_op(op_name)(schema)  # no qdq_generator
+
+        count = 0
+        for kwargs, tags in generator.iter():
+            for _model, _final_tags in generator.iter_const_and_dynamic_models(kwargs, tags):
+                count += 1
+
+        assert count == expected_count, "If count changes, update both code and this comment"

From a1de477a4c8b6b32018e724539de012b9ea269ea Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Tue, 14 Apr 2026 16:34:12 +0800
Subject: [PATCH 4/7] skip CPU validation for GatherBlockQuantized

---
 .../pattern/op_input_gen/indexing_input_generator.py  | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
index f37fb268d..6ed05f92d 100644
--- a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
@@ -997,6 +997,17 @@ def iter(self) -> Any:
                                 }
                                 yield self.filter_kwargs_by_opset(kwargs), tags
 
+    def _run_op_on_cpu(self, kwargs: dict, tags: dict) -> Any:
+        """Skip CPU validation for GatherBlockQuantized.
+
+        This op is a com.microsoft fused op not supported by the CPU EP.
+        The quantized data inputs (INT4/UINT4/UINT8) are constant initializers
+        and cannot be fed as runtime inputs; the base class builds an all-dynamic
+        model for CPU validation, which would fail on sub-byte dtypes.
+        Our combinations are valid by construction, so CPU pre-validation is not needed.
+        """
+        return []
+
     def derive_properties(self, properties: dict) -> dict:
         """Derive filter properties from node inputs and attributes."""
         item = properties.copy()

From 6c05e0e5391c79cc672c0fa38056b40c5de61ee5 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Wed, 15 Apr 2026 10:33:24 +0800
Subject: [PATCH 5/7] add qdq

---
 .../runtime_checker/result_processor.py       |  3 +-
 .../op_input_gen/indexing_input_generator.py  | 34 ++++++++++++++-----
 tests/unit/analyze/core/test_qdq.py           | 19 ++++++-----
 3 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/src/winml/modelkit/analyze/runtime_checker/result_processor.py b/src/winml/modelkit/analyze/runtime_checker/result_processor.py
index 07e7f10a5..d6eab9645 100644
--- a/src/winml/modelkit/analyze/runtime_checker/result_processor.py
+++ b/src/winml/modelkit/analyze/runtime_checker/result_processor.py
@@ -611,7 +611,7 @@ def get_opset_version_range(op_name: str, start_opset_version: int, op_domain: s
     target_domain = "" if args.opset_domain == "ai.onnx" else args.opset_domain
     domain_str_for_filename = args.opset_domain  # Keep original for filename matching
 
-    json_files = list(input_dir.rglob("*.json"))
+    json_files = list(input_dir.glob("*.json"))
 
     if not json_files:
         print(f"No JSON files found in {input_dir}")
@@ -694,7 +694,6 @@ def get_opset_version_range(op_name: str, start_opset_version: int, op_domain: s
                 f"_opset{since_version}{qdq_suffix}.json"
             )
             json_file = input_dir / expected_filename
-
             print(f"Processing {expected_filename}...", end=" ")
 
             if not json_file.exists():
diff --git a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
index 6ed05f92d..89335e62a 100644
--- a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
@@ -884,11 +884,12 @@ class GatherBlockQuantizedInputGenerator(OpInputGenerator):
 
     Output (T2): Dequantized gathered tensor.
 
-    Since this op is already a fused dequantize+gather, it does not use external
-    QDQ wrapping (no get_qdq_config). The type combinations (T1/T2/Tind) and the
-    coupling between bits and T1 are enumerated explicitly in iter().
+    The op's inputs (INT4/UINT4/UINT8 data, indices, scales, optional zero_points) are
+    not wrapped by external DQ nodes — they are already quantized.  The float output can
+    be followed by a QuantizeLinear node, so get_qdq_config() marks the output as
+    support_activation=True and all inputs as support_non_qdq (pass-through).
 
-    Coverage:
+    Coverage (base models, no QDQ):
     - T1: INT4 (bits=4), UINT4 (bits=4), UINT8 (bits=8)
     - T2: FLOAT, FLOAT16
     - Tind: INT32, INT64
@@ -896,10 +897,12 @@ class GatherBlockQuantizedInputGenerator(OpInputGenerator):
     - gather_axis: 0, 1 for INT4/UINT4; 0 only for UINT8 (spec constraint)
     - zero_points: present / absent (doubles the count)
 
-    Count: 2 INT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp  = 32
-         + 2 UINT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 32
-         + 1 UINT8 gather_axis x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 16
-         = 80
+    Base count: 2 INT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp  = 32
+              + 2 UINT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 32
+              + 1 UINT8 gather_axis x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 16
+              = 80
+
+    QDQ models (Q on output, T2=FLOAT only): 40 base x 4 activation types = 160
     """
 
     op_name = "GatherBlockQuantized"
@@ -913,6 +916,21 @@ def get_input_and_infinite_attribute_combinations(self) -> list[dict]:
         """Not used: combinations are enumerated directly in iter()."""
         return []
 
+    def get_qdq_config(self) -> dict[str, QDQParameterConfig]:
+        """Return QDQ config: output wrappable by Q; all inputs are pass-through.
+
+        GatherBlockQuantized inputs are already quantized (INT4/UINT4/UINT8) and
+        must not be wrapped by DQ nodes.  Only the float output can be followed by
+        a QuantizeLinear node (support_activation).
+        """
+        return {
+            "data": QDQParameterConfig(support_non_qdq=True),
+            "indices": QDQParameterConfig(support_non_qdq=True),
+            "scales": QDQParameterConfig(support_non_qdq=True),
+            "zero_points": QDQParameterConfig(support_non_qdq=True),
+            "output": QDQParameterConfig(support_activation=True),
+        }
+
     def _iter_constant_combinations(self, kwargs: dict) -> Any:
         """Yield one constant map: data/scales/zero_points are weights; indices is runtime."""
         is_constant_map = {
diff --git a/tests/unit/analyze/core/test_qdq.py b/tests/unit/analyze/core/test_qdq.py
index eeae70266..cc958d852 100644
--- a/tests/unit/analyze/core/test_qdq.py
+++ b/tests/unit/analyze/core/test_qdq.py
@@ -1172,24 +1172,25 @@ def test_qdq_total_count(self, op_name: str, expected_count: int) -> None:
         assert count == expected_count, "If changes, either bug or need to rerun"
 
 
-class TestIterCOMSOpsModels:
-    """Tests for com.microsoft domain ops that use explicit type enumeration (no QDQ wrapping)."""
+class TestIterMSQDQCombinations:
+    """Tests for com.microsoft domain ops."""
 
     @pytest.mark.parametrize(
         "op_name,expected_count",
         [
-            # 2 INT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp  = 32
-            # 2 UINT4 gather_axes x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 32
-            # 1 UINT8 gather_axis x 2 block_sizes x 2 T2 x 2 Tind x 2 zp = 16
-            ("GatherBlockQuantized", 80),
+            # Only T2=FLOAT combos produce QDQ output models (T2=FLOAT16 fails Q input type check).
+            # 40 FLOAT base combos x 4 activation output types (INT8/UINT8/INT16/UINT16) = 160
+            ("GatherBlockQuantized", 160),
         ],
     )
-    def test_com_microsoft_op_model_count(self, op_name: str, expected_count: int) -> None:
-        """Test total model count for com.microsoft ops (no QDQ wrapping)."""
+    def test_com_microsoft_op_qdq_model_count(self, op_name: str, expected_count: int) -> None:
+        """Test QDQ model count for com.microsoft ops."""
         from winml.modelkit.pattern.op_input_gen import get_runtime_checker_op
+        from winml.modelkit.pattern.op_input_gen.qdq_gen import QDQGenerator
 
         schema = ONNXDomain.COM_MICROSOFT.get_op_schema(op_name, 1)
-        generator = get_runtime_checker_op(op_name)(schema)  # no qdq_generator
+        qdq_gen = QDQGenerator(opset_version=1, domain=ONNXDomain.COM_MICROSOFT)
+        generator = get_runtime_checker_op(op_name)(schema, qdq_generator=qdq_gen)
 
         count = 0
         for kwargs, tags in generator.iter():

From 2f5b5bfd45696c8d191b2fbb2fdd070fbabcc7e8 Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Wed, 15 Apr 2026 10:34:19 +0800
Subject: [PATCH 6/7] update comment

---
 tests/unit/analyze/core/test_qdq.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/unit/analyze/core/test_qdq.py b/tests/unit/analyze/core/test_qdq.py
index cc958d852..ccc332886 100644
--- a/tests/unit/analyze/core/test_qdq.py
+++ b/tests/unit/analyze/core/test_qdq.py
@@ -1179,7 +1179,12 @@ class TestIterMSQDQCombinations:
         "op_name,expected_count",
         [
             # Only T2=FLOAT combos produce QDQ output models (T2=FLOAT16 fails Q input type check).
-            # 40 FLOAT base combos x 4 activation output types (INT8/UINT8/INT16/UINT16) = 160
+            # FLOAT base combos:
+            #   2 INT4  gather_axes x 2 block_sizes x 2 Tind x 2 zp = 16
+            #   2 UINT4 gather_axes x 2 block_sizes x 2 Tind x 2 zp = 16
+            #   1 UINT8 gather_axis x 2 block_sizes x 2 Tind x 2 zp =  8
+            #                                                     total = 40
+            # x 4 activation output types (INT8/UINT8/INT16/UINT16) = 160
             ("GatherBlockQuantized", 160),
         ],
     )

From 6033edc64a3458d36e56a38121a23f604f61219c Mon Sep 17 00:00:00 2001
From: hualxie <hualxie@microsoft.com>
Date: Wed, 15 Apr 2026 11:03:08 +0800
Subject: [PATCH 7/7] exclude bits attribute from table matching

---
 .../pattern/op_input_gen/indexing_input_generator.py         | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
index 89335e62a..ba87ea522 100644
--- a/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
+++ b/src/winml/modelkit/pattern/op_input_gen/indexing_input_generator.py
@@ -1041,4 +1041,9 @@ def get_infinite_property_names(self) -> list[str]:
             "attr_gather_axis",
             "attr_quantize_axis",
             "attr_block_size",
+            # attr_bits is redundant with T1 type (INT4/UINT4 → 4, UINT8 → 8);
+            # some models omit the bits attribute entirely (attr_bits_is_none=True),
+            # so exclude both from table matching to avoid false gaps.
+            "attr_bits",
+            "attr_bits_is_none",
         ]