@@ -47,16 +47,24 @@ ov::OutputVector matmulnbits(const ov::frontend::onnx::Node& node) {
const uint64_t n_blocks_per_col = (K + block_size - 1) / block_size;
const auto blob_size = (block_size * bits + 7) / 8;

const uint64_t expected_b_size = N * n_blocks_per_col * blob_size;
const auto& b_shape = b_quantized.get_partial_shape();
uint64_t actual_b_size = 1;
for (const auto& d : b_shape) {
actual_b_size *= d.get_length();
}

CHECK_VALID_NODE(node, n_blocks_per_col > 0, "Wrong blocks count: ", n_blocks_per_col);
CHECK_VALID_NODE(node, blob_size > 0, "Wrong blob size: ", blob_size);
// in documentation: ...Input B is a 2D constant Matrix.
CHECK_VALID_NODE(node,
ov::as_type<v0::Constant>(b_quantized.get_node()) != nullptr,
"MatMulNBits limitation: accepting only a constant as a B input");
CHECK_VALID_NODE(node,
b_quantized.get_partial_shape().rank() == 3,
"Expected rank of quantized weights is 3 [N][n_blocks_per_col][blob_size], got: ",
b_quantized.get_partial_shape().rank());
CHECK_VALID_NODE(
node,
b_shape.is_static() && actual_b_size == expected_b_size,
"Expected input B shape is static and compatible with shape [N][n_blocks_per_col][blob_size], got: ",
b_shape);
CHECK_VALID_NODE(node,
a.get_element_type() == ov::element::f16 || a.get_element_type() == ov::element::f32 ||
a.get_element_type() == ov::element::dynamic,
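
For context, the new size check boils down to simple integer arithmetic. Below is a minimal standalone sketch of that arithmetic (not part of the PR code), using the attribute values from the test model added further down (K = 17, N = 3, bits = 4, block_size = 16):

#include <cstdint>
#include <iostream>

int main() {
    // Attribute values taken from the matmulnbits_3x17_solid_b test model below.
    const uint64_t K = 17, N = 3, bits = 4, block_size = 16;

    const uint64_t n_blocks_per_col = (K + block_size - 1) / block_size;  // ceil(17 / 16) = 2
    const uint64_t blob_size = (block_size * bits + 7) / 8;               // 16 * 4 bits = 8 bytes
    const uint64_t expected_b_size = N * n_blocks_per_col * blob_size;    // 3 * 2 * 8 = 48

    // Matches the 48-element uint8 initializer "b_Q4" in the test model.
    std::cout << expected_b_size << "\n";
    return 0;
}
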
@@ -148,14 +156,44 @@ ov::OutputVector matmulnbits(const ov::frontend::onnx::Node& node) {

if (!zero_points.get_node_shared_ptr()) {
zero_points = default_zp;
} else {
} else if (zero_points.get_element_type() == ov::element::u8) {
// https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.MatMulNBits
// according to the link, zero points are:
// Constrain quantized zero point types to uint8/int32/float16/float.
// Input zero_points is stored as uint8_t or same as type(A). It has the same packing method as input B
zero_points =
op::util::reshape(zero_points,
ov::Shape{static_cast<size_t>(N), static_cast<size_t>(n_blocks_per_col), 1});
CHECK_VALID_NODE(node,
ov::as_type<v0::Constant>(zero_points.get_node()) != nullptr,
"MatMulNBits limitation: accepting only a constant as a zero_points");
const auto zp_const = ov::as_type_ptr<v0::Constant>(zero_points.get_node_shared_ptr());
ov::element::Type zp_type = ov::element::dynamic;
switch (bits) {
case 2:
zp_type = ov::element::u2;
break;
case 4:
zp_type = ov::element::u4;
break;
case 8:
zp_type = ov::element::u8;
break;
default:
FRONT_END_THROW("Unsupported bits count");
break;
}
zero_points = std::make_shared<v0::Constant>(
zp_type,
ov::Shape{static_cast<size_t>(N), static_cast<size_t>(n_blocks_per_col), static_cast<size_t>(1)},
zp_const->get_data_ptr());
} else if (zero_points.get_element_type() == a.get_element_type()) {
const auto& zp_shape = zero_points.get_partial_shape();
CHECK_VALID_NODE(
node,
zp_shape.is_static() &&
zp_shape == Shape({static_cast<size_t>(N), static_cast<size_t>(n_blocks_per_col)}),
"Expected input Zero Point shape is static and equal to shape [N][n_blocks_per_col], got: ",
zp_shape);
} else {
FRONT_END_THROW("Unexpected zero point type");
}

// Possible issue with slice implementation, had to move conversion before slice, instead of slicing uint4
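
For background, the scales and (optional) zero points feed the usual block-wise dequantization, value = (q - zero_point) * scale, as described in the com.microsoft.MatMulNBits documentation linked above. A minimal sketch of that per-block step, assuming 4-bit weights and low-nibble-first packing (the helper name and packing order are illustrative assumptions, not the PR's code):

#include <cstdint>
#include <vector>

// Dequantize one 4-bit block of input B: value = (q - zero_point) * scale.
// Nibble order (low nibble first) is an assumption for illustration.
std::vector<float> dequantize_block_u4(const std::vector<uint8_t>& packed_blob,  // blob_size bytes
                                       float scale,
                                       uint8_t zero_point,
                                       size_t block_size) {
    std::vector<float> out;
    out.reserve(block_size);
    for (size_t i = 0; i < block_size; ++i) {
        const uint8_t byte = packed_blob[i / 2];
        const uint8_t q = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
        out.push_back((static_cast<float>(q) - static_cast<float>(zero_point)) * scale);
    }
    return out;
}
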
@@ -0,0 +1,97 @@
ir_version: 3
producer_name: "OpenVINO ONNX Frontend"
producer_version: ""
model_version: 0
graph {
name: "test_matmul_2d"
node {
input: "a"
input: "b_Q4"
input: "b_scales"
input: "zp"
output: "c"
op_type: "MatMulNBits"
attribute {
name: "K"
i: 17
type: INT
}
attribute {
name: "N"
i: 3
type: INT
}
attribute {
name: "accuracy_level"
i: 4
type: INT
}
attribute {
name: "bits"
i: 4
type: INT
}
attribute {
name: "block_size"
i: 16
type: INT
}
domain: "com.microsoft"
}
initializer {
dims: 48
data_type: 2
name: "b_Q4"
raw_data: "G\2025`\024G\2025\200\000\000\000\000\000\000\000Fq$X\003Fq$\210\000\000\000\000\000\000\0005`\024G\2025`\024\200\000\000\000\000\000\000\000"
}
initializer {
dims: 6
data_type: 1
name: "b_scales"
raw_data: "\000\000\220\277\000\000\220\277\000\000\220\277\000\000\000\200\000\000\220\277\000\000\000\276"
}
initializer {
dims: 6
data_type: 2
name: "zp"
raw_data: "\x00\x40\x08\x00\x00\x00"
}
input {
name: "a"
type {
tensor_type {
elem_type: 1
shape {
dim {
dim_value: 3
}
dim {
dim_value: 17
}
}
}
}
}
output {
name: "c"
type {
tensor_type {
elem_type: 1
shape {
dim {
dim_value: 3
}
dim {
dim_value: 3
}
}
}
}
}
}
opset_import {
version: 7
}
opset_import {
version: 1
}
21 changes: 21 additions & 0 deletions src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1331,6 +1331,27 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17) {
test_case.run();
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17_solid_b) {
const auto model = convert_model("com.microsoft/matmulnbits_3x17_solid_b.onnx");
auto test_case = ov::test::TestCase(model, s_device);

test_case.add_input<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1});

test_case.add_expected_output<float>(Shape{3, 3},
{-322.52954f,
-312.34253f,
345.20667f,
-381.87994f,
-343.7008f,
472.23425f,
-509.08466f,
-420.32483f,
532.11615f});

test_case.run_with_tolerance_as_fp(1.f);
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_quickgelu) {
const auto model = convert_model("com.microsoft/quick_gelu.onnx");
auto test_case = ov::test::TestCase(model, s_device);