@@ -47,16 +47,24 @@ ov::OutputVector matmulnbits(const ov::frontend::onnx::Node& node) {
const uint64_t n_blocks_per_col = (K + block_size - 1) / block_size;
const auto blob_size = (block_size * bits + 7) / 8;

const uint64_t expected_b_size = N * n_blocks_per_col * blob_size;
const auto& b_shape = b_quantized.get_partial_shape();
uint64_t actual_b_size = 1;
for (const auto& d : b_shape) {
actual_b_size *= d.get_length();
}

CHECK_VALID_NODE(node, n_blocks_per_col > 0, "Wrong blocks count: ", n_blocks_per_col);
CHECK_VALID_NODE(node, blob_size > 0, "Wrong blob size: ", blob_size);
// in documentation: ...Input B is a 2D constant Matrix.
CHECK_VALID_NODE(node,
ov::as_type<v0::Constant>(b_quantized.get_node()) != nullptr,
"MatMulNBits limitation: accepting only a constant as a B input");
CHECK_VALID_NODE(node,
b_quantized.get_partial_shape().rank() == 3,
"Expected rank of quantized weights is 3 [N][n_blocks_per_col][blob_size], got: ",
b_quantized.get_partial_shape().rank());
CHECK_VALID_NODE(
node,
b_shape.is_static() && actual_b_size == expected_b_size,
"Expected input B shape is static and compatible with shape [N][n_blocks_per_col][blob_size], got: ",
b_shape);
CHECK_VALID_NODE(node,
a.get_element_type() == ov::element::f16 || a.get_element_type() == ov::element::f32 ||
a.get_element_type() == ov::element::dynamic,
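
For context, the new size check boils down to simple integer arithmetic. Below is a minimal standalone sketch of that arithmetic (not part of the PR code), using the attribute values from the test model added further down (K = 17, N = 3, bits = 4, block_size = 16):

#include <cstdint>
#include <iostream>

int main() {
    // Attribute values taken from the matmulnbits_3x17_solid_b test model below.
    const uint64_t K = 17, N = 3, bits = 4, block_size = 16;

    const uint64_t n_blocks_per_col = (K + block_size - 1) / block_size;  // ceil(17 / 16) = 2
    const uint64_t blob_size = (block_size * bits + 7) / 8;               // 16 * 4 bits = 8 bytes
    const uint64_t expected_b_size = N * n_blocks_per_col * blob_size;    // 3 * 2 * 8 = 48

    // Matches the 48-element uint8 initializer "b_Q4" in the test model.
    std::cout << expected_b_size << "\n";
    return 0;
}
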
@@ -148,14 +156,44 @@ ov::OutputVector matmulnbits(const ov::frontend::onnx::Node& node) {

if (!zero_points.get_node_shared_ptr()) {
zero_points = default_zp;
} else {
} else if (zero_points.get_element_type() == ov::element::u8) {
// https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.MatMulNBits
// according to the link, zero points are:
// Constrain quantized zero point types to uint8/int32/float16/float.
// Input zero_points is stored as uint8_t or same as type(A). It has the same packing method as input B
zero_points =
op::util::reshape(zero_points,
ov::Shape{static_cast<size_t>(N), static_cast<size_t>(n_blocks_per_col), 1});
CHECK_VALID_NODE(node,
ov::as_type<v0::Constant>(zero_points.get_node()) != nullptr,
"MatMulNBits limitation: accepting only a constant as a zero_points");
const auto zp_const = ov::as_type_ptr<v0::Constant>(zero_points.get_node_shared_ptr());
ov::element::Type zp_type = ov::element::dynamic;
switch (bits) {
case 2:
zp_type = ov::element::u2;
break;
case 4:
zp_type = ov::element::u4;
break;
case 8:
zp_type = ov::element::u8;
break;
default:
FRONT_END_THROW("Unsupported bits count");
break;
}
zero_points = std::make_shared<v0::Constant>(
zp_type,
ov::Shape{static_cast<size_t>(N), static_cast<size_t>(n_blocks_per_col), static_cast<size_t>(1)},
zp_const->get_data_ptr());
} else if (zero_points.get_element_type() == a.get_element_type()) {
const auto& zp_shape = zero_points.get_partial_shape();
CHECK_VALID_NODE(
node,
zp_shape.is_static() &&
zp_shape == Shape({static_cast<size_t>(N), static_cast<size_t>(n_blocks_per_col)}),
"Expected input Zero Point shape is static and equal to shape [N][n_blocks_per_col], got: ",
zp_shape);
} else {
FRONT_END_THROW("Unexpected zero point type");
}

// Possible issue with slice implementation, had to move conversion before slice, instead of slicing uint4
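
For background, the scales and (optional) zero points feed the usual block-wise dequantization, value = (q - zero_point) * scale, as described in the com.microsoft.MatMulNBits documentation linked above. A minimal sketch of that per-block step, assuming 4-bit weights and low-nibble-first packing (the helper name and packing order are illustrative assumptions, not the PR's code):

#include <cstdint>
#include <vector>

// Dequantize one 4-bit block of input B: value = (q - zero_point) * scale.
// Nibble order (low nibble first) is an assumption for illustration.
std::vector<float> dequantize_block_u4(const std::vector<uint8_t>& packed_blob,  // blob_size bytes
                                       float scale,
                                       uint8_t zero_point,
                                       size_t block_size) {
    std::vector<float> out;
    out.reserve(block_size);
    for (size_t i = 0; i < block_size; ++i) {
        const uint8_t byte = packed_blob[i / 2];
        const uint8_t q = (i % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
        out.push_back((static_cast<float>(q) - static_cast<float>(zero_point)) * scale);
    }
    return out;
}
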
@@ -0,0 +1,97 @@
ir_version: 3
producer_name: "OpenVINO ONNX Frontend"
producer_version: ""
model_version: 0
graph {
name: "test_matmul_2d"
node {
input: "a"
input: "b_Q4"
input: "b_scales"
input: "zp"
output: "c"
op_type: "MatMulNBits"
attribute {
name: "K"
i: 17
type: INT
}
attribute {
name: "N"
i: 3
type: INT
}
attribute {
name: "accuracy_level"
i: 4
type: INT
}
attribute {
name: "bits"
i: 4
type: INT
}
attribute {
name: "block_size"
i: 16
type: INT
}
domain: "com.microsoft"
}
initializer {
dims: 48
data_type: 2
name: "b_Q4"
raw_data: "G\2025`\024G\2025\200\000\000\000\000\000\000\000Fq$X\003Fq$\210\000\000\000\000\000\000\0005`\024G\2025`\024\200\000\000\000\000\000\000\000"
}
initializer {
dims: 6
data_type: 1
name: "b_scales"
raw_data: "\000\000\220\277\000\000\220\277\000\000\220\277\000\000\000\200\000\000\220\277\000\000\000\276"
}
initializer {
dims: 6
data_type: 2
name: "zp"
raw_data: "\x00\x40\x08\x00\x00\x00"
}
input {
name: "a"
type {
tensor_type {
elem_type: 1
shape {
dim {
dim_value: 3
}
dim {
dim_value: 17
}
}
}
}
}
output {
name: "c"
type {
tensor_type {
elem_type: 1
shape {
dim {
dim_value: 3
}
dim {
dim_value: 3
}
}
}
}
}
}
opset_import {
version: 7
}
opset_import {
version: 1
}
21 changes: 21 additions & 0 deletions src/frontends/onnx/tests/onnx_import_com_microsoft.in.cpp
@@ -1331,6 +1331,27 @@ OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17) {
test_case.run();
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_matmulnbits_3x17_solid_b) {
const auto model = convert_model("com.microsoft/matmulnbits_3x17_solid_b.onnx");
auto test_case = ov::test::TestCase(model, s_device);

test_case.add_input<float>({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1});

test_case.add_expected_output<float>(Shape{3, 3},
{-322.52954f,
-312.34253f,
345.20667f,
-381.87994f,
-343.7008f,
472.23425f,
-509.08466f,
-420.32483f,
532.11615f});

test_case.run_with_tolerance_as_fp(1.f);
}

OPENVINO_TEST(${BACKEND_NAME}, onnx_com_microsoft_quickgelu) {
const auto model = convert_model("com.microsoft/quick_gelu.onnx");
auto test_case = ov::test::TestCase(model, s_device);