@@ -37,11 +37,12 @@ class FakeQuantizeDecompositionTest : public TransformationTestsF {
};

TEST_F(FakeQuantizeDecompositionTest, smoke_Snippets_PerTensorFakeQuantizeDecomposition) {
+ auto onesShape = ov::Shape{1, 1, 1, 1};
model = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
- {1, 3, 16, 16}, element::f32, {{}, {}, {}, {}}, 1.f);
+ {1, 3, 16, 16}, element::f32, {onesShape, onesShape, onesShape, onesShape}, 1.f);

model_ref = FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
- {1, 3, 16, 16}, element::f32, {{}, {}, {}, {}}, 1.f);
+ {1, 3, 16, 16}, element::f32, {onesShape, onesShape, onesShape, onesShape}, 1.f);

register_passes();
}
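For context, the operation being decomposed here is FakeQuantize-1, which per the OpenVINO opset specification maps an input x with L quantization levels, input range [il, ih], and output range [ol, oh] as

$$y = \operatorname{round}\!\left(\frac{\operatorname{clamp}(x, il, ih) - il}{ih - il}\,(L - 1)\right) \cdot \frac{oh - ol}{L - 1} + ol.$$

The Snippets pass lowers this into the elementwise Maximum/Minimum/Multiply/Subtract/Round/Multiply/Add chain built further down in this PR; the test now gives the per-tensor FakeQuantize constants explicit {1, 1, 1, 1} shapes rather than rank-0 {} shapes, matching the rank-aligned constants the shared helper creates.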
@@ -36,6 +36,13 @@ class FakeQuantizeFunction {
const element::Type inputType,
const std::vector<ov::Shape>& fakeQuantizeShapes,
const float zeroPoint);

+ static std::shared_ptr<ov::Node> getDecomposedFakeQuantizeOps(
+ const ov::Output<ov::Node>& input,
+ const ov::element::Type outType,
+ float il, float ih, float scale,
+ bool doRounding = false,
+ bool doDequantize = false);
};

} // namespace snippets
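A minimal usage sketch of the new helper, with illustrative thresholds that are not taken from any test here: left at the default doRounding = false and doDequantize = false, it emits only the quantize-side chain Maximum -> Minimum -> Multiply, plus a trailing ConvertSaturation when outType differs from the input's element type.

auto param = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{1, 3, 16, 16});
// Clamp to [-5, 5], scale by 25.5f, then saturate-convert f32 -> i8.
auto quantized = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
    param->output(0), ov::element::i8,
    -5.f, 5.f, 25.5f,  // il, ih, scale: illustrative values
    false, false);     // no Round, no dequantize Subtract/Multiply/Add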
@@ -189,6 +189,59 @@ std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithFakeQuantize(
return function;
}

+ std::shared_ptr<ov::Node> FakeQuantizeFunction::getDecomposedFakeQuantizeOps(const ov::Output<ov::Node>& input,
+ const ov::element::Type outType,
+ float il, float ih, float scale,
+ bool doRounding,
+ bool doDequantize) {
+ auto inputShape = input.get_shape();
+ auto inputType = input.get_element_type();
+ auto rank = inputShape.size();
+ ov::Shape onesShape(rank, 1);
+
+ const auto input_low = ov::op::v0::Constant::create(ov::element::f32, onesShape, {il});
+ const auto input_high = ov::op::v0::Constant::create(ov::element::f32, onesShape, {ih});
+ const auto output_scale = ov::op::v0::Constant::create(ov::element::f32, onesShape, {scale});
+
+ std::shared_ptr<ov::Node> current = std::make_shared<ov::opset1::Maximum>(input, input_low);
+ current->set_friendly_name("inputLow");
+
+ current = std::make_shared<ov::opset1::Minimum>(current, input_high);
+ current->set_friendly_name("inputHigh");
+
+ current = std::make_shared<ov::opset1::Multiply>(current, output_scale);
+ current->set_friendly_name("multiply");
+
+ if (doDequantize) {
+ current = std::make_shared<ov::opset1::Subtract>(current, output_scale);
+ current->set_friendly_name("subtract");
+ }
+
+ if (doRounding) {
+ current = std::make_shared<ov::op::v5::Round>(current, ov::op::v5::Round::RoundMode::HALF_TO_EVEN);
+ current->set_friendly_name("round");
+ }
+
+ if (doDequantize) {
+ current = std::make_shared<ov::opset1::Multiply>(
+ current,
+ std::make_shared<ov::opset1::Constant>(element::f32, onesShape, std::vector<float>{0.0745098f}));
+ current->set_friendly_name("divide");
+
+ current = std::make_shared<ov::opset1::Add>(
+ current,
+ std::make_shared<ov::opset1::Constant>(element::f32, onesShape, std::vector<float>{1.f}));
+ current->set_friendly_name("add");
+ }
+
+ if (outType != inputType) {
+ current = std::make_shared<ov::snippets::op::ConvertSaturation>(current, outType);
+ current->set_friendly_name("convertSaturation");
+ }
+
+ return current;
+ }
+
std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
const ov::Shape& inputShape,
const element::Type inputType,
@@ -204,40 +257,10 @@ std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
const auto parameter = std::make_shared<ov::opset1::Parameter>(inputType, inputShape);
parameter->set_friendly_name("parameter");

- const auto maximum = std::make_shared<ov::opset1::Maximum>(
- parameter,
- std::make_shared<ov::opset1::Constant>(element::f32, Shape{}, std::vector<float>{1.f}));
- maximum->set_friendly_name("inputLow");
-
- const auto minimum = std::make_shared<ov::opset1::Minimum>(
- maximum,
- std::make_shared<ov::opset1::Constant>(element::f32, Shape{}, std::vector<float>{20.f}));
- minimum->set_friendly_name("inputHigh");
-
- const auto multiply = std::make_shared<ov::opset1::Multiply>(
- minimum,
- std::make_shared<ov::opset1::Constant>(element::f32, Shape{}, std::vector<float>{13.4211f}));
- multiply->set_friendly_name("multiply");
-
- const auto subtract = std::make_shared<ov::opset1::Subtract>(
- multiply,
- std::make_shared<ov::opset1::Constant>(element::f32, Shape{}, std::vector<float>{13.4211f}));
- subtract->set_friendly_name("subtract");
-
- const auto round = std::make_shared<ov::op::v5::Round>(subtract, ov::op::v5::Round::RoundMode::HALF_TO_EVEN);
- round->set_friendly_name("round");
-
- const auto devide = std::make_shared<ov::opset1::Multiply>(
- round,
- std::make_shared<ov::opset1::Constant>(element::f32, Shape{}, std::vector<float>{0.0745098f}));
- devide->set_friendly_name("devide");
-
- const auto add = std::make_shared<ov::opset1::Add>(
- devide,
- std::make_shared<ov::opset1::Constant>(element::f32, Shape{}, std::vector<float>{1.f}));
- add->set_friendly_name("add");
+ auto decomposed_fq_op_result = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
+ parameter->output(0), ov::element::f32, 1.f, 20.f, 13.4211f, true, true);

- const auto result = std::make_shared<ov::opset1::Result>(add);
+ const auto result = std::make_shared<ov::opset1::Result>(decomposed_fq_op_result);
result->set_friendly_name("result");

return std::make_shared<ov::Model>(
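A quick check on the magic numbers above, assuming the il = 1, ih = 20, levels = 256 configuration these subgraphs use: the quantization scale and the dequantize-stage inverse work out to

$$\text{scale} = \frac{L - 1}{ih - il} = \frac{255}{19} \approx 13.4211, \qquad \frac{1}{\text{scale}} = \frac{19}{255} \approx 0.0745098,$$

and the Subtract constant 13.4211f equals scale * il while the final Add constant 1.f equals il, so the Multiply/Subtract/Round stage quantizes into [0, 255] and the trailing Multiply/Add stage dequantizes back into the original [1, 20] range.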
src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp (15 additions, 19 deletions)
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "fake_quantize_helper.hpp"
#include "subgraph_mha.hpp"

#include "common_test_utils/data_utils.hpp"
@@ -810,7 +811,8 @@ std::shared_ptr<ov::Model> MHAINT8MatMulTypeRelaxedFunction::initOriginal() const {
static_cast<int64_t>(input_shapes[0].get_shape()[1])};
auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData);

- const auto fq_signed_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {-36912.66015625}, {36624.28125}, {-128}, {127}, ov::element::i8);
+ const auto fq_signed_params = ov::builder::subgraph::FakeQuantizeOnData(
+ 256, {1, 1, 1, 1}, {-36912.66015625}, {36624.28125}, {-128}, {127}, ov::element::i8);
const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose0Param, ov::element::f32, fq_signed_params);
const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose1Param, ov::element::f32, fq_signed_params);
const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose2Param, ov::element::f32, fq_signed_params);
@@ -831,7 +833,8 @@ std::shared_ptr<ov::Model> MHAINT8MatMulTypeRelaxedFunction::initOriginal() const {
std::vector<element::Type>{ element::f32 },
ov::op::TemporaryReplaceOutputType(fq3, element::f32).get(),
ov::op::TemporaryReplaceOutputType(addParam, element::f32).get());
- const auto deq = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, std::vector<float>{0.1122});
+ const auto deq = std::make_shared<ov::op::v0::Constant>(
+ ov::element::f32, ov::Shape{1, 1, 1, 1}, std::vector<float>{0.1122});
const auto deq_mul = std::make_shared<op::TypeRelaxed<ov::op::v1::Multiply>>(
std::vector<element::Type>{ element::f32, element::f32 },
std::vector<element::Type>{ element::f32 },
@@ -842,7 +845,8 @@ std::shared_ptr<ov::Model> MHAINT8MatMulTypeRelaxedFunction::initOriginal() const {
const auto softMax = std::make_shared<ov::opset1::Softmax>(reshape0, 1);
const auto reshape1 = std::make_shared<ov::opset1::Reshape>(softMax, reshape1Const, true);

- const auto fq_unsigned_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {0}, {0.245}, {0}, {255}, ov::element::u8);
+ const auto fq_unsigned_params = ov::builder::subgraph::FakeQuantizeOnData(
+ 256, {1, 1, 1, 1}, {0}, {0.245}, {0}, {255}, ov::element::u8);
const auto fq4 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(reshape1, ov::element::f32, fq_unsigned_params);

const auto transpose2 = std::make_shared<ov::op::v1::Transpose>(fq2, transpose2Const);
@@ -864,7 +868,8 @@ std::shared_ptr<ov::Model> MHAINT8MatMulTypeRelaxedFunction::initReference() const {
auto data3 = std::make_shared<ov::opset1::Parameter>(precision, input_shapes[3]);
ov::ParameterVector ngraphParams = {data0, data1, data2, data3};

- const auto fq_signed_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {-36912.66015625}, {36624.28125}, {-128}, {127}, ov::element::i8);
+ const auto fq_signed_params = ov::builder::subgraph::FakeQuantizeOnData(
+ 256, {1, 1, 1, 1}, {-36912.66015625}, {36624.28125}, {-128}, {127}, ov::element::i8);
const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data0, ov::element::f32, fq_signed_params);
const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data1, ov::element::f32, fq_signed_params);
const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data3, ov::element::f32, fq_signed_params);
@@ -896,40 +901,31 @@ std::shared_ptr<ov::Model> MHAINT8MatMulTypeRelaxedFunction::initReference() const {
ov::op::TemporaryReplaceOutputType(transpose0, element::f32).get(),
ov::op::TemporaryReplaceOutputType(brgemm1Param, element::f32).get(), transA, transB);

- auto decomposed_fq =
- [](const ov::Output<ov::Node>& input, const ov::element::Type& out_precision, float il, float ih, float scale) {
- const auto input_low = ov::op::v0::Constant::create(ov::element::f32, {1}, {il});
- const auto input_high = ov::op::v0::Constant::create(ov::element::f32, {1}, {ih});
- const auto output_scale = ov::op::v0::Constant::create(ov::element::f32, {1}, {scale});
- const auto max = std::make_shared<ov::op::v1::Maximum>(input, input_low);
- const auto min = std::make_shared<ov::op::v1::Minimum>(max, input_high);
- const auto mul = std::make_shared<ov::op::v1::Multiply>(min, output_scale);
- return std::make_shared<ov::snippets::op::ConvertSaturation>(mul, out_precision);
- };
-
- const auto fq3 = decomposed_fq(matMul0, ov::element::i8, fq_signed_params.inputLowValues[0], fq_signed_params.inputHighValues[0], 0.00346764503f);
+ const auto fq3 = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
+ matMul0, ov::element::i8, fq_signed_params.inputLowValues[0], fq_signed_params.inputHighValues[0], 0.00346764503f);
const auto add = std::make_shared<op::TypeRelaxed<ov::op::v1::Add>>(
std::vector<element::Type>{ element::f32, element::f32 },
std::vector<element::Type>{ element::f32 },
ov::op::TemporaryReplaceOutputType(fq3, element::f32).get(),
ov::op::TemporaryReplaceOutputType(addParam, element::f32).get());
- const auto deq = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1}, std::vector<float>{0.1122});
+ const auto deq = std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape{1, 1, 1, 1}, std::vector<float>{0.1122});
const auto deq_mul = std::make_shared<op::TypeRelaxed<ov::op::v1::Multiply>>(
std::vector<element::Type>{ element::f32, element::f32 },
std::vector<element::Type>{ element::f32 },
ov::op::TemporaryReplaceOutputType(add, element::f32).get(),
ov::op::TemporaryReplaceOutputType(deq, element::f32).get());

const auto softMax = std::make_shared<ov::opset1::Softmax>(deq_mul, 3);
- const auto fq4 = decomposed_fq(softMax, ov::element::u8, 0.f, 0.245f, 1040.81628f);
+ const auto fq4 = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(softMax, ov::element::u8, 0.f, 0.245f, 1040.81628f);

const auto transpose2 = std::make_shared<ov::op::v1::Transpose>(transpose2Param, transpose2Const);
const auto matMul1 = std::make_shared<op::TypeRelaxed<op::v0::MatMul>>(
std::vector<element::Type>{ element::f32, element::f32 },
std::vector<element::Type>{ element::f32 },
ov::op::TemporaryReplaceOutputType(fq4, element::f32).get(),
ov::op::TemporaryReplaceOutputType(transpose2, element::f32).get(), transA, transB);
- const auto fq5 = decomposed_fq(matMul1, ov::element::i8, fq_signed_params.inputLowValues[0], fq_signed_params.inputHighValues[0], 0.00346764503f);
+ const auto fq5 = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
+ matMul1, ov::element::i8, fq_signed_params.inputLowValues[0], fq_signed_params.inputHighValues[0], 0.00346764503f);

auto subgraph =
std::make_shared<ov::snippets::op::Subgraph>(subgraph_inputs,
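The local decomposed_fq lambda removed above and the shared helper are drop-in compatible at these call sites because the MHA code leaves doRounding and doDequantize at their false defaults; both build the Maximum -> Minimum -> Multiply -> ConvertSaturation chain. A sketch of the correspondence (il, ih, scale stand for the per-call values):

// Old: decomposed_fq(matMul0, ov::element::i8, il, ih, scale);
// New, with the trailing defaults spelled out:
const auto fq = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
    matMul0, ov::element::i8, il, ih, scale, /*doRounding=*/false, /*doDequantize=*/false);
// One behavioral difference: the helper creates its clamp/scale constants
// with a rank-matched {1, 1, 1, 1} shape instead of the lambda's {1} shape.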
@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "fake_quantize_helper.hpp"
#include "subgraph_mlp_seq.hpp"

#include "snippets/op/subgraph.hpp"
@@ -158,17 +159,6 @@ std::shared_ptr<ov::Model> MLPSeqQuantizedTypeRelaxedFunction::initReference() const {
256, {1, 1}, {0.0f}, {2.55f}, {0.f}, {255.f}, ov::element::u8
};

- auto decomposed_fq = [](const ov::Output<ov::Node>& input,
- const ov::element::Type& out_precision,
- float il, float ih, float scale) -> std::shared_ptr<ov::Node> {
- auto input_low = ov::op::v0::Constant::create(input.get_element_type(), {1, 1}, {il});
- auto input_high = ov::op::v0::Constant::create(input.get_element_type(), {1, 1}, {ih});
- auto output_scale = ov::op::v0::Constant::create(input.get_element_type(), {1, 1}, {scale});
- auto max_node = std::make_shared<ov::op::v1::Maximum>(input, input_low);
- auto min_node = std::make_shared<ov::op::v1::Minimum>(max_node, input_high);
- return std::make_shared<ov::op::v1::Multiply>(min_node, output_scale);
- };
-
std::shared_ptr<ov::Node> current = sub_A;
current = std::make_shared<ov::snippets::op::ConvertSaturation>(current, ov::element::f32);

@@ -192,12 +182,8 @@ std::shared_ptr<ov::Model> MLPSeqQuantizedTypeRelaxedFunction::initReference() const {
subgraph_params.push_back(B);
subgraph_nodes.push_back(B_const_trans);

- current = decomposed_fq(current,
- ov::element::u8,
- onData.inputLowValues[0],
- onData.inputHighValues[0],
- 0.00346764503f);
- current = std::make_shared<ov::snippets::op::ConvertSaturation>(current, ov::element::u8);
+ current = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
+ current, ov::element::u8, onData.inputLowValues[0], onData.inputHighValues[0], 0.00346764503f);

current = std::make_shared<op::TypeRelaxed<ov::op::v0::MatMul>>(
std::vector<ov::element::Type>{ov::element::f32, ov::element::f32},
@@ -231,11 +217,13 @@ std::shared_ptr<ov::Model> MLPSeqQuantizedTypeRelaxedFunction::initReference() const {
}
mlp_layer(static_cast<unsigned long>(input_shapes[0][1].get_length()), hidden_matmul_size, false);

- current = decomposed_fq(current, ov::element::f32, onData.inputLowValues[0], onData.inputHighValues[0], 0.00346764503f);
+ current = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
+ current, ov::element::f32, onData.inputLowValues[0], onData.inputHighValues[0], 0.00346764503f);
current = std::make_shared<ov::op::v1::Subtract>(current, ov::op::v0::Constant::create(ov::element::f32, {1, 1}, {0}));
current = std::make_shared<ov::op::v5::Round>(current, ov::op::v5::Round::RoundMode::HALF_TO_EVEN);
current = std::make_shared<ov::op::v8::Softmax>(current, 1);
- current = decomposed_fq(current, ov::element::f32, onData.inputLowValues[0], onData.inputHighValues[0], 0.00346764503f);
+ current = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
+ current, ov::element::f32, onData.inputLowValues[0], onData.inputHighValues[0], 0.00346764503f);
current = std::make_shared<ov::op::v1::Subtract>(current, ov::op::v0::Constant::create(ov::element::f32, {1, 1}, {0}));
current = std::make_shared<ov::op::v5::Round>(current, ov::op::v5::Round::RoundMode::HALF_TO_EVEN);
auto result_subgraph = std::make_shared<ov::op::v0::Result>(current);
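One subtlety in the MLP changes above: the removed lambda returned the Multiply directly, so the u8 call site had to append its own ConvertSaturation, while the shared helper adds that conversion itself whenever outType differs from the input's element type. The f32 call sites (outType == inputType == f32) get no conversion from the helper, matching the old lambda's behavior. In sketch form (values as in the diff):

// Before: clamp/scale lambda, then a manual saturation to u8.
//   current = decomposed_fq(current, ov::element::u8, il, ih, scale);
//   current = std::make_shared<ov::snippets::op::ConvertSaturation>(current, ov::element::u8);
// After: one call; ConvertSaturation is appended because u8 != f32.
current = FakeQuantizeFunction::getDecomposedFakeQuantizeOps(
    current, ov::element::u8, onData.inputLowValues[0], onData.inputHighValues[0], 0.00346764503f);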