diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 3a97b94925a2c0..c5932ed690d670 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -14,14 +14,6 @@ namespace ov { namespace test { namespace snippets { -class SKIP_TokenizeMHASnippetsTests : public TokenizeMHASnippetsTests { -public: - void SetUp() override { - GTEST_SKIP(); - } - void TearDown() override{}; -}; - void TokenizeMHASnippetsTests::run() { ASSERT_TRUE(model); manager.register_pass(); @@ -103,8 +95,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose) { run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_with_MatMul0_Transpose_Dynamic) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_MatMul0_Transpose_Dynamic) { const auto &f = MHAMatMul0TransposeFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), false); @@ -113,8 +104,7 @@ TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_with_M run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-114607 */, smoke_Snippets_MHA_with_int_Matmuls) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_with_int_Matmuls) { const auto &f = MHAINT8MatMulTypeRelaxedFunction(std::vector{{1, 128, 12, 64}, {1, 128, 12, 64}, {1, 12, 128, 128}, {1, 128, 12, 64}}); model = f.getOriginal(); model_ref = f.getReference(); @@ -128,8 +118,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction) { run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dynamic_Transpose_extraction) { const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true); model = f.getOriginal(); model_ref = f.getReference(); @@ -144,8 +133,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Transpose_extraction_and_uns run(); } -TEST_F(SKIP_TokenizeMHASnippetsTests /* CVS-142098 */, smoke_Snippets_MHA_Dynamic_Transpose_extraction_and_unsupported_existing_transpose) { - GTEST_SKIP(); +TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_Dynamic_Transpose_extraction_and_unsupported_existing_transpose) { const auto& f = MHATransposedInputFunction(std::vector{{-1, -1, -1, -1}, {-1, -1, -1, -1}, {-1, -1, -1, -1}}, true, std::vector{0, 3, 1, 2}); model = f.getOriginal(); diff --git a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp index c47cb754b5b891..1dbf8d7d22ed26 100644 --- a/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp +++ b/src/tests/ov_helpers/ov_snippets_models/src/subgraph_mha.cpp @@ -393,8 +393,8 @@ std::shared_ptr MHASelectFunction::initOriginal() const { // Value is equal to '1' - to avoid situation e^(-1000) / (sum(e^(-1000)) = 0/0 = NAN auto selectConst = ov::op::v0::Constant::create(precisions[2], ov::Shape{1}, std::vector{1}); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); @@ -531,8 +531,8 @@ std::shared_ptr MHAWOTransposeOnInputsFunction::initOriginal() const auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape({4}), std::vector{0, 2, 1, 3}); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto mulConst = ov::test::utils::make_constant(precision, ov::Shape({1})); const auto mul = std::make_shared(param1, mulConst); const auto matMul0 = std::make_shared(param0, mul, transA, transB); @@ -550,8 +550,8 @@ std::shared_ptr MHAWOTransposeFunction::initOriginal() const { auto param2 = std::make_shared(precisions[2], input_shapes[2]); ov::ParameterVector ngraphParam = {param0, param1, param2}; - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto matMul0 = std::make_shared(param0, param1, transA, transB); const auto softmax = std::make_shared(matMul0, -1); const auto matMul1 = std::make_shared(softmax, param2, transA, transB); @@ -615,8 +615,8 @@ std::shared_ptr MHAFQAfterMatMulFunction::initOriginal() const { static_cast(input_shapes[0].get_shape()[1])}; auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); @@ -665,8 +665,8 @@ std::shared_ptr MHAINT8MatMulFunction::initOriginal() const { {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); auto fq2 = ov::test::utils::make_fake_quantize(transpose2Param, ov::element::f32, 256, {1}, {-35.0172004}, {34.7436294}, {-35.0172004}, {34.7436294}); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto transpose1 = std::make_shared(fq1, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB); @@ -756,8 +756,8 @@ std::shared_ptr MHAFQFunction::initOriginal() const { const auto fq_add = ov::test::utils::make_fake_quantize(addParam, ov::element::f32, 256, {1}, {-1000}, {0}, {-1000}, {0}); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto transpose1 = std::make_shared(fq1, transpose1Const); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); @@ -806,12 +806,12 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initOriginal() cons auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); const auto fq_signed_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {-36912.66015625}, {36624.28125}, {-128}, {127}, ov::element::i8); - const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose0Param, ov::element::i8, fq_signed_params); - const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose1Param, ov::element::i8, fq_signed_params); - const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose2Param, ov::element::i8, fq_signed_params); + const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose0Param, ov::element::f32, fq_signed_params); + const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose1Param, ov::element::f32, fq_signed_params); + const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(transpose2Param, ov::element::f32, fq_signed_params); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(fq0, transpose0Const); const auto transpose1 = std::make_shared(fq1, transpose1Const); const auto matMul0 = std::make_shared>( @@ -820,7 +820,7 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initOriginal() cons ov::op::TemporaryReplaceOutputType(transpose0, element::f32).get(), ov::op::TemporaryReplaceOutputType(transpose1, element::f32).get(), transA, transB); - const auto fq3 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(matMul0, ov::element::i8, fq_signed_params); + const auto fq3 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(matMul0, ov::element::f32, fq_signed_params); const auto add = std::make_shared>( std::vector{ element::f32, element::f32 }, std::vector{ element::f32 }, @@ -833,12 +833,12 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initOriginal() cons ov::op::TemporaryReplaceOutputType(add, element::f32).get(), ov::op::TemporaryReplaceOutputType(deq, element::f32).get()); - const auto reshape0 = std::make_shared(add, reshape0Const, true); + const auto reshape0 = std::make_shared(deq_mul, reshape0Const, true); const auto softMax = std::make_shared(reshape0, 1); const auto reshape1 = std::make_shared(softMax, reshape1Const, true); const auto fq_unsigned_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {0}, {0.245}, {0}, {255}, ov::element::u8); - const auto fq4 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(reshape1, ov::element::u8, fq_unsigned_params); + const auto fq4 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(reshape1, ov::element::f32, fq_unsigned_params); const auto transpose2 = std::make_shared(fq2, transpose2Const); const auto matMul1 = std::make_shared>( @@ -846,7 +846,7 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initOriginal() cons std::vector{ element::f32 }, ov::op::TemporaryReplaceOutputType(fq4, element::f32).get(), ov::op::TemporaryReplaceOutputType(transpose2, element::f32).get(), transA, transB); - const auto fq5 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(matMul1, ov::element::i8, fq_signed_params); + const auto fq5 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(matMul1, ov::element::f32, fq_signed_params); const auto transpose3 = std::make_shared(fq5, transpose3Const); ov::ResultVector results{std::make_shared(transpose3)}; @@ -860,9 +860,9 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con ov::ParameterVector ngraphParams = {data0, data1, data2, data3}; const auto fq_signed_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {-36912.66015625}, {36624.28125}, {-128}, {127}, ov::element::i8); - const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data0, ov::element::i8, fq_signed_params); - const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data1, ov::element::i8, fq_signed_params); - const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data3, ov::element::i8, fq_signed_params); + const auto fq0 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data0, ov::element::f32, fq_signed_params); + const auto fq1 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data1, ov::element::f32, fq_signed_params); + const auto fq2 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(data3, ov::element::f32, fq_signed_params); NodeVector subgraph_inputs = {fq0, fq1, data2, fq2}; auto transpose0Param = std::make_shared(precision, input_shapes[0]); @@ -877,19 +877,8 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, {shape_rank}, std::vector{0, 2, 1, 3}); - std::vector reshape0ConstData = {static_cast(input_shapes[0].get_shape()[0] * - input_shapes[0].get_shape()[1] * input_shapes[0].get_shape()[2]), - -1}; - auto reshape0Const = ov::op::v0::Constant::create(ov::element::i64, {reshape0ConstData.size()}, reshape0ConstData); - - std::vector reshape1ConstData = {static_cast(input_shapes[0].get_shape()[0]), - static_cast(input_shapes[0].get_shape()[2]), - static_cast(input_shapes[0].get_shape()[1]), - static_cast(input_shapes[0].get_shape()[1])}; - auto reshape1Const = ov::op::v0::Constant::create(ov::element::i64, {reshape1ConstData.size()}, reshape1ConstData); - - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared>( @@ -898,7 +887,18 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con ov::op::TemporaryReplaceOutputType(transpose0, element::f32).get(), ov::op::TemporaryReplaceOutputType(transpose1, element::f32).get(), transA, transB); - const auto fq3 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(matMul0, ov::element::i8, fq_signed_params); + auto decomposed_fq = + [](const ov::Output& input, const ov::element::Type& out_precision, float il, float ih, float scale) { + const auto input_low = ov::op::v0::Constant::create(ov::element::f32, {1}, {il}); + const auto input_high = ov::op::v0::Constant::create(ov::element::f32, {1}, {ih}); + const auto output_scale = ov::op::v0::Constant::create(ov::element::f32, {1}, {scale}); + const auto max = std::make_shared(input, input_low); + const auto min = std::make_shared(max, input_high); + const auto mul = std::make_shared(min, output_scale); + return std::make_shared(mul, out_precision); + }; + + const auto fq3 = decomposed_fq(matMul0, ov::element::i8, fq_signed_params.inputLowValues[0], fq_signed_params.inputHighValues[0], 0.00346764503f); const auto add = std::make_shared>( std::vector{ element::f32, element::f32 }, std::vector{ element::f32 }, @@ -911,12 +911,8 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con ov::op::TemporaryReplaceOutputType(add, element::f32).get(), ov::op::TemporaryReplaceOutputType(deq, element::f32).get()); - const auto reshape0 = std::make_shared(add, reshape0Const, true); - const auto softMax = std::make_shared(reshape0, 1); - const auto reshape1 = std::make_shared(softMax, reshape1Const, true); - - const auto fq_unsigned_params = ov::builder::subgraph::FakeQuantizeOnData(256, {1}, {0}, {0.245}, {0}, {255}, ov::element::u8); - const auto fq4 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(reshape1, ov::element::u8, fq_unsigned_params); + const auto softMax = std::make_shared(deq_mul, 3); + const auto fq4 = decomposed_fq(softMax, ov::element::u8, 0.f, 0.245f, 1040.81628f); const auto transpose2 = std::make_shared(transpose2Param, transpose2Const); const auto matMul1 = std::make_shared>( @@ -924,7 +920,7 @@ std::shared_ptr MHAINT8MatMulTypeRelaxedFunction::initReference() con std::vector{ element::f32 }, ov::op::TemporaryReplaceOutputType(fq4, element::f32).get(), ov::op::TemporaryReplaceOutputType(transpose2, element::f32).get(), transA, transB); - const auto fq5 = ov::builder::subgraph::makeFakeQuantizeTypeRelaxed(matMul1, ov::element::i8, fq_signed_params); + const auto fq5 = decomposed_fq(matMul1, ov::element::i8, fq_signed_params.inputLowValues[0], fq_signed_params.inputHighValues[0], 0.00346764503f); auto subgraph = std::make_shared(subgraph_inputs, std::make_shared(NodeVector{fq5}, subgraph_params)); @@ -946,8 +942,8 @@ std::shared_ptr MHAMulAddFunction::initOriginal() const { auto transpose2Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{input_shapes[2].size()}, std::vector{0, 2, 1, 3}); auto transpose3Const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{input_shapes[2].size()}, std::vector{0, 2, 1, 3}); - float transA = false; - float transB = false; + bool transA = false; + bool transB = false; const auto transpose0 = std::make_shared(transpose0Param, transpose0Const); const auto transpose1 = std::make_shared(transpose1Param, transpose1Const); const auto matMul0 = std::make_shared(transpose0, transpose1, transA, transB);