diff --git a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp index e2090a4d2b5eb8..8568c334682548 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/fc_horizontal_fusion.cpp @@ -299,6 +299,76 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi } org_fc->clear_control_dependencies(); } + + // Merge scalar multiply layers into one when all scalar constants have the same value. + // + // FusedFC FusedFC + // | | + // VariadicSplit ==> new_Mul (to be fused with FusedFC) + // / | \ | + // Mul Mul Mul VariadicSplit + // | | | | | | + const auto is_scalar_const = [](const ov::Output& output) -> bool { + if (!ov::is_type(output.get_node())) + return false; + const auto shape = output.get_partial_shape(); + if (shape.is_dynamic()) + return false; + return ov::shape_size(shape.to_shape()) == 1; + }; + + std::vector const_values; + bool can_be_merged = true; + std::shared_ptr const_node = nullptr; + for (auto& output : output_split->outputs()) { + if (output.get_target_inputs().size() != 1) { + can_be_merged = false; + break; + } + auto target_node = output.get_target_inputs().begin()->get_node(); + if (!ov::is_type(target_node)) { + can_be_merged = false; + break; + } + + for (auto& input : target_node->inputs()) { + if (input.get_source_output() != output) { + if (is_scalar_const(input.get_source_output())) { + const_node = ov::as_type_ptr( + input.get_source_output().get_node_shared_ptr()); + const_values.emplace_back(const_node->cast_vector()[0]); + } else { + can_be_merged = false; + break; + } + } + } + } + + // Every split branch must have contributed exactly one scalar constant and all of them must be equal. + // The empty() guard keeps the second-element iterator used by std::equal inside the vector. + if (const_values.empty() || const_values.size() != split_size || + !std::equal(const_values.begin() + 1, const_values.end(), const_values.begin())) { + can_be_merged = false; + } + + if (can_be_merged) { + auto new_mul = std::make_shared(new_fc, const_node); +
new_mul->set_friendly_name(new_fc->get_friendly_name() + "_mul"); + ov::NodeVector fused_mul_nodes; + output_split->input(0).replace_source_output(new_mul); + for (auto& output : output_split->outputs()) { + auto target_node = output.get_target_inputs().begin()->get_node(); + fused_mul_nodes.push_back(target_node->shared_from_this()); + // NOTE(review): replace_output_update_name() returns false when it declines to rewire (believed to happen for some Result-consumer layouts - confirm). + // The split input already goes through new_mul, so bypass the old Multiply unconditionally to avoid scaling this branch twice. + if (!ov::replace_output_update_name(target_node->output(0), output)) { + target_node->output(0).replace(output); + } + } + ov::copy_runtime_info(fused_mul_nodes, new_mul); + } + GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl; return true; }; diff --git a/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp index af7e6482002ae2..15f225ba6aa2ea 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/horizontal_fc_fusion_test.cpp @@ -18,6 +18,7 @@ #include "openvino/op/variadic_split.hpp" #include "openvino/op/reshape.hpp" #include "openvino/op/add.hpp" +#include "openvino/op/multiply.hpp" #include "openvino/pass/manager.hpp" #include @@ -242,6 +243,99 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) { comparator.enable(FunctionsComparator::ATTRIBUTES); } } + +// Three FC branches, each followed by Add and a Multiply by the same scalar, must become one fused FC followed by a single Multiply placed ahead of the VariadicSplit. +TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp_scaling) { + std::vector pattern = {7, -1}; + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{-1, 7, 4096}); + auto weight1 = std::make_shared(ov::element::u4, ov::Shape{1024, 4096}); + weight1->set_friendly_name("weight1_1"); + auto weight2 = std::make_shared(ov::element::u4, ov::Shape{512, 4096}); + weight2->set_friendly_name("weight1_2"); + auto weight3 = std::make_shared(ov::element::u4, ov::Shape{128, 4096}); + weight3->set_friendly_name("weight1_3"); + + auto bias1 = std::make_shared(); + auto bias2 = std::make_shared(); + auto bias3 = std::make_shared(); + + auto scale1 =
std::make_shared(ov::element::f16, ov::Shape{1024, 32}); + auto scale2 = std::make_shared(ov::element::f16, ov::Shape{512, 32}); + auto scale3 = std::make_shared(ov::element::f16, ov::Shape{128, 32}); + auto fc1 = std::make_shared(input, weight1, bias1, scale1); + fc1->set_friendly_name("fc1"); + auto fc2 = std::make_shared(input, weight2, bias2, scale2); + auto fc3 = std::make_shared(input, weight3, bias3, scale3); + + auto add_input1 = std::make_shared(ov::element::f16, ov::Shape{1, 1024}); + auto add1 = std::make_shared(fc1, add_input1); + + auto add_input2 = std::make_shared(ov::element::f16, ov::Shape{1, 512}); + auto add2 = std::make_shared(fc2, add_input2); + + auto add_input3 = std::make_shared(ov::element::f16, ov::Shape{1, 128}); + auto add3 = std::make_shared(fc3, add_input3); + + const std::vector scale_factor = {8.f}; + auto mul_input1 = std::make_shared(ov::element::f16, ov::Shape{}, scale_factor); + auto mul1 = std::make_shared(add1, mul_input1); + + auto mul_input2 = std::make_shared(ov::element::f16, ov::Shape{}, scale_factor); + auto mul2 = std::make_shared(add2, mul_input2); + + auto mul_input3 = std::make_shared(ov::element::f16, ov::Shape{}, scale_factor); + auto mul3 = std::make_shared(add3, mul_input3); + + auto reshape_pattern = std::make_shared(ov::element::i64, ov::Shape{2}, pattern); + auto reshape1 = std::make_shared(mul1, reshape_pattern, true); + auto reshape2 = std::make_shared(mul2, reshape_pattern, true); + auto reshape3 = std::make_shared(mul3, reshape_pattern, true); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(reshape2); + auto result3 = std::make_shared(reshape3); + model = std::make_shared(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input}); + manager.register_pass(); + } + { + auto input = std::make_shared(ov::element::f16, ov::PartialShape{-1, 7, 4096}); + auto weight1 = std::make_shared(ov::element::u4, ov::Shape{1024, 4096}); + weight1->set_friendly_name("weight2_1"); +
auto weight2 = std::make_shared(ov::element::u4, ov::Shape{512, 4096}); + weight2->set_friendly_name("weight2_2"); + auto weight3 = std::make_shared(ov::element::u4, ov::Shape{128, 4096}); + weight3->set_friendly_name("weight2_3"); + auto weights = ov::OutputVector{weight1, weight2, weight3}; + auto weight_fused = std::make_shared(weights, 0); + auto bias1 = std::make_shared(ov::element::f16, ov::Shape{1, 1024}); + auto bias2 = std::make_shared(ov::element::f16, ov::Shape{1, 512}); + auto bias3 = std::make_shared(ov::element::f16, ov::Shape{1, 128}); + auto biases = ov::OutputVector{bias1, bias2, bias3}; + auto bias_fused = std::make_shared(biases, 1); + auto scale1 = std::make_shared(ov::element::f16, ov::Shape{1024, 32}); + auto scale2 = std::make_shared(ov::element::f16, ov::Shape{512, 32}); + auto scale3 = std::make_shared(ov::element::f16, ov::Shape{128, 32}); + auto scales = ov::OutputVector{scale1, scale2, scale3}; + auto scale_fused = std::make_shared(scales, 0); + auto fc_fused = std::make_shared(input, weight_fused, bias_fused, scale_fused); + const std::vector scale_factor = {8.f}; + auto mul_input = std::make_shared(ov::element::f16, ov::Shape{}, scale_factor); + auto mul = std::make_shared(fc_fused, mul_input); + auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1}); + std::vector orig_n_sizes = {1024, 512, 128}; + auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes); + auto split = std::make_shared(mul, axis_const, split_const); + auto reshape_pattern = std::make_shared(ov::element::i64, ov::Shape{2}, pattern); + auto reshape1 = std::make_shared(split->output(0), reshape_pattern, true); + auto reshape2 = std::make_shared(split->output(1), reshape_pattern, true); + auto reshape3 = std::make_shared(split->output(2), reshape_pattern, true); + auto result1 = std::make_shared(reshape1); + auto result2 = std::make_shared(reshape2); + auto
result3 = std::make_shared(reshape3); + model_ref = std::make_shared(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input}); + comparator.enable(FunctionsComparator::ATTRIBUTES); + } +} } // namespace intel_gpu } // namespace test } // namespace ov \ No newline at end of file