[GPU] merging multiple scalar multiply layers connected to horizontal fused fc (#28466)

### Details:
- Merge multiple scalar Multiply layers connected to the horizontally fused FC and fuse them into the fused FC (see the sketch before the diff below).

### Tickets:
- 160900
e-ddykim authored Feb 4, 2025
1 parent 26aa072 commit 7e83770
Showing 2 changed files with 157 additions and 0 deletions.
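Before the diff itself, a minimal sketch of the rewrite described above, built only from core OpenVINO ops; the shapes, the split sizes, the 8.0 scale value, and the helper names build_before/build_after are illustrative assumptions that mirror the new test below, not part of the commit. The "before" graph splits the fused FC output and multiplies each branch by the same scalar; the "after" graph applies a single Multiply ahead of the split, which can then be fused into the FC.

// Illustrative sketch; shapes, split sizes, and the 8.0 scale are assumptions mirroring the test below.
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/variadic_split.hpp"

// "Before" pattern: the fused FC output is split first, and every branch is scaled by the same scalar.
std::shared_ptr<ov::Model> build_before() {
    auto fc_out = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 1664});
    auto axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2});
    auto lengths = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{1024, 512, 128});
    auto split = std::make_shared<ov::op::v1::VariadicSplit>(fc_out, axis, lengths);
    ov::OutputVector branches;
    for (auto&& out : split->outputs()) {
        auto scalar = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{}, std::vector<float>{8.f});
        branches.push_back(std::make_shared<ov::op::v1::Multiply>(out, scalar));
    }
    return std::make_shared<ov::Model>(branches, ov::ParameterVector{fc_out});
}

// "After" pattern: one scalar Multiply on the fused FC output, then the split; the Multiply can be
// fused into the FC itself afterwards.
std::shared_ptr<ov::Model> build_after() {
    auto fc_out = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 1664});
    auto scalar = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{}, std::vector<float>{8.f});
    auto merged_mul = std::make_shared<ov::op::v1::Multiply>(fc_out, scalar);
    auto axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2});
    auto lengths = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, std::vector<int64_t>{1024, 512, 128});
    auto split = std::make_shared<ov::op::v1::VariadicSplit>(merged_mul, axis, lengths);
    return std::make_shared<ov::Model>(split->outputs(), ov::ParameterVector{fc_out});
}

Since every branch uses the same scalar, splitting after a single Multiply is numerically identical to multiplying each split output, which is exactly the condition the new code verifies before merging.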
@@ -299,6 +299,70 @@ FullyConnectedHorizontalFusion::FullyConnectedHorizontalFusion(bool fuse_mlp_swi
            }
            org_fc->clear_control_dependencies();
        }

        // Merge scalar multiply layers into one when all scalar constants have the same value.
        //
        //    FusedFC                     FusedFC
        //       |                           |
        // VariadicSplit      ==>         new_Mul (to be fused with FusedFC)
        //   /   |   \                       |
        // Mul  Mul  Mul               VariadicSplit
        //  |    |    |                 |    |    |
        const auto is_scalar_const = [](const ov::Output<ov::Node>& output) -> bool {
            if (!ov::is_type<ov::op::v0::Constant>(output.get_node()))
                return false;
            const auto shape = output.get_partial_shape();
            if (shape.is_dynamic())
                return false;
            return ov::shape_size(shape.to_shape()) == 1;
        };

        // Every output of the VariadicSplit must feed exactly one Multiply whose other input is a scalar constant.
        std::vector<float> const_values;
        bool can_be_merged = true;
        std::shared_ptr<ov::op::v0::Constant> const_node = nullptr;
        for (auto& output : output_split->outputs()) {
            if (output.get_target_inputs().size() != 1) {
                can_be_merged = false;
                break;
            }
            auto target_node = output.get_target_inputs().begin()->get_node();
            if (!ov::is_type<ov::op::v1::Multiply>(target_node)) {
                can_be_merged = false;
                break;
            }

            for (auto& input : target_node->inputs()) {
                if (input.get_source_output() != output) {
                    if (is_scalar_const(input.get_source_output())) {
                        const_node = ov::as_type_ptr<ov::op::v0::Constant>(
                            input.get_source_output().get_node_shared_ptr());
                        const_values.emplace_back(const_node->cast_vector<float>()[0]);
                    } else {
                        can_be_merged = false;
                        break;
                    }
                }
            }
        }

        // All branches must be scaled by the same value.
        if (const_values.size() != split_size ||
            !std::equal(const_values.begin() + 1, const_values.end(), const_values.begin())) {
            can_be_merged = false;
        }

        if (can_be_merged) {
            // Multiply the fused FC output once, feed the split from it, and bypass the per-branch Multiplys.
            auto new_mul = std::make_shared<ov::op::v1::Multiply>(new_fc, const_node);
            new_mul->set_friendly_name(new_fc->get_friendly_name() + "_mul");
            ov::NodeVector fused_mul_nodes;
            output_split->input(0).replace_source_output(new_mul);
            for (auto& output : output_split->outputs()) {
                auto target_node = output.get_target_inputs().begin()->get_node();
                fused_mul_nodes.push_back(target_node->shared_from_this());
                ov::replace_output_update_name(target_node->output(0), output);
            }
            ov::copy_runtime_info(fused_mul_nodes, new_mul);
        }

        GPU_DEBUG_TRACE_DETAIL << "Created a new fused FC " << new_fc_name << std::endl;
        return true;
    };
@@ -18,6 +18,7 @@
#include "openvino/op/variadic_split.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/pass/manager.hpp"

#include <transformations/utils/utils.hpp>
@@ -242,6 +243,98 @@ TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp) {
        comparator.enable(FunctionsComparator::ATTRIBUTES);
    }
}

TEST_F(TransformationTestsF, FullyConnectedHorizontalFusion_eltwise_bias_zp_scaling) {
    std::vector<int64_t> pattern = {7, -1};
    {
        // Actual model: three compressed FCs sharing one input, each followed by an eltwise bias Add
        // and a scalar Multiply with the same constant (8.0).
        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
        auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
        weight1->set_friendly_name("weight1_1");
        auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
        weight2->set_friendly_name("weight1_2");
        auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
        weight3->set_friendly_name("weight1_3");

        auto bias1 = std::make_shared<ov::intel_gpu::op::Placeholder>();
        auto bias2 = std::make_shared<ov::intel_gpu::op::Placeholder>();
        auto bias3 = std::make_shared<ov::intel_gpu::op::Placeholder>();

        auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
        auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
        auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
        auto fc1 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight1, bias1, scale1);
        fc1->set_friendly_name("fc1");
        auto fc2 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight2, bias2, scale2);
        auto fc3 = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight3, bias3, scale3);

        auto add_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
        auto add1 = std::make_shared<ov::op::v1::Add>(fc1, add_input1);

        auto add_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
        auto add2 = std::make_shared<ov::op::v1::Add>(fc2, add_input2);

        auto add_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
        auto add3 = std::make_shared<ov::op::v1::Add>(fc3, add_input3);

        const std::vector<float> scale_factor = {8.f};
        auto mul_input1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
        auto mul1 = std::make_shared<ov::op::v1::Multiply>(add1, mul_input1);

        auto mul_input2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
        auto mul2 = std::make_shared<ov::op::v1::Multiply>(add2, mul_input2);

        auto mul_input3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
        auto mul3 = std::make_shared<ov::op::v1::Multiply>(add3, mul_input3);

        auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(mul1, reshape_pattern, true);
        auto reshape2 = std::make_shared<ov::op::v1::Reshape>(mul2, reshape_pattern, true);
        auto reshape3 = std::make_shared<ov::op::v1::Reshape>(mul3, reshape_pattern, true);
        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
        auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
        auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
        model = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
        manager.register_pass<FullyConnectedHorizontalFusion>();
    }
    {
        // Reference model: one fused FC, a single scalar Multiply, then VariadicSplit back into the three branches.
        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{-1, 7, 4096});
        auto weight1 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{1024, 4096});
        weight1->set_friendly_name("weight2_1");
        auto weight2 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{512, 4096});
        weight2->set_friendly_name("weight2_2");
        auto weight3 = std::make_shared<ov::op::v0::Constant>(ov::element::u4, ov::Shape{128, 4096});
        weight3->set_friendly_name("weight2_3");
        auto weights = ov::OutputVector{weight1, weight2, weight3};
        auto weight_fused = std::make_shared<ov::op::v0::Concat>(weights, 0);
        auto bias1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 1024});
        auto bias2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 512});
        auto bias3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1, 128});
        auto biases = ov::OutputVector{bias1, bias2, bias3};
        auto bias_fused = std::make_shared<ov::op::v0::Concat>(biases, 1);
        auto scale1 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{1024, 32});
        auto scale2 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{512, 32});
        auto scale3 = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{128, 32});
        auto scales = ov::OutputVector{scale1, scale2, scale3};
        auto scale_fused = std::make_shared<ov::op::v0::Concat>(scales, 0);
        auto fc_fused = std::make_shared<ov::intel_gpu::op::FullyConnectedCompressed>(input, weight_fused, bias_fused, scale_fused);
        const std::vector<float> scale_factor = {8.f};
        auto mul_input = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, scale_factor);
        auto mul = std::make_shared<ov::op::v1::Multiply>(fc_fused, mul_input);
        auto axis_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {fc_fused->get_output_partial_shape(0).size() - 1});
        std::vector<int64_t> orig_n_sizes = {1024, 512, 128};
        auto split_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{3}, orig_n_sizes);
        auto split = std::make_shared<ov::op::v1::VariadicSplit>(mul, axis_const, split_const);
        auto reshape_pattern = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{2}, pattern);
        auto reshape1 = std::make_shared<ov::op::v1::Reshape>(split->output(0), reshape_pattern, true);
        auto reshape2 = std::make_shared<ov::op::v1::Reshape>(split->output(1), reshape_pattern, true);
        auto reshape3 = std::make_shared<ov::op::v1::Reshape>(split->output(2), reshape_pattern, true);
        auto result1 = std::make_shared<ov::op::v0::Result>(reshape1);
        auto result2 = std::make_shared<ov::op::v0::Result>(reshape2);
        auto result3 = std::make_shared<ov::op::v0::Result>(reshape3);
        model_ref = std::make_shared<ov::Model>(ov::ResultVector{result1, result2, result3}, ov::ParameterVector{input});
        comparator.enable(FunctionsComparator::ATTRIBUTES);
    }
}
} // namespace intel_gpu
} // namespace test
} // namespace ov
