turned vtensors-transpose ON by default
esmirno committed Feb 4, 2025
1 parent 757763f commit 1416302
Showing 2 changed files with 58 additions and 73 deletions.
@@ -72,7 +72,7 @@ DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
-DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
+DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, true, npuw::llm::optimize_v_tensors, CompileTime);

namespace npuw {
namespace llm {
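With this commit the optimization defaults to on. For anyone who needs the previous behavior, a minimal sketch of overriding it at compile time follows; the property key string and the "NO" spelling are assumptions inferred from the option name above, while the ov::Core calls are standard OpenVINO API:

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");
    // Assumption: the option is exposed under the "NPUW_LLM_OPTIMIZE_V_TENSORS"
    // key and accepts "NO" to disable; both are inferred, not confirmed here.
    auto compiled = core.compile_model(model, "NPU",
                                       {{"NPUW_LLM_OPTIMIZE_V_TENSORS", "NO"}});
    return 0;
}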
129 changes: 57 additions & 72 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -27,6 +27,47 @@ class TransposeValueTensors : public ov::pass::MatcherPass {
std::vector<std::shared_ptr<ov::opset13::Parameter>> old_params;
using Ref = std::reference_wrapper<Context>;
};

+protected:
+    // Generic part of the matchers: transpose the V-tensor, rebuild the Concat
+    // on the transposed axis, and update the MatMul arguments.
+    void transpose_matmul_b(Context::Ref ctx,
+                            std::shared_ptr<ov::Node> node_param,
+                            std::shared_ptr<ov::Node> node_concat,
+                            std::shared_ptr<ov::Node> node_transpose,
+                            std::shared_ptr<ov::Node> node_matmul) {
+        auto matched_param = std::static_pointer_cast<ov::op::v0::Parameter>(node_param);
+        auto matched_concat = std::static_pointer_cast<ov::op::v0::Concat>(node_concat);
+        auto matched_transpose = std::static_pointer_cast<ov::op::v1::Transpose>(node_transpose);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(node_matmul);
+
+        auto param_shape = matched_param->get_partial_shape();
+        OPENVINO_ASSERT(param_shape.size() == 4u);
+        // NB: Transpose the Parameter that corresponds to the V-tensor;
+        // this speeds up its multiplication with the attention scores
+        std::swap(param_shape[2], param_shape[3]);
+        auto new_param = std::make_shared<ov::opset13::Parameter>(matched_param->get_element_type(), param_shape);
+        new_param->set_friendly_name(matched_param->get_friendly_name());
+        new_param->outputs().begin()->get_tensor().set_names(
+            matched_param->outputs().begin()->get_tensor().get_names());
+        ov::replace_node(matched_param, new_param);
+        // NB: Save in order to add/remove to the model later on
+        ctx.get().new_params.push_back(new_param);
+        ctx.get().old_params.push_back(matched_param);
+
+        auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
+        auto new_transpose =
+            std::make_shared<ov::opset13::Transpose>(matched_transpose->input_value(0), order_cst->output(0));
+        new_transpose->set_friendly_name(matched_transpose->get_friendly_name());
+        ov::replace_node(matched_transpose, new_transpose);
+
+        auto new_concat =
+            std::make_shared<ov::opset13::Concat>(ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u);
+        new_concat->set_friendly_name(matched_concat->get_friendly_name());
+        ov::replace_node(matched_concat, new_concat);
+
+        matched_matmul->set_transpose_b(true);
+    }
};
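To see what the new {0, 2, 3, 1} order does, here is a small, self-contained sketch; the shapes are illustrative, not taken from a real model. A V-tensor fed to the Transpose as [batch, seq, heads, head_dim] comes out as [batch, heads, head_dim, seq], which is exactly the layout MatMul consumes with transpose_b = true:

#include <iostream>
#include <openvino/opsets/opset13.hpp>

int main() {
    // Illustrative layout: [batch, seq, heads, head_dim]
    auto v = std::make_shared<ov::opset13::Parameter>(ov::element::f16,
                                                      ov::PartialShape{1, 1024, 32, 128});
    auto order = ov::opset13::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
    auto t = std::make_shared<ov::opset13::Transpose>(v, order);
    t->validate_and_infer_types();
    // Prints [1,32,128,1024]: heads first, head_dim ahead of seq
    std::cout << t->get_output_partial_shape(0) << std::endl;
    return 0;
}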

// llama2, phi3, etc
@@ -48,42 +89,12 @@ class TransposeValueTensors_llama2 : public TransposeValueTensors {
auto callback = [=](ov::pass::pattern::Matcher& m) {
auto& node_to_output = m.get_pattern_value_map();

            auto matched_node_param = node_to_output.at(param).get_node_shared_ptr();
            auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr();
            auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr();
            auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr();

-            auto matched_param = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_param);
-            auto matched_concat = std::static_pointer_cast<ov::op::v0::Concat>(matched_node_concat);
-            auto matched_transpose = std::static_pointer_cast<ov::op::v1::Transpose>(matched_node_transpose);
-            auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
-
-            auto shape = matched_param->get_partial_shape();
-            OPENVINO_ASSERT(shape.size() == 4u);
-            // NB: Transpose Parameter that correspond to V-tensor it will
-            // speed-up its multiplication with attention scores
-            std::swap(shape[2], shape[3]);
-            auto new_param = std::make_shared<ov::opset13::Parameter>(matched_param->get_element_type(), shape);
-            new_param->set_friendly_name(matched_param->get_friendly_name());
-            new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names());
-            ov::replace_node(matched_param, new_param);
-            // NB: Save in order to add/remove to the model later on
-            ctx.get().new_params.push_back(new_param);
-            ctx.get().old_params.push_back(matched_param);
-
-            auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
-            auto new_transpose = std::make_shared<ov::opset13::Transpose>(matched_transpose->input_value(0),
-                                                                          order_cst->output(0));
-            new_transpose->set_friendly_name(matched_transpose->get_friendly_name());
-            ov::replace_node(matched_transpose, new_transpose);
-
-            auto new_concat = std::make_shared<ov::opset13::Concat>(
-                ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u);
-            new_concat->set_friendly_name(matched_concat->get_friendly_name());
-            ov::replace_node(matched_concat, new_concat);
-
-            matched_matmul->set_transpose_b(true);
+            transpose_matmul_b(ctx, matched_node_param, matched_node_concat, matched_node_transpose, matched_node_matmul);
return true;
};
register_matcher(std::make_shared<opp::Matcher>(matmul, "TransposeValueTensors_llama2"), std::move(callback));
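For reference, a matcher pass like this is typically driven through ov::pass::GraphRewrite. A hedged sketch of the call site follows; the pass constructor taking Context::Ref is inferred from the register_matcher callback above, and `model` is a hypothetical ov::Model:

// Sketch: collect both model-specific matchers under one rewrite and run it.
TransposeValueTensors::Context ctx;
ov::pass::GraphRewrite rewr;
rewr.add_matcher<TransposeValueTensors_llama2>(std::ref(ctx));
rewr.add_matcher<TransposeValueTensors_llama3>(std::ref(ctx));
rewr.run_on_model(model);
// ctx.new_params / ctx.old_params now tell the caller which Parameters
// to add to / remove from the model.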
@@ -126,14 +137,14 @@ class TransposeValueTensors_llama3 : public TransposeValueTensors {
auto matched_node_broadcast = node_to_output.at(broadcast).get_node_shared_ptr();
auto matched_node_reshape = node_to_output.at(reshape).get_node_shared_ptr();


            auto matched_param = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_param);
            auto matched_concat = std::static_pointer_cast<ov::op::v0::Concat>(matched_node_concat);
            auto matched_transpose = std::static_pointer_cast<ov::op::v1::Transpose>(matched_node_transpose);
            auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);

            auto matched_unsqueeze = std::static_pointer_cast<ov::op::v0::Unsqueeze>(matched_node_unsqueeze);
            auto matched_broadcast = std::static_pointer_cast<ov::op::v3::Broadcast>(matched_node_broadcast);
            auto matched_reshape = std::static_pointer_cast<ov::op::v1::Reshape>(matched_node_reshape);

auto shape_broadcast = matched_broadcast->get_output_shape(0);
OPENVINO_ASSERT(shape_broadcast.size() == 5u);
@@ -155,36 +166,11 @@ class TransposeValueTensors_llama3 : public TransposeValueTensors {
reshape_axes_node->set_friendly_name(matched_reshape->get_friendly_name() + "/new_reshape_shape");
matched_reshape->input(1).replace_source_output(reshape_axes_node);

-            auto param_shape = matched_param->get_partial_shape();
-            OPENVINO_ASSERT(param_shape.size() == 4u);
-            // NB: Transpose Parameter that correspond to V-tensor it will
-            // speed-up its multiplication with attention scores
-            std::swap(param_shape[2], param_shape[3]);
-            auto new_param = std::make_shared<ov::opset13::Parameter>(matched_param->get_element_type(), param_shape);
-            new_param->set_friendly_name(matched_param->get_friendly_name());
-            new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names());
-            ov::replace_node(matched_param, new_param);
-            // NB: Save in order to add/remove to the model later on
-            ctx.get().new_params.push_back(new_param);
-            ctx.get().old_params.push_back(matched_param);
-
-            auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
-            auto new_transpose = std::make_shared<ov::opset13::Transpose>(matched_transpose->input_value(0),
-                                                                          order_cst->output(0));
-            new_transpose->set_friendly_name(matched_transpose->get_friendly_name());
-            ov::replace_node(matched_transpose, new_transpose);
-
-            auto new_concat = std::make_shared<ov::opset13::Concat>(
-                ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u);
-            new_concat->set_friendly_name(matched_concat->get_friendly_name());
-            ov::replace_node(matched_concat, new_concat);
-
-            //------ update output dims
-            matched_unsqueeze->validate_and_infer_types();
-            matched_broadcast->validate_and_infer_types();
-            matched_reshape->validate_and_infer_types();
-
-            matched_matmul->set_transpose_b(true);
+            transpose_matmul_b(ctx,
+                               matched_node_param,
+                               matched_node_concat,
+                               matched_node_transpose,
+                               matched_node_matmul);

return true;
};
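The surviving llama3-specific lines above handle the GQA Broadcast/Reshape chain. A worked example with illustrative sizes shows the bookkeeping; the sizes (8 KV-heads repeated 4x, 1024-token cache, head_dim 128) are hypothetical, and the swap of the last two dims mirrors the 4-D parameter swap shown earlier rather than being quoted from the truncated hunk:

#include <cassert>
#include <utility>
#include <vector>

int main() {
    // Broadcast target before the fix: [B, kv_heads, rep, seq, head_dim]
    std::vector<size_t> shape_broadcast{1, 8, 4, 1024, 128};
    assert(shape_broadcast.size() == 5u);
    // Swap the last two dims to follow the transposed V layout
    std::swap(shape_broadcast[3], shape_broadcast[4]);  // -> {1, 8, 4, 128, 1024}
    // Reshape then folds kv_heads * rep into a single heads axis:
    std::vector<size_t> shape_reshape{1, 8 * 4, 128, 1024};  // [B, heads, head_dim, seq]
    return 0;
}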
@@ -599,7 +585,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
LOG_DEBUG("6.Check and apply opt layout if applicable.");

const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
-    // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
if (optimize_v_tensors) {
if (optimize_value_tensors(kvcache_model)) {
// NB: Check if TransposeValueTensors transformation was applied
