turned vtensors-transpose ON by default
esmirno committed Feb 4, 2025
1 parent 757763f commit 1416302
Showing 2 changed files with 58 additions and 73 deletions.
@@ -72,7 +72,7 @@ DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
-DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
+DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, true, npuw::llm::optimize_v_tensors, CompileTime);

namespace npuw {
namespace llm {
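With this commit the optimization defaults to on. For anyone who needs the previous behavior, a minimal sketch of overriding it at compile time follows; the property key string and the "NO" spelling are assumptions inferred from the option name above, while the ov::Core calls are standard OpenVINO API:

#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");
    // Assumption: the option is exposed under the "NPUW_LLM_OPTIMIZE_V_TENSORS"
    // key and accepts "NO" to disable; both are inferred, not confirmed here.
    auto compiled = core.compile_model(model, "NPU",
                                       {{"NPUW_LLM_OPTIMIZE_V_TENSORS", "NO"}});
    return 0;
}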
129 changes: 57 additions & 72 deletions src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -27,6 +27,47 @@ class TransposeValueTensors : public ov::pass::MatcherPass {
std::vector<std::shared_ptr<ov::opset13::Parameter>> old_params;
using Ref = std::reference_wrapper<Context>;
};

+protected:
+    // Generic part of the matchers: transpose the V-tensor, rebuild the Concat
+    // on the transposed axis, and update the MatMul arguments.
+    void transpose_matmul_b(Context::Ref ctx,
+                            std::shared_ptr<ov::Node> node_param,
+                            std::shared_ptr<ov::Node> node_concat,
+                            std::shared_ptr<ov::Node> node_transpose,
+                            std::shared_ptr<ov::Node> node_matmul) {
+        auto matched_param = std::static_pointer_cast<ov::op::v0::Parameter>(node_param);
+        auto matched_concat = std::static_pointer_cast<ov::op::v0::Concat>(node_concat);
+        auto matched_transpose = std::static_pointer_cast<ov::op::v1::Transpose>(node_transpose);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(node_matmul);
+
+        auto param_shape = matched_param->get_partial_shape();
+        OPENVINO_ASSERT(param_shape.size() == 4u);
+        // NB: Transpose the Parameter that corresponds to the V-tensor;
+        // this speeds up its multiplication with the attention scores
+        std::swap(param_shape[2], param_shape[3]);
+        auto new_param = std::make_shared<ov::opset13::Parameter>(matched_param->get_element_type(), param_shape);
+        new_param->set_friendly_name(matched_param->get_friendly_name());
+        new_param->outputs().begin()->get_tensor().set_names(
+            matched_param->outputs().begin()->get_tensor().get_names());
+        ov::replace_node(matched_param, new_param);
+        // NB: Save in order to add/remove to the model later on
+        ctx.get().new_params.push_back(new_param);
+        ctx.get().old_params.push_back(matched_param);
+
+        auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
+        auto new_transpose =
+            std::make_shared<ov::opset13::Transpose>(matched_transpose->input_value(0), order_cst->output(0));
+        new_transpose->set_friendly_name(matched_transpose->get_friendly_name());
+        ov::replace_node(matched_transpose, new_transpose);
+
+        auto new_concat =
+            std::make_shared<ov::opset13::Concat>(ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u);
+        new_concat->set_friendly_name(matched_concat->get_friendly_name());
+        ov::replace_node(matched_concat, new_concat);
+
+        matched_matmul->set_transpose_b(true);
+    }
};
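To see what the new {0, 2, 3, 1} order does, here is a small, self-contained sketch; the shapes are illustrative, not taken from a real model. A V-tensor fed to the Transpose as [batch, seq, heads, head_dim] comes out as [batch, heads, head_dim, seq], which is exactly the layout MatMul consumes with transpose_b = true:

#include <iostream>
#include <openvino/opsets/opset13.hpp>

int main() {
    // Illustrative layout: [batch, seq, heads, head_dim]
    auto v = std::make_shared<ov::opset13::Parameter>(ov::element::f16,
                                                      ov::PartialShape{1, 1024, 32, 128});
    auto order = ov::opset13::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
    auto t = std::make_shared<ov::opset13::Transpose>(v, order);
    t->validate_and_infer_types();
    // Prints [1,32,128,1024]: heads first, head_dim ahead of seq
    std::cout << t->get_output_partial_shape(0) << std::endl;
    return 0;
}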

// llama2, phi3, etc
@@ -48,42 +89,12 @@ class TransposeValueTensors_llama2 : public TransposeValueTensors {
auto callback = [=](ov::pass::pattern::Matcher& m) {
auto& node_to_output = m.get_pattern_value_map();

            auto matched_node_param = node_to_output.at(param).get_node_shared_ptr();
            auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr();
            auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr();
            auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr();

-            auto matched_param = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_param);
-            auto matched_concat = std::static_pointer_cast<ov::op::v0::Concat>(matched_node_concat);
-            auto matched_transpose = std::static_pointer_cast<ov::op::v1::Transpose>(matched_node_transpose);
-            auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
-
-            auto shape = matched_param->get_partial_shape();
-            OPENVINO_ASSERT(shape.size() == 4u);
-            // NB: Transpose Parameter that correspond to V-tensor it will
-            // speed-up its multiplication with attention scores
-            std::swap(shape[2], shape[3]);
-            auto new_param = std::make_shared<ov::opset13::Parameter>(matched_param->get_element_type(), shape);
-            new_param->set_friendly_name(matched_param->get_friendly_name());
-            new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names());
-            ov::replace_node(matched_param, new_param);
-            // NB: Save in order to add/remove to the model later on
-            ctx.get().new_params.push_back(new_param);
-            ctx.get().old_params.push_back(matched_param);
-
-            auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
-            auto new_transpose = std::make_shared<ov::opset13::Transpose>(matched_transpose->input_value(0),
-                                                                          order_cst->output(0));
-            new_transpose->set_friendly_name(matched_transpose->get_friendly_name());
-            ov::replace_node(matched_transpose, new_transpose);
-
-            auto new_concat = std::make_shared<ov::opset13::Concat>(
-                ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u);
-            new_concat->set_friendly_name(matched_concat->get_friendly_name());
-            ov::replace_node(matched_concat, new_concat);
-
-            matched_matmul->set_transpose_b(true);
+            transpose_matmul_b(ctx, matched_node_param, matched_node_concat, matched_node_transpose, matched_node_matmul);
return true;
};
register_matcher(std::make_shared<opp::Matcher>(matmul, "TransposeValueTensors_llama2"), std::move(callback));
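For reference, a matcher pass like this is typically driven through ov::pass::GraphRewrite. A hedged sketch of the call site follows; the pass constructor taking Context::Ref is inferred from the register_matcher callback above, and `model` is a hypothetical ov::Model:

// Sketch: collect both model-specific matchers under one rewrite and run it.
TransposeValueTensors::Context ctx;
ov::pass::GraphRewrite rewr;
rewr.add_matcher<TransposeValueTensors_llama2>(std::ref(ctx));
rewr.add_matcher<TransposeValueTensors_llama3>(std::ref(ctx));
rewr.run_on_model(model);
// ctx.new_params / ctx.old_params now tell the caller which Parameters
// to add to / remove from the model.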
@@ -126,14 +137,14 @@ class TransposeValueTensors_llama3 : public TransposeValueTensors {
auto matched_node_broadcast = node_to_output.at(broadcast).get_node_shared_ptr();
auto matched_node_reshape = node_to_output.at(reshape).get_node_shared_ptr();


            auto matched_param = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_param);
            auto matched_concat = std::static_pointer_cast<ov::op::v0::Concat>(matched_node_concat);
            auto matched_transpose = std::static_pointer_cast<ov::op::v1::Transpose>(matched_node_transpose);
            auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);

            auto matched_unsqueeze = std::static_pointer_cast<ov::op::v0::Unsqueeze>(matched_node_unsqueeze);
            auto matched_broadcast = std::static_pointer_cast<ov::op::v3::Broadcast>(matched_node_broadcast);
            auto matched_reshape = std::static_pointer_cast<ov::op::v1::Reshape>(matched_node_reshape);

auto shape_broadcast = matched_broadcast->get_output_shape(0);
OPENVINO_ASSERT(shape_broadcast.size() == 5u);
@@ -155,36 +166,11 @@ class TransposeValueTensors_llama3 : public TransposeValueTensors {
reshape_axes_node->set_friendly_name(matched_reshape->get_friendly_name() + "/new_reshape_shape");
matched_reshape->input(1).replace_source_output(reshape_axes_node);

-            auto param_shape = matched_param->get_partial_shape();
-            OPENVINO_ASSERT(param_shape.size() == 4u);
-            // NB: Transpose Parameter that correspond to V-tensor it will
-            // speed-up its multiplication with attention scores
-            std::swap(param_shape[2], param_shape[3]);
-            auto new_param = std::make_shared<ov::opset13::Parameter>(matched_param->get_element_type(), param_shape);
-            new_param->set_friendly_name(matched_param->get_friendly_name());
-            new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names());
-            ov::replace_node(matched_param, new_param);
-            // NB: Save in order to add/remove to the model later on
-            ctx.get().new_params.push_back(new_param);
-            ctx.get().old_params.push_back(matched_param);
-
-            auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1});
-            auto new_transpose = std::make_shared<ov::opset13::Transpose>(matched_transpose->input_value(0),
-                                                                          order_cst->output(0));
-            new_transpose->set_friendly_name(matched_transpose->get_friendly_name());
-            ov::replace_node(matched_transpose, new_transpose);
-
-            auto new_concat = std::make_shared<ov::opset13::Concat>(
-                ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u);
-            new_concat->set_friendly_name(matched_concat->get_friendly_name());
-            ov::replace_node(matched_concat, new_concat);
-
-            //------ update output dims
-            matched_unsqueeze->validate_and_infer_types();
-            matched_broadcast->validate_and_infer_types();
-            matched_reshape->validate_and_infer_types();
-
-            matched_matmul->set_transpose_b(true);
+            transpose_matmul_b(ctx,
+                               matched_node_param,
+                               matched_node_concat,
+                               matched_node_transpose,
+                               matched_node_matmul);

return true;
};
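The surviving llama3-specific lines above handle the GQA Broadcast/Reshape chain. A worked example with illustrative sizes shows the bookkeeping; the sizes (8 KV-heads repeated 4x, 1024-token cache, head_dim 128) are hypothetical, and the swap of the last two dims mirrors the 4-D parameter swap shown earlier rather than being quoted from the truncated hunk:

#include <cassert>
#include <utility>
#include <vector>

int main() {
    // Broadcast target before the fix: [B, kv_heads, rep, seq, head_dim]
    std::vector<size_t> shape_broadcast{1, 8, 4, 1024, 128};
    assert(shape_broadcast.size() == 5u);
    // Swap the last two dims to follow the transposed V layout
    std::swap(shape_broadcast[3], shape_broadcast[4]);  // -> {1, 8, 4, 128, 1024}
    // Reshape then folds kv_heads * rep into a single heads axis:
    std::vector<size_t> shape_reshape{1, 8 * 4, 128, 1024};  // [B, heads, head_dim, seq]
    return 0;
}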
@@ -599,7 +585,6 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
LOG_DEBUG("6.Check and apply opt layout if applicable.");

const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
-    // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
if (optimize_v_tensors) {
if (optimize_value_tensors(kvcache_model)) {
// NB: Check if TransposeValueTensors transformation was applied
