From e0df99b2e3bfad37334dbf914ebe7533ac0313dd Mon Sep 17 00:00:00 2001
From: Bogdan Pereanu
Date: Thu, 30 Jan 2025 09:32:40 +0200
Subject: [PATCH] Move command queue ownership from the Pipeline to the graph

Signed-off-by: Bogdan Pereanu
---
 .../src/backend/include/zero_pipeline.hpp     |  5 +-
 .../src/backend/src/zero_pipeline.cpp         | 56 ++++---------------
 .../include/intel_npu/common/igraph.hpp       | 21 +++++--
 .../intel_npu/src/common/src/igraph.cpp       | 19 +++++--
 .../compiler_adapter/include/driver_graph.hpp |  4 ++
 .../compiler_adapter/include/plugin_graph.hpp |  2 +
 .../src/compiler_adapter/src/driver_graph.cpp | 39 +++++++++++--
 .../intel_npu/utils/zero/zero_wrappers.hpp    |  2 +
 8 files changed, 83 insertions(+), 65 deletions(-)

diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
index 29069f0a0cf8cc..b8af2a4ad8b32a 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -27,7 +27,7 @@ struct Pipeline {
     Pipeline(const Pipeline&) = delete;
     Pipeline& operator=(const Pipeline&) = delete;
 
-    ~Pipeline();
+    virtual ~Pipeline() = default;
 
     void push();
     void pull();
@@ -66,10 +66,7 @@ struct Pipeline {
     std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
     Logger _logger;
 
-    uint32_t _group_ordinal;
     std::mutex _mutex;
-    bool _turbo = false;
-    ze_command_queue_priority_t _ze_queue_priority;
     std::optional<ze_command_queue_workload_type_t> _ze_workload_type = std::nullopt;
 };
 
diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
index 3cf9b205df2abd..50b04b4a3f0053 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
@@ -32,8 +32,7 @@ Pipeline::Pipeline(const Config& config,
       _id(_graph->get_unique_id()),
       _number_of_command_lists(_graph->get_batch_size().has_value() ? *_graph->get_batch_size() : 1),
       _npu_profiling(npu_profiling),
-      _logger("Pipeline", _config.get<LOG_LEVEL>()),
-      _group_ordinal(group_ordinal) {
+      _logger("Pipeline", _config.get<LOG_LEVEL>()) {
     OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline");
     _logger.debug("Pipeline - initialize started");
 
@@ -64,21 +63,8 @@ Pipeline::Pipeline(const Config& config,
                                                            _init_structs->getMutableCommandListVersion() ? true : false));
     }
 
-    _ze_queue_priority = zeroUtils::toZeQueuePriority(_config.get<MODEL_PRIORITY>());
-
-    if (_config.has<TURBO>()) {
-        _turbo = _config.get<TURBO>();
-    }
-
-    if (config.has<WORKLOAD_TYPE>()) {
-        _ze_workload_type = zeroUtils::toZeQueueWorkloadType(config.get<WORKLOAD_TYPE>());
-    }
-
-    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_init_structs,
-                                                                        _ze_queue_priority,
-                                                                        _graph->get_ze_workload_type(),
-                                                                        _group_ordinal,
-                                                                        _turbo);
+    _ze_workload_type = _graph->get_ze_workload_type();
+    _command_queue = _graph->get_command_queue();
 
     if (_sync_output_with_fences) {
         _fences.resize(_number_of_command_lists);
@@ -91,7 +77,7 @@ Pipeline::Pipeline(const Config& config,
 
     for (size_t i = 0; i < _number_of_command_lists; i++) {
         size_t io_index = 0;
-        for (const auto& desc : graph->get_input_descriptors()) {
+        for (const auto& desc : _graph->get_input_descriptors()) {
             if (input_tensors.at(io_index).size() > 1) {
                 void* data = nullptr;
                 auto remote_tensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(input_tensors.at(io_index).at(i));
@@ -101,7 +87,7 @@ Pipeline::Pipeline(const Config& config,
                     data = remote_tensor->get_original_memory();
                 }
 
-                graph->set_argument_value(desc.idx, data);
+                _graph->set_argument_value(desc.idx, data);
 
                 ++io_index;
                 continue;
@@ -115,7 +101,7 @@ Pipeline::Pipeline(const Config& config,
                 data = remote_tensor->get_original_memory();
             }
 
-            graph->set_argument_value(
+            _graph->set_argument_value(
                 desc.idx,
                 static_cast<unsigned char*>(data) +
                     (i * input_tensors.at(io_index).at(0)->get_byte_size()) / _number_of_command_lists);
@@ -124,7 +110,7 @@ Pipeline::Pipeline(const Config& config,
         }
 
         io_index = 0;
-        for (const auto& desc : graph->get_output_descriptors()) {
+        for (const auto& desc : _graph->get_output_descriptors()) {
             void* data = nullptr;
             auto remote_tensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(output_tensors.at(io_index));
             if (remote_tensor == nullptr) {
@@ -133,7 +119,7 @@ Pipeline::Pipeline(const Config& config,
                 data = remote_tensor->get_original_memory();
             }
 
-            graph->set_argument_value(
+            _graph->set_argument_value(
                 desc.idx,
                 static_cast<unsigned char*>(data) +
                     (i * output_tensors.at(io_index)->get_byte_size()) / _number_of_command_lists);
@@ -152,7 +138,7 @@ Pipeline::Pipeline(const Config& config,
             _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast<uint64_t*>(_npu_profiling->npu_ts_infer_start));
         }
 
-        _command_lists.at(i)->appendGraphExecute(static_cast<ze_graph_handle_t>(graph->get_handle()),
+        _command_lists.at(i)->appendGraphExecute(static_cast<ze_graph_handle_t>(_graph->get_handle()),
                                                  profiling_query.getHandle());
 
         /// append timestamp command if feature was activated
@@ -196,11 +182,7 @@ void Pipeline::getCommandQueue() {
         }
     }
 
-    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_init_structs,
-                                                                        _ze_queue_priority,
-                                                                        _graph->get_ze_workload_type(),
-                                                                        _group_ordinal,
-                                                                        _turbo);
+    _command_queue = _graph->get_command_queue();
 
     if (_sync_output_with_fences) {
         for (size_t i = 0; i < _number_of_command_lists; i++) {
@@ -210,7 +192,7 @@ void Pipeline::getCommandQueue() {
     }
 
     _logger.debug("Pipeline - getCommandQueue() - free previous command queue");
-    CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, _ze_workload_type, _turbo);
+    _graph->destroy_specific_command_queue(_ze_workload_type);
 
     _ze_workload_type = _graph->get_ze_workload_type();
 }
@@ -330,20 +312,4 @@ void Pipeline::closeCommandListIndex(size_t command_list_index) {
     _command_lists.at(command_list_index)->close();
 };
 
-Pipeline::~Pipeline() {
-    if (_command_queue) {
-        if (_sync_output_with_fences) {
-            // fences shall be destroyed before the command queue is destroyed
-            for (size_t i = 0; i < _number_of_command_lists; i++) {
-                if (_fences[i] != nullptr) {
-                    _fences[i].reset();
-                }
-            }
-        }
-
-        _command_queue.reset();
-        CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, _ze_workload_type, _turbo);
-    }
-}
-
 }  // namespace intel_npu
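
Note: the zero_pipeline changes above amount to an ownership inversion. The Pipeline no longer derives the queue parameters (priority, turbo mode, workload type) from its Config and no longer requests a queue from CommandQueueManager; it simply borrows whatever queue its graph owns. A minimal sketch of the resulting relationship, using stand-in types rather than the real intel_npu classes:

    #include <memory>

    struct CommandQueue {};

    struct Graph {
        // The graph owns the queue and hands the same instance to every pipeline.
        const std::shared_ptr<CommandQueue>& get_command_queue() const {
            return _command_queue;
        }
        std::shared_ptr<CommandQueue> _command_queue = std::make_shared<CommandQueue>();
    };

    struct Pipeline {
        explicit Pipeline(const std::shared_ptr<Graph>& graph)
            : _graph(graph),
              _command_queue(graph->get_command_queue()) {}  // borrow, never create

        std::shared_ptr<Graph> _graph;
        std::shared_ptr<CommandQueue> _command_queue;  // shared with the graph
    };

One consequence is visible in the last hunk: ~Pipeline() disappears entirely, because the pipeline no longer has to reset its fences and hand the queue back to the manager; queue teardown becomes the graph's job (see the driver_graph.cpp diff below).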
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
index efb5b6b8978cfc..309cb98ddcb135 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
@@ -43,6 +43,10 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
     const std::vector<ArgumentDescriptor>& get_input_descriptors() const;
     const std::vector<ArgumentDescriptor>& get_output_descriptors() const;
 
+    const std::shared_ptr<CommandQueue>& get_command_queue() const;
+    void destroy_specific_command_queue(
+        const std::optional<ze_command_queue_workload_type_t>& zeWorkloadType) const;
+
     void set_workload_type(const ov::WorkloadType workloadType);
 
     const std::optional<ze_command_queue_workload_type_t> get_ze_workload_type() const;
@@ -59,8 +63,8 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
 
 protected:
     /**
-     * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
-     * the model will also be deduced and returned.
+     * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used
+     * by the model will also be deduced and returned.
      * @details Batching can be handled by the plugin only if:
      *  - The batch axis is the first axis.
      *  - The batch size received by the compiler takes the default value of 1.
@@ -72,11 +76,13 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
      *
      * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
      * ultimately be used for determining the batch size.
-     * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
-     * the plugin.
+     * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed
+     * inside the plugin.
      */
     std::optional<size_t> get_batch_size(const NetworkMetadata& metadata);
 
+    virtual void create_new_command_queue() = 0;
+
     ze_graph_handle_t _handle = nullptr;
     NetworkMetadata _metadata;
 
@@ -85,8 +91,8 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
 
     std::vector<std::shared_ptr<Event>> _last_submitted_event;
 
-    // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the
-    // first inference starts running
+    // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when
+    // the first inference starts running
     std::mutex _mutex;
 
     std::unique_ptr<BlobContainer> _blobPtr;
@@ -100,7 +106,10 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
      */
     std::optional<std::size_t> _batch_size = std::nullopt;
 
+    std::shared_ptr<CommandQueue> _command_queue;
     std::optional<ze_command_queue_workload_type_t> _ze_workload_type = std::nullopt;
+    bool _turbo = false;
+    ze_command_queue_priority_t _ze_queue_priority;
 
     Logger _logger;
 };
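
Note: the pure virtual create_new_command_queue() added above is the hook that lets set_workload_type() swap the graph-owned queue at runtime. The control flow, implemented in the igraph.cpp diff that follows, sketched here with stand-in types (ZeWorkloadType stands in for ze_command_queue_workload_type_t):

    #include <memory>
    #include <optional>

    enum class ZeWorkloadType { DEFAULT, BACKGROUND };
    struct CommandQueue {};

    struct IGraphSketch {
        virtual ~IGraphSketch() = default;

        void set_workload_type(ZeWorkloadType type) {
            _ze_workload_type = type;
            // Recreate only if a queue already exists; before the first
            // inference there is nothing to replace yet.
            if (_command_queue) {
                create_new_command_queue();
            }
        }

    protected:
        virtual void create_new_command_queue() = 0;

        std::shared_ptr<CommandQueue> _command_queue;
        std::optional<ZeWorkloadType> _ze_workload_type;
    };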
diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp
index ce54b53ea20432..b3e68baddb0337 100644
--- a/src/plugins/intel_npu/src/common/src/igraph.cpp
+++ b/src/plugins/intel_npu/src/common/src/igraph.cpp
@@ -21,11 +21,7 @@ IGraph::IGraph(ze_graph_handle_t handle,
     : _handle(handle),
      _metadata(std::move(metadata)),
      _blobPtr(std::move(blobPtr)),
-      _logger("IGraph", config.get<LOG_LEVEL>()) {
-    if (config.has<WORKLOAD_TYPE>()) {
-        set_workload_type(config.get<WORKLOAD_TYPE>());
-    }
-}
+      _logger("IGraph", config.get<LOG_LEVEL>()) {}
 
 const NetworkMetadata& IGraph::get_metadata() const {
     return _metadata;
@@ -47,8 +43,21 @@ const std::vector<ArgumentDescriptor>& IGraph::get_output_descriptors() const {
     return _output_descriptors;
 }
 
+const std::shared_ptr<CommandQueue>& IGraph::get_command_queue() const {
+    return _command_queue;
+}
+
+void IGraph::destroy_specific_command_queue(
+    const std::optional<ze_command_queue_workload_type_t>& zeWorkloadType) const {
+    CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, zeWorkloadType, _turbo);
+}
+
 void IGraph::set_workload_type(const ov::WorkloadType workloadType) {
     _ze_workload_type = zeroUtils::toZeQueueWorkloadType(workloadType);
+
+    if (_command_queue) {
+        create_new_command_queue();
+    }
 }
 
 std::mutex& IGraph::get_mutex() {
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
index ac89a790291d2e..ce910d5cf98526 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
@@ -37,9 +37,13 @@ class DriverGraph final : public IGraph {
 private:
     bool release_blob(const Config& config);
 
+    void create_new_command_queue() override;
+
     std::shared_ptr<ZeGraphExtWrappers> _zeGraphExt;
     std::shared_ptr<ZeroInitStructsHolder> _zeroInitStruct;
 
+    uint32_t _groupOrdinal;
+
     Logger _logger;
 
     // In the case of the import path, the blob is released after graph initialization so it can not be any longer
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
index 61d4a6ed866529..3dd5b57fa5e8a4 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
@@ -38,6 +38,8 @@ class PluginGraph final : public IGraph {
     ~PluginGraph() override;
 
 private:
+    void create_new_command_queue() override {}
+
     std::shared_ptr<ZeGraphExtWrappers> _zeGraphExt;
     std::shared_ptr<ZeroInitStructsHolder> _zeroInitStruct;
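
Note: the two graph types implement the new hook differently. DriverGraph (next diff) rebuilds its queue through CommandQueueManager using the priority, turbo flag and group ordinal it caches during initialize(), while PluginGraph deliberately leaves the override empty. A compact sketch of the contrast, again with stand-in types (the real getCommandQueue() takes the init structures, priority, workload type, group ordinal and turbo flag):

    #include <memory>

    struct CommandQueue {};

    struct CommandQueueManagerSketch {
        static std::shared_ptr<CommandQueue> getCommandQueue() {
            return std::make_shared<CommandQueue>();
        }
    };

    struct GraphBase {
        virtual ~GraphBase() = default;
        virtual void create_new_command_queue() = 0;
        std::shared_ptr<CommandQueue> _command_queue;
    };

    // DriverGraph: asks the manager for a fresh queue built from cached parameters.
    struct DriverGraphSketch : GraphBase {
        void create_new_command_queue() override {
            _command_queue = CommandQueueManagerSketch::getCommandQueue();
        }
    };

    // PluginGraph: keeps its current queue; the override is intentionally a no-op.
    struct PluginGraphSketch : GraphBase {
        void create_new_command_queue() override {}
    };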
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
index 97f77ca644dc08..fa9ebf5e2935c9 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
@@ -99,11 +99,6 @@ void DriverGraph::initialize(const Config& config) {
     _input_descriptors.shrink_to_fit();
     _output_descriptors.shrink_to_fit();
 
-    ze_device_properties_t deviceProperties = {};
-    deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-    THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
-                                zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
-
     _zeGraphExt->initializeGraph(_handle);
 
     _logger.debug("Graph initialize finish");
@@ -113,6 +108,29 @@ void DriverGraph::initialize(const Config& config) {
     // releasing it here to avoid unnecessary memory usage.
     _blobIsReleased = release_blob(config);
 
+    // Find the corresponding command queue group.
+    ze_device_properties_t deviceProperties = {};
+    deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
+                                zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
+    _groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties);
+
+    _ze_queue_priority = zeroUtils::toZeQueuePriority(config.get<MODEL_PRIORITY>());
+
+    if (config.has<TURBO>()) {
+        _turbo = config.get<TURBO>();
+    }
+
+    if (config.has<WORKLOAD_TYPE>()) {
+        _ze_workload_type = zeroUtils::toZeQueueWorkloadType(config.get<WORKLOAD_TYPE>());
+    }
+
+    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_zeroInitStruct,
+                                                                        _ze_queue_priority,
+                                                                        _ze_workload_type,
+                                                                        _groupOrdinal,
+                                                                        _turbo);
+
     if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
         _batch_size = get_batch_size(_metadata);
     }
@@ -124,6 +142,14 @@ void DriverGraph::initialize(const Config& config) {
     }
 }
 
+void DriverGraph::create_new_command_queue() {
+    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_zeroInitStruct,
+                                                                        _ze_queue_priority,
+                                                                        _ze_workload_type,
+                                                                        _groupOrdinal,
+                                                                        _turbo);
+}
+
 bool DriverGraph::release_blob(const Config& config) {
     if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
         config.get<PERF_COUNT>()) {
@@ -155,6 +181,9 @@ DriverGraph::~DriverGraph() {
             _handle = nullptr;
         }
     }
+
+    _command_queue.reset();
+    CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, _ze_workload_type, _turbo);
 }
 
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
index d85725c530fb14..5197fd0c29c1a5 100644
--- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
+++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
@@ -112,6 +112,8 @@ class Fence {
     }
 
 private:
+    std::shared_ptr<CommandQueue> _command_queue;
+
     ze_fence_handle_t _handle = nullptr;
 
     Logger _log;
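
Note: the Fence change is easy to miss but carries the lifetime argument that made the old ~Pipeline() safe to delete. Previously the pipeline destructor reset all fences by hand before releasing the queue ("fences shall be destroyed before the command queue is destroyed"). Presumably the new _command_queue member moves that guarantee into the type itself: a fence that shares ownership of its queue can never outlive it. A sketch of the idea, assuming the fence constructor stores the queue it is created on (stand-in types, not the real wrapper):

    #include <memory>

    struct CommandQueue {};

    class Fence {
    public:
        explicit Fence(const std::shared_ptr<CommandQueue>& command_queue)
            : _command_queue(command_queue) {}  // keeps the queue alive while the fence exists

    private:
        std::shared_ptr<CommandQueue> _command_queue;
    };

With this, destruction order falls out of shared_ptr reference counting instead of manual resets, regardless of whether the graph or a pipeline drops its reference first.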