From e0df99b2e3bfad37334dbf914ebe7533ac0313dd Mon Sep 17 00:00:00 2001
From: Bogdan Pereanu
Date: Thu, 30 Jan 2025 09:32:40 +0200
Subject: [PATCH] Move command queue ownership from the Pipeline to the graph

Signed-off-by: Bogdan Pereanu
---
 .../src/backend/include/zero_pipeline.hpp     |  5 +-
 .../src/backend/src/zero_pipeline.cpp         | 56 ++++---------------
 .../include/intel_npu/common/igraph.hpp       | 21 +++++--
 .../intel_npu/src/common/src/igraph.cpp       | 19 +++++--
 .../compiler_adapter/include/driver_graph.hpp |  4 ++
 .../compiler_adapter/include/plugin_graph.hpp |  2 +
 .../src/compiler_adapter/src/driver_graph.cpp | 39 +++++++++++--
 .../intel_npu/utils/zero/zero_wrappers.hpp    |  2 +
 8 files changed, 83 insertions(+), 65 deletions(-)

diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
index 29069f0a0cf8cc..b8af2a4ad8b32a 100644
--- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
+++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -27,7 +27,7 @@ struct Pipeline {
     Pipeline(const Pipeline&) = delete;
     Pipeline& operator=(const Pipeline&) = delete;
 
-    ~Pipeline();
+    virtual ~Pipeline() = default;
 
     void push();
     void pull();
@@ -66,10 +66,7 @@ struct Pipeline {
     std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
     Logger _logger;
 
-    uint32_t _group_ordinal;
     std::mutex _mutex;
-    bool _turbo = false;
-    ze_command_queue_priority_t _ze_queue_priority;
     std::optional<ze_command_queue_workload_type_t> _ze_workload_type = std::nullopt;
 };
 
diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
index 3cf9b205df2abd..50b04b4a3f0053 100644
--- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
+++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
@@ -32,8 +32,7 @@ Pipeline::Pipeline(const Config& config,
       _id(_graph->get_unique_id()),
       _number_of_command_lists(_graph->get_batch_size().has_value() ? *_graph->get_batch_size() : 1),
       _npu_profiling(npu_profiling),
-      _logger("Pipeline", _config.get<LOG_LEVEL>()),
-      _group_ordinal(group_ordinal) {
+      _logger("Pipeline", _config.get<LOG_LEVEL>()) {
     OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline");
     _logger.debug("Pipeline - initialize started");
 
@@ -64,21 +63,8 @@ Pipeline::Pipeline(const Config& config,
                                                            _init_structs->getMutableCommandListVersion() ? true : false));
     }
 
-    _ze_queue_priority = zeroUtils::toZeQueuePriority(_config.get<MODEL_PRIORITY>());
-
-    if (_config.has<TURBO>()) {
-        _turbo = _config.get<TURBO>();
-    }
-
-    if (config.has<WORKLOAD_TYPE>()) {
-        _ze_workload_type = zeroUtils::toZeQueueWorkloadType(config.get<WORKLOAD_TYPE>());
-    }
-
-    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_init_structs,
-                                                                        _ze_queue_priority,
-                                                                        _graph->get_ze_workload_type(),
-                                                                        _group_ordinal,
-                                                                        _turbo);
+    _ze_workload_type = _graph->get_ze_workload_type();
+    _command_queue = _graph->get_command_queue();
 
     if (_sync_output_with_fences) {
         _fences.resize(_number_of_command_lists);
@@ -91,7 +77,7 @@ Pipeline::Pipeline(const Config& config,
 
     for (size_t i = 0; i < _number_of_command_lists; i++) {
         size_t io_index = 0;
-        for (const auto& desc : graph->get_input_descriptors()) {
+        for (const auto& desc : _graph->get_input_descriptors()) {
             if (input_tensors.at(io_index).size() > 1) {
                 void* data = nullptr;
                 auto remote_tensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(input_tensors.at(io_index).at(i));
@@ -101,7 +87,7 @@ Pipeline::Pipeline(const Config& config,
                     data = remote_tensor->get_original_memory();
                 }
 
-                graph->set_argument_value(desc.idx, data);
+                _graph->set_argument_value(desc.idx, data);
 
                 ++io_index;
                 continue;
@@ -115,7 +101,7 @@ Pipeline::Pipeline(const Config& config,
                 data = remote_tensor->get_original_memory();
             }
 
-            graph->set_argument_value(
+            _graph->set_argument_value(
                 desc.idx,
                 static_cast<unsigned char*>(data) +
                     (i * input_tensors.at(io_index).at(0)->get_byte_size()) / _number_of_command_lists);
@@ -124,7 +110,7 @@ Pipeline::Pipeline(const Config& config,
         }
 
         io_index = 0;
-        for (const auto& desc : graph->get_output_descriptors()) {
+        for (const auto& desc : _graph->get_output_descriptors()) {
             void* data = nullptr;
             auto remote_tensor = std::dynamic_pointer_cast<ZeroRemoteTensor>(output_tensors.at(io_index));
             if (remote_tensor == nullptr) {
@@ -133,7 +119,7 @@ Pipeline::Pipeline(const Config& config,
                 data = remote_tensor->get_original_memory();
             }
 
-            graph->set_argument_value(
+            _graph->set_argument_value(
                 desc.idx,
                 static_cast<unsigned char*>(data) +
                     (i * output_tensors.at(io_index)->get_byte_size()) / _number_of_command_lists);
@@ -152,7 +138,7 @@ Pipeline::Pipeline(const Config& config,
             _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast<uint64_t*>(_npu_profiling->npu_ts_infer_start));
         }
 
-        _command_lists.at(i)->appendGraphExecute(static_cast<ze_graph_handle_t>(graph->get_handle()),
+        _command_lists.at(i)->appendGraphExecute(static_cast<ze_graph_handle_t>(_graph->get_handle()),
                                                  profiling_query.getHandle());
 
         /// append timestamp command if feature was activated
@@ -196,11 +182,7 @@ void Pipeline::getCommandQueue() {
         }
     }
 
-    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_init_structs,
-                                                                        _ze_queue_priority,
-                                                                        _graph->get_ze_workload_type(),
-                                                                        _group_ordinal,
-                                                                        _turbo);
+    _command_queue = _graph->get_command_queue();
 
     if (_sync_output_with_fences) {
         for (size_t i = 0; i < _number_of_command_lists; i++) {
@@ -210,7 +192,7 @@ void Pipeline::getCommandQueue() {
     }
 
     _logger.debug("Pipeline - getCommandQueue() - free previous command queue");
-    CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, _ze_workload_type, _turbo);
+    _graph->destroy_specific_command_queue(_ze_workload_type);
 
     _ze_workload_type = _graph->get_ze_workload_type();
 }
@@ -330,20 +312,4 @@ void Pipeline::closeCommandListIndex(size_t command_list_index) {
     _command_lists.at(command_list_index)->close();
 };
 
-Pipeline::~Pipeline() {
-    if (_command_queue) {
-        if (_sync_output_with_fences) {
-            // fences shall be destroyed before the command queue is destroyed
-            for (size_t i = 0; i < _number_of_command_lists; i++) {
-                if (_fences[i] != nullptr) {
-                    _fences[i].reset();
-                }
-            }
-        }
-
-        _command_queue.reset();
-        CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, _ze_workload_type, _turbo);
-    }
-}
-
 }  // namespace intel_npu
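
Note: the zero_pipeline changes above amount to an ownership inversion. The Pipeline no longer derives the queue parameters (priority, turbo mode, workload type) from its Config and no longer requests a queue from CommandQueueManager; it simply borrows whatever queue its graph owns. A minimal sketch of the resulting relationship, using stand-in types rather than the real intel_npu classes:

    #include <memory>

    struct CommandQueue {};

    struct Graph {
        // The graph owns the queue and hands the same instance to every pipeline.
        const std::shared_ptr<CommandQueue>& get_command_queue() const {
            return _command_queue;
        }
        std::shared_ptr<CommandQueue> _command_queue = std::make_shared<CommandQueue>();
    };

    struct Pipeline {
        explicit Pipeline(const std::shared_ptr<Graph>& graph)
            : _graph(graph),
              _command_queue(graph->get_command_queue()) {}  // borrow, never create

        std::shared_ptr<Graph> _graph;
        std::shared_ptr<CommandQueue> _command_queue;  // shared with the graph
    };

One consequence is visible in the last hunk: ~Pipeline() disappears entirely, because the pipeline no longer has to reset its fences and hand the queue back to the manager; queue teardown becomes the graph's job (see the driver_graph.cpp diff below).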
diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
index efb5b6b8978cfc..309cb98ddcb135 100644
--- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
+++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp
@@ -43,6 +43,10 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
     const std::vector<ArgumentDescriptor>& get_input_descriptors() const;
     const std::vector<ArgumentDescriptor>& get_output_descriptors() const;
 
+    const std::shared_ptr<CommandQueue>& get_command_queue() const;
+    void destroy_specific_command_queue(
+        const std::optional<ze_command_queue_workload_type_t>& zeWorkloadType) const;
+
     void set_workload_type(const ov::WorkloadType workloadType);
 
     const std::optional<ze_command_queue_workload_type_t> get_ze_workload_type() const;
@@ -59,8 +63,8 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
 
 protected:
     /**
-     * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by
-     * the model will also be deduced and returned.
+     * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used
+     * by the model will also be deduced and returned.
      * @details Batching can be handled by the plugin only if:
      *  - The batch axis is the first axis.
      *  - The batch size received by the compiler takes the default value of 1.
@@ -72,11 +76,13 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
      *
      * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will
      * ultimately be used for determining the batch size.
-     * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
-     * the plugin.
+     * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed
+     * inside the plugin.
      */
     std::optional<size_t> get_batch_size(const NetworkMetadata& metadata);
 
+    virtual void create_new_command_queue() = 0;
+
     ze_graph_handle_t _handle = nullptr;
     NetworkMetadata _metadata;
 
@@ -85,8 +91,8 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
 
     std::vector<std::shared_ptr<Event>> _last_submitted_event;
 
-    // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the
-    // first inference starts running
+    // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when
+    // the first inference starts running
     std::mutex _mutex;
 
     std::unique_ptr<BlobContainer> _blobPtr;
@@ -100,7 +106,10 @@ class IGraph : public std::enable_shared_from_this<IGraph> {
      */
     std::optional<std::size_t> _batch_size = std::nullopt;
 
+    std::shared_ptr<CommandQueue> _command_queue;
     std::optional<ze_command_queue_workload_type_t> _ze_workload_type = std::nullopt;
+    bool _turbo = false;
+    ze_command_queue_priority_t _ze_queue_priority;
 
     Logger _logger;
 };
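
Note: the pure virtual create_new_command_queue() added above is the hook that lets set_workload_type() swap the graph-owned queue at runtime. The control flow, implemented in the igraph.cpp diff that follows, sketched here with stand-in types (ZeWorkloadType stands in for ze_command_queue_workload_type_t):

    #include <memory>
    #include <optional>

    enum class ZeWorkloadType { DEFAULT, BACKGROUND };
    struct CommandQueue {};

    struct IGraphSketch {
        virtual ~IGraphSketch() = default;

        void set_workload_type(ZeWorkloadType type) {
            _ze_workload_type = type;
            // Recreate only if a queue already exists; before the first
            // inference there is nothing to replace yet.
            if (_command_queue) {
                create_new_command_queue();
            }
        }

    protected:
        virtual void create_new_command_queue() = 0;

        std::shared_ptr<CommandQueue> _command_queue;
        std::optional<ZeWorkloadType> _ze_workload_type;
    };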
diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp
index ce54b53ea20432..b3e68baddb0337 100644
--- a/src/plugins/intel_npu/src/common/src/igraph.cpp
+++ b/src/plugins/intel_npu/src/common/src/igraph.cpp
@@ -21,11 +21,7 @@ IGraph::IGraph(ze_graph_handle_t handle,
     : _handle(handle),
      _metadata(std::move(metadata)),
      _blobPtr(std::move(blobPtr)),
-      _logger("IGraph", config.get<LOG_LEVEL>()) {
-    if (config.has<WORKLOAD_TYPE>()) {
-        set_workload_type(config.get<WORKLOAD_TYPE>());
-    }
-}
+      _logger("IGraph", config.get<LOG_LEVEL>()) {}
 
 const NetworkMetadata& IGraph::get_metadata() const {
     return _metadata;
@@ -47,8 +43,21 @@ const std::vector<ArgumentDescriptor>& IGraph::get_output_descriptors() const {
     return _output_descriptors;
 }
 
+const std::shared_ptr<CommandQueue>& IGraph::get_command_queue() const {
+    return _command_queue;
+}
+
+void IGraph::destroy_specific_command_queue(
+    const std::optional<ze_command_queue_workload_type_t>& zeWorkloadType) const {
+    CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, zeWorkloadType, _turbo);
+}
+
 void IGraph::set_workload_type(const ov::WorkloadType workloadType) {
     _ze_workload_type = zeroUtils::toZeQueueWorkloadType(workloadType);
+
+    if (_command_queue) {
+        create_new_command_queue();
+    }
 }
 
 std::mutex& IGraph::get_mutex() {
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
index ac89a790291d2e..ce910d5cf98526 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp
@@ -37,9 +37,13 @@ class DriverGraph final : public IGraph {
 private:
     bool release_blob(const Config& config);
 
+    void create_new_command_queue() override;
+
     std::shared_ptr<ZeGraphExtWrappers> _zeGraphExt;
     std::shared_ptr<ZeroInitStructsHolder> _zeroInitStruct;
 
+    uint32_t _groupOrdinal;
+
     Logger _logger;
 
     // In the case of the import path, the blob is released after graph initialization so it can not be any longer
diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
index 61d4a6ed866529..3dd5b57fa5e8a4 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp
@@ -38,6 +38,8 @@ class PluginGraph final : public IGraph {
     ~PluginGraph() override;
 
 private:
+    void create_new_command_queue() override {}
+
     std::shared_ptr<ZeGraphExtWrappers> _zeGraphExt;
     std::shared_ptr<ZeroInitStructsHolder> _zeroInitStruct;
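
Note: the two graph types implement the new hook differently. DriverGraph (next diff) rebuilds its queue through CommandQueueManager using the priority, turbo flag and group ordinal it caches during initialize(), while PluginGraph deliberately leaves the override empty. A compact sketch of the contrast, again with stand-in types (the real getCommandQueue() takes the init structures, priority, workload type, group ordinal and turbo flag):

    #include <memory>

    struct CommandQueue {};

    struct CommandQueueManagerSketch {
        static std::shared_ptr<CommandQueue> getCommandQueue() {
            return std::make_shared<CommandQueue>();
        }
    };

    struct GraphBase {
        virtual ~GraphBase() = default;
        virtual void create_new_command_queue() = 0;
        std::shared_ptr<CommandQueue> _command_queue;
    };

    // DriverGraph: asks the manager for a fresh queue built from cached parameters.
    struct DriverGraphSketch : GraphBase {
        void create_new_command_queue() override {
            _command_queue = CommandQueueManagerSketch::getCommandQueue();
        }
    };

    // PluginGraph: keeps its current queue; the override is intentionally a no-op.
    struct PluginGraphSketch : GraphBase {
        void create_new_command_queue() override {}
    };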
diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
index 97f77ca644dc08..fa9ebf5e2935c9 100644
--- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
+++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp
@@ -99,11 +99,6 @@ void DriverGraph::initialize(const Config& config) {
     _input_descriptors.shrink_to_fit();
     _output_descriptors.shrink_to_fit();
 
-    ze_device_properties_t deviceProperties = {};
-    deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
-    THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
-                                zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
-
     _zeGraphExt->initializeGraph(_handle);
 
     _logger.debug("Graph initialize finish");
@@ -113,6 +108,29 @@ void DriverGraph::initialize(const Config& config) {
     // releasing it here to avoid unnecessary memory usage.
     _blobIsReleased = release_blob(config);
 
+    // Find the corresponding command queue group.
+    ze_device_properties_t deviceProperties = {};
+    deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+    THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties",
+                                zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties));
+    _groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties);
+
+    _ze_queue_priority = zeroUtils::toZeQueuePriority(config.get<MODEL_PRIORITY>());
+
+    if (config.has<TURBO>()) {
+        _turbo = config.get<TURBO>();
+    }
+
+    if (config.has<WORKLOAD_TYPE>()) {
+        _ze_workload_type = zeroUtils::toZeQueueWorkloadType(config.get<WORKLOAD_TYPE>());
+    }
+
+    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_zeroInitStruct,
+                                                                        _ze_queue_priority,
+                                                                        _ze_workload_type,
+                                                                        _groupOrdinal,
+                                                                        _turbo);
+
     if (config.get<BATCH_MODE>() != ov::intel_npu::BatchMode::COMPILER) {
         _batch_size = get_batch_size(_metadata);
     }
@@ -124,6 +142,14 @@ void DriverGraph::initialize(const Config& config) {
     }
 }
 
+void DriverGraph::create_new_command_queue() {
+    _command_queue = CommandQueueManager::getInstance().getCommandQueue(_zeroInitStruct,
+                                                                        _ze_queue_priority,
+                                                                        _ze_workload_type,
+                                                                        _groupOrdinal,
+                                                                        _turbo);
+}
+
 bool DriverGraph::release_blob(const Config& config) {
     if (_blobPtr == nullptr || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 ||
         config.get<PERF_COUNT>()) {
@@ -155,6 +181,9 @@ DriverGraph::~DriverGraph() {
             _handle = nullptr;
         }
     }
+
+    _command_queue.reset();
+    CommandQueueManager::getInstance().freeCommandQueue(_ze_queue_priority, _ze_workload_type, _turbo);
 }
 
 }  // namespace intel_npu
diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
index d85725c530fb14..5197fd0c29c1a5 100644
--- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
+++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp
@@ -112,6 +112,8 @@ class Fence {
     }
 
 private:
+    std::shared_ptr<CommandQueue> _command_queue;
+
     ze_fence_handle_t _handle = nullptr;
 
     Logger _log;
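
Note: the Fence change is easy to miss but carries the lifetime argument that made the old ~Pipeline() safe to delete. Previously the pipeline destructor reset all fences by hand before releasing the queue ("fences shall be destroyed before the command queue is destroyed"). Presumably the new _command_queue member moves that guarantee into the type itself: a fence that shares ownership of its queue can never outlive it. A sketch of the idea, assuming the fence constructor stores the queue it is created on (stand-in types, not the real wrapper):

    #include <memory>

    struct CommandQueue {};

    class Fence {
    public:
        explicit Fence(const std::shared_ptr<CommandQueue>& command_queue)
            : _command_queue(command_queue) {}  // keeps the queue alive while the fence exists

    private:
        std::shared_ptr<CommandQueue> _command_queue;
    };

With this, destruction order falls out of shared_ptr reference counting instead of manual resets, regardless of whether the graph or a pipeline drops its reference first.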