diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 6baabc55b435ce..a06f579c66c744 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -40,6 +40,9 @@ struct Pipeline { void closeCommandListIndex(size_t command_list_index); protected: + void getCommandQueue(); + + std::shared_ptr _init_structs; std::shared_ptr _graph; const Config _config; const uint32_t _id; @@ -59,9 +62,11 @@ struct Pipeline { std::vector> _fences; std::shared_ptr _event_pool; std::vector> _events; - bool sync_output_with_fences_ = true; + bool _sync_output_with_fences = true; std::shared_ptr _npu_profiling; Logger _logger; + + std::mutex _mutex; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 7ada704c9969d8..31daa32da08d02 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -28,6 +28,17 @@ Type extract_object(const ov::AnyMap& params, const ov::Property& p) { return res.as(); } +template +bool compare_shared_ptr(const std::shared_ptr& a, const std::shared_ptr& b) { + if (a == b) { + return true; + } + if (a && b) { + return a.get() == b.get(); + } + return false; +} + } // namespace namespace intel_npu { @@ -41,14 +52,11 @@ Pipeline::Pipeline(const Config& config, const std::vector>>& input_tensors, const std::vector>& output_tensors, uint32_t group_ordinal) - : _graph(graph), + : _init_structs(init_structs), + _graph(graph), _config(config), _id(_graph->get_unique_id()), _number_of_command_lists(_graph->get_batch_size().has_value() ? *_graph->get_batch_size() : 1), - _event_pool{ - std::make_shared(init_structs->getDevice(), - init_structs->getContext(), - _number_of_command_lists ? static_cast(_number_of_command_lists) : 1)}, _npu_profiling(npu_profiling), _logger("Pipeline", _config.get()) { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "Zero_infer_request::Pipeline::Pipeline"); @@ -58,22 +66,43 @@ Pipeline::Pipeline(const Config& config, profiling_query.create(profiling_pool._handle); } + OPENVINO_ASSERT(_sync_output_with_fences || !_config.get(), + "In-order execution doesn't work in case synchronization of the inferences is done using events"); + + if (!_sync_output_with_fences || _config.get()) { + _event_pool = + std::make_shared(_init_structs->getDevice(), + _init_structs->getContext(), + _number_of_command_lists ? static_cast(_number_of_command_lists) : 1); + + _events.reserve(_number_of_command_lists); + for (size_t i = 0; i < _number_of_command_lists; i++) { + _events.emplace_back(std::make_shared(_event_pool, static_cast(i))); + } + } + _command_lists.reserve(_number_of_command_lists); - _events.reserve(_number_of_command_lists); - _fences.reserve(_number_of_command_lists); - _logger.debug("Pipeline - emplace_back _event_pool and _command_queue"); for (size_t i = 0; i < _number_of_command_lists; i++) { _command_lists.emplace_back( - std::make_unique(init_structs, + std::make_unique(_init_structs, group_ordinal, - init_structs->getMutableCommandListVersion() ? true : false)); - _events.emplace_back(std::make_shared(_event_pool, static_cast(i))); - _fences.emplace_back(std::make_unique(*_graph->get_command_queue())); + _init_structs->getMutableCommandListVersion() ? true : false)); + } + + _command_queue = _graph->get_command_queue(); + + if (_sync_output_with_fences) { + _fences.resize(_number_of_command_lists); + + for (size_t i = 0; i < _number_of_command_lists; i++) { + _logger.debug("Pipeline - getCommandQueue() - create new fence"); + _fences[i] = std::make_unique(_command_queue); + } } for (size_t i = 0; i < _number_of_command_lists; i++) { size_t io_index = 0; - for (const auto& desc : graph->get_input_descriptors()) { + for (const auto& desc : _graph->get_input_descriptors()) { if (input_tensors.at(io_index).size() > 1) { void* data = nullptr; auto remote_tensor = std::dynamic_pointer_cast(input_tensors.at(io_index).at(i)); @@ -83,7 +112,7 @@ Pipeline::Pipeline(const Config& config, data = extract_object(remote_tensor->get_properties(), ov::intel_npu::mem_handle); } - graph->set_argument_value(desc.idx, data); + _graph->set_argument_value(desc.idx, data); ++io_index; continue; @@ -97,7 +126,7 @@ Pipeline::Pipeline(const Config& config, data = extract_object(remote_tensor->get_properties(), ov::intel_npu::mem_handle); } - graph->set_argument_value( + _graph->set_argument_value( desc.idx, static_cast(data) + (i * input_tensors.at(io_index).at(0)->get_byte_size()) / _number_of_command_lists); @@ -106,7 +135,7 @@ Pipeline::Pipeline(const Config& config, } io_index = 0; - for (const auto& desc : graph->get_output_descriptors()) { + for (const auto& desc : _graph->get_output_descriptors()) { void* data = nullptr; auto remote_tensor = std::dynamic_pointer_cast(output_tensors.at(io_index)); if (remote_tensor == nullptr) { @@ -115,7 +144,7 @@ Pipeline::Pipeline(const Config& config, data = extract_object(remote_tensor->get_properties(), ov::intel_npu::mem_handle); } - graph->set_argument_value( + _graph->set_argument_value( desc.idx, static_cast(data) + (i * output_tensors.at(io_index)->get_byte_size()) / _number_of_command_lists); @@ -134,7 +163,7 @@ Pipeline::Pipeline(const Config& config, _command_lists.at(i)->appendNpuTimestamp(reinterpret_cast(_npu_profiling->npu_ts_infer_start)); } - _command_lists.at(i)->appendGraphExecute(static_cast(graph->get_handle()), + _command_lists.at(i)->appendGraphExecute(static_cast(_graph->get_handle()), profiling_query.getHandle()); /// append timestamp command if feature was activated @@ -153,7 +182,7 @@ Pipeline::Pipeline(const Config& config, } // appendBarrier used in L0 as well - if (!sync_output_with_fences_) { + if (!_sync_output_with_fences) { _command_lists.at(i)->appendBarrier(); _events.at(i)->AppendSignalEvent(*_command_lists.at(i)); } @@ -162,9 +191,30 @@ Pipeline::Pipeline(const Config& config, _logger.debug("Pipeline - initialize completed"); } +void Pipeline::getCommandQueue() { + _logger.debug("Pipeline - getCommandQueue() started"); + + std::lock_guard lock(_mutex); + + if (!compare_shared_ptr(_command_queue, _graph->get_command_queue())) { + _command_queue = _graph->get_command_queue(); + + if (_sync_output_with_fences) { + for (size_t i = 0; i < _number_of_command_lists; i++) { + _logger.debug("Pipeline - getCommandQueue() - create new fence"); + _fences[i] = std::make_unique(_command_queue); + } + } + } + + _logger.debug("Pipeline - getCommandQueue() completed"); +} + void Pipeline::push() { _logger.debug("Pipeline - push() started"); + getCommandQueue(); + if (_config.get()) { if (_id) { auto previousIndex = _graph->get_last_submitted_id(); @@ -179,10 +229,10 @@ void Pipeline::push() { for (size_t i = 0; i < _command_lists.size(); ++i) { OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PUSH, itt::domains::LevelZeroBackend, "Pipeline", "push"); - if (sync_output_with_fences_) { - _graph->get_command_queue()->executeCommandList(*_command_lists.at(i), *_fences.at(i)); + if (_sync_output_with_fences) { + _command_queue->executeCommandList(*_command_lists.at(i), *_fences.at(i)); } else { - _graph->get_command_queue()->executeCommandList(*_command_lists.at(i)); + _command_queue->executeCommandList(*_command_lists.at(i)); } } @@ -194,7 +244,7 @@ void Pipeline::pull() { OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull"); for (size_t i = 0; i < _command_lists.size(); ++i) { - if (sync_output_with_fences_) { + if (_sync_output_with_fences) { _fences.at(i)->hostSynchronize(); } else { _events.at(i)->hostSynchronize(); @@ -209,17 +259,17 @@ void Pipeline::pull() { }; void Pipeline::reset() const { - _logger.debug("Pipeline - rest() started"); + _logger.debug("Pipeline - reset() started"); for (size_t i = 0; i < _command_lists.size(); ++i) { - if (sync_output_with_fences_) { + if (_sync_output_with_fences) { _fences.at(i)->reset(); } else { _events.at(i)->reset(); } } - _logger.debug("Pipeline - rest() completed"); + _logger.debug("Pipeline - reset() completed"); }; void Pipeline::updateCommandList(uint32_t arg_index, const void* arg_data, size_t byte_size) { diff --git a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp index fc5aec9158151c..b33143e91098e9 100644 --- a/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp +++ b/src/plugins/intel_npu/src/common/include/intel_npu/common/igraph.hpp @@ -41,9 +41,10 @@ class IGraph : public std::enable_shared_from_this { const std::vector& get_input_descriptors() const; const std::vector& get_output_descriptors() const; + const std::shared_ptr& get_command_queue() const; - void set_workload_type(const ov::WorkloadType workloadType) const; + virtual void set_workload_type(const ov::WorkloadType workloadType) = 0; std::mutex& get_mutex(); @@ -58,8 +59,8 @@ class IGraph : public std::enable_shared_from_this { protected: /** - * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used by - * the model will also be deduced and returned. + * @brief Determines if batching can be addressed inside the plugin. In the positive case, the batch size used + * by the model will also be deduced and returned. * @details Batching can be handled by the plugin only if: * - The batch axis is the first axis. * - The batch size received by the compiler takes the default value of 1. @@ -71,22 +72,23 @@ class IGraph : public std::enable_shared_from_this { * * @param metadata Metadata containing the shape values as seen by both the compiler and IR model. These will * ultimately be used for determining the batch size. - * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside - * the plugin. + * @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed + * inside the plugin. */ std::optional get_batch_size(const NetworkMetadata& metadata); + virtual void create_new_command_queue() = 0; + ze_graph_handle_t _handle = nullptr; NetworkMetadata _metadata; std::vector _input_descriptors; std::vector _output_descriptors; - std::shared_ptr _command_queue; std::vector> _last_submitted_event; - // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when the - // first inference starts running + // Used to protect zero pipeline creation in the graph. The pipeline should be created only once per graph when + // the first inference starts running std::mutex _mutex; std::vector _blob; @@ -100,6 +102,12 @@ class IGraph : public std::enable_shared_from_this { */ std::optional _batch_size = std::nullopt; + std::shared_ptr _command_queue; + uint32_t _group_ordinal; + ze_command_queue_workload_type_t _ze_workload_type; + bool _turbo = false; + ze_command_queue_priority_t _ze_queue_priority; + Logger _logger; }; diff --git a/src/plugins/intel_npu/src/common/src/igraph.cpp b/src/plugins/intel_npu/src/common/src/igraph.cpp index 9a53928c9a3d9e..1becc38cf25874 100644 --- a/src/plugins/intel_npu/src/common/src/igraph.cpp +++ b/src/plugins/intel_npu/src/common/src/igraph.cpp @@ -50,26 +50,6 @@ const std::shared_ptr& IGraph::get_command_queue() const { return _command_queue; } -void IGraph::set_workload_type(const ov::WorkloadType workloadType) const { - if (_command_queue == nullptr) { - return; - } - - ze_command_queue_workload_type_t zeWorkloadType; - switch (workloadType) { - case ov::WorkloadType::DEFAULT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_DEFAULT; - break; - case ov::WorkloadType::EFFICIENT: - zeWorkloadType = ze_command_queue_workload_type_t::ZE_WORKLOAD_TYPE_BACKGROUND; - break; - default: - OPENVINO_THROW("Unknown value for WorkloadType!"); - } - - _command_queue->setWorkloadType(zeWorkloadType); -} - std::mutex& IGraph::get_mutex() { return _mutex; } diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp index cf3d54c6b363e5..0a1cf275b1a6c8 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/driver_graph.hpp @@ -32,11 +32,15 @@ class DriverGraph final : public IGraph { void initialize(const Config& config) override; + void set_workload_type(const ov::WorkloadType workloadType) override; + ~DriverGraph() override; private: bool release_blob(const Config& config); + void create_new_command_queue() override; + std::shared_ptr _zeGraphExt; std::shared_ptr _zeroInitStruct; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp index 9c88ace1c29d23..a886065c8d2a2a 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/plugin_graph.hpp @@ -35,9 +35,13 @@ class PluginGraph final : public IGraph { void initialize(const Config& config) override; + void set_workload_type(const ov::WorkloadType workloadType) override; + ~PluginGraph() override; private: + void create_new_command_queue() override; + std::shared_ptr _zeGraphExt; std::shared_ptr _zeroInitStruct; diff --git a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp index a80beb8c57305d..37411a13ed0d5b 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp +++ b/src/plugins/intel_npu/src/compiler_adapter/include/ze_graph_ext_wrappers.hpp @@ -48,7 +48,7 @@ class ZeGraphExtWrappers { void setGraphArgumentValue(ze_graph_handle_t graphHandle, uint32_t argi_, const void* argv) const; - void initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const; + void initializeGraph(ze_graph_handle_t graphHandle) const; private: std::unordered_set getQueryResultFromSupportedLayers( @@ -60,7 +60,7 @@ class ZeGraphExtWrappers { std::vector& inputs, std::vector& outputs) const; - void initialize_graph_through_command_list(ze_graph_handle_t graphHandle, const Config& config) const; + void initialize_graph_through_command_list(ze_graph_handle_t graphHandle) const; std::shared_ptr _zeroInitStruct; uint32_t _graphExtVersion; diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp index a29412075c7e39..0bd5e618d44ab7 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/driver_graph.cpp @@ -99,34 +99,37 @@ void DriverGraph::initialize(const Config& config) { _input_descriptors.shrink_to_fit(); _output_descriptors.shrink_to_fit(); + _zeGraphExt->initializeGraph(_handle); + + _logger.debug("Graph initialize finish"); + + // We are allowed to release the original blob because weights were loaded in NPU memory during + // _zeGraphExt->initializeGraph(). The driver will not access the original blob from this moment on, so we are + // releasing it here to avoid unnecessary memory usage. + _blobIsReleased = release_blob(config); + + // Find the corresponding command queue group. ze_device_properties_t deviceProperties = {}; deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties)); - auto groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties); + _group_ordinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties); + + _ze_queue_priority = zeroUtils::toZeQueuePriority(config.get()); - bool turbo = false; if (config.has()) { - turbo = config.get(); + _turbo = config.get(); } - _command_queue = std::make_shared(_zeroInitStruct, - zeroUtils::toZeQueuePriority(config.get()), - groupOrdinal, - turbo); - if (config.has()) { - set_workload_type(config.get()); + if (!_zeroInitStruct->getCommandQueueDdiTable().version()) { + OPENVINO_THROW("The WorkloadType property is not supported by the current Driver Version!"); + } } - _zeGraphExt->initializeGraph(_handle, config); + _ze_workload_type = zeroUtils::toZeQueueWorkloadType(config.get()); - _logger.debug("Graph initialize finish"); - - // We are allowed to release the original blob because weights were loaded in NPU memory during - // _zeGraphExt->initializeGraph(). The driver will not access the original blob from this moment on, so we are - // releasing it here to avoid unnecessary memory usage. - _blobIsReleased = release_blob(config); + create_new_command_queue(); if (config.get() != ov::intel_npu::BatchMode::COMPILER) { _batch_size = get_batch_size(_metadata); @@ -139,6 +142,26 @@ void DriverGraph::initialize(const Config& config) { } } +void DriverGraph::set_workload_type(const ov::WorkloadType workloadType) { + if (!_zeroInitStruct->getCommandQueueDdiTable().version()) { + OPENVINO_THROW("The WorkloadType property is not supported by the current Driver Version!"); + } + + _ze_workload_type = zeroUtils::toZeQueueWorkloadType(workloadType); + + if (_command_queue) { + create_new_command_queue(); + } +} + +void DriverGraph::create_new_command_queue() { + _command_queue = CommandQueuePool::getInstance().getCommandQueue(_zeroInitStruct, + _ze_queue_priority, + _ze_workload_type, + _group_ordinal, + _turbo); +} + bool DriverGraph::release_blob(const Config& config) { if (_blob.empty() || _zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8 || config.get()) { diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp index d0c24a82e03937..0ddb6308f825cf 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/plugin_graph.cpp @@ -94,27 +94,30 @@ void PluginGraph::initialize(const Config& config) { _input_descriptors.shrink_to_fit(); _output_descriptors.shrink_to_fit(); + _zeGraphExt->initializeGraph(_handle); + + // Find the corresponding command queue group. ze_device_properties_t deviceProperties = {}; deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", zeDeviceGetProperties(_zeroInitStruct->getDevice(), &deviceProperties)); - auto groupOrdinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties); + _group_ordinal = zeroUtils::findGroupOrdinal(_zeroInitStruct->getDevice(), deviceProperties); + + _ze_queue_priority = zeroUtils::toZeQueuePriority(config.get()); - bool turbo = false; if (config.has()) { - turbo = config.get(); + _turbo = config.get(); } - _command_queue = std::make_shared(_zeroInitStruct, - zeroUtils::toZeQueuePriority(config.get()), - groupOrdinal, - turbo); - if (config.has()) { - set_workload_type(config.get()); + if (!_zeroInitStruct->getCommandQueueDdiTable().version()) { + OPENVINO_THROW("The WorkloadType property is not supported by the current Driver Version!"); + } } - _zeGraphExt->initializeGraph(_handle, config); + _ze_workload_type = zeroUtils::toZeQueueWorkloadType(config.get()); + + create_new_command_queue(); if (config.get() != ov::intel_npu::BatchMode::COMPILER) { _batch_size = get_batch_size(_metadata); @@ -129,6 +132,26 @@ void PluginGraph::initialize(const Config& config) { _logger.debug("Graph initialize finish"); } +void PluginGraph::set_workload_type(const ov::WorkloadType workloadType) { + if (!_zeroInitStruct->getCommandQueueDdiTable().version()) { + OPENVINO_THROW("The WorkloadType property is not supported by the current Driver Version!"); + } + + _ze_workload_type = zeroUtils::toZeQueueWorkloadType(workloadType); + + if (_command_queue) { + create_new_command_queue(); + } +} + +void PluginGraph::create_new_command_queue() { + _command_queue = CommandQueuePool::getInstance().getCommandQueue(_zeroInitStruct, + _ze_queue_priority, + _ze_workload_type, + _group_ordinal, + _turbo); +} + PluginGraph::~PluginGraph() { if (_handle != nullptr) { auto result = _zeGraphExt->destroyGraph(_handle); diff --git a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp index 2f6eded512ab8e..6b17b457c57f42 100644 --- a/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp +++ b/src/plugins/intel_npu/src/compiler_adapter/src/ze_graph_ext_wrappers.cpp @@ -158,10 +158,10 @@ void ZeGraphExtWrappers::setGraphArgumentValue(ze_graph_handle_t graphHandle, ui THROW_ON_FAIL_FOR_LEVELZERO_EXT("zeGraphSetArgumentValue", result, _zeroInitStruct->getGraphDdiTable()); } -void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle, const Config& config) const { +void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle) const { if (_zeroInitStruct->getGraphDdiTable().version() < ZE_GRAPH_EXT_VERSION_1_8) { _logger.debug("Use initialize_graph_through_command_list for ext version smaller than 1.8"); - initialize_graph_through_command_list(graphHandle, config); + initialize_graph_through_command_list(graphHandle); } else { _logger.debug("Initialize graph based on graph properties for ext version larger than 1.8"); ze_graph_properties_2_t properties = {}; @@ -175,13 +175,12 @@ void ZeGraphExtWrappers::initializeGraph(ze_graph_handle_t graphHandle, const Co } if (properties.initStageRequired & ZE_GRAPH_STAGE_COMMAND_LIST_INITIALIZE) { - initialize_graph_through_command_list(graphHandle, config); + initialize_graph_through_command_list(graphHandle); } } } -void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graphHandle, - const Config& config) const { +void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graphHandle) const { ze_device_properties_t deviceProperties = {}; deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; THROW_ON_FAIL_FOR_LEVELZERO("zeDeviceGetProperties", @@ -191,7 +190,8 @@ void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t _logger.debug("initialize_graph_through_command_list init start - create graph_command_list"); CommandList graph_command_list(_zeroInitStruct, groupOrdinal); _logger.debug("initialize_graph_through_command_list - create graph_command_queue"); - CommandQueue graph_command_queue(_zeroInitStruct, ZE_COMMAND_QUEUE_PRIORITY_NORMAL, groupOrdinal, false); + CommandQueueDesc desc = {ZE_COMMAND_QUEUE_PRIORITY_NORMAL, ZE_WORKLOAD_TYPE_DEFAULT, false}; + auto graph_command_queue = std::make_shared(_zeroInitStruct, desc, groupOrdinal); _logger.debug("initialize_graph_through_command_list - create fence"); Fence fence(graph_command_queue); @@ -201,7 +201,7 @@ void ZeGraphExtWrappers::initialize_graph_through_command_list(ze_graph_handle_t graph_command_list.close(); _logger.debug("initialize_graph_through_command_list - performing executeCommandList"); - graph_command_queue.executeCommandList(graph_command_list, fence); + graph_command_queue->executeCommandList(graph_command_list, fence); _logger.debug("initialize_graph_through_command_list - performing hostSynchronize"); fence.hostSynchronize(); _logger.debug("initialize_graph_through_command_list - hostSynchronize completed"); diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp index db9dc1c9f51d34..ec302e597fb369 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_utils.hpp @@ -8,6 +8,8 @@ #include #include +#include + #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_api.hpp" #include "intel_npu/utils/zero/zero_result.hpp" @@ -50,6 +52,34 @@ namespace zeroUtils { ze_result_to_description(result)); \ } +static inline size_t toPriorityVal(const ze_command_queue_priority_t& val) { + switch (val) { + case ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW: + return 0; + case ZE_COMMAND_QUEUE_PRIORITY_NORMAL: + return 1; + case ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH: + return 2; + default: + OPENVINO_THROW("Incorrect queue priority."); + } +} + +static inline size_t toWorkloadVal(const std::optional& val) { + if (!val.has_value()) { + return 0; + } + + switch (*val) { + case ZE_WORKLOAD_TYPE_DEFAULT: + return 1; + case ZE_WORKLOAD_TYPE_BACKGROUND: + return 2; + default: + OPENVINO_THROW("Incorrect workload type."); + } +} + static inline ze_command_queue_priority_t toZeQueuePriority(const ov::hint::Priority& val) { switch (val) { case ov::hint::Priority::LOW: @@ -63,6 +93,17 @@ static inline ze_command_queue_priority_t toZeQueuePriority(const ov::hint::Prio } } +static inline ze_command_queue_workload_type_t toZeQueueWorkloadType(const ov::WorkloadType& val) { + switch (val) { + case ov::WorkloadType::DEFAULT: + return ZE_WORKLOAD_TYPE_DEFAULT; + case ov::WorkloadType::EFFICIENT: + return ZE_WORKLOAD_TYPE_BACKGROUND; + default: + OPENVINO_THROW("Unknown value for WorkloadType."); + } +} + static inline std::size_t precisionToSize(const ze_graph_argument_precision_t val) { switch (val) { case ZE_GRAPH_ARGUMENT_PRECISION_INT4: diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp index c2041d678b0c42..76fff5008c314c 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_wrappers.hpp @@ -6,6 +6,8 @@ #include +#include + #include "intel_npu/utils/logger/logger.hpp" #include "intel_npu/utils/zero/zero_init.hpp" #include "intel_npu/utils/zero/zero_types.hpp" @@ -15,6 +17,12 @@ namespace intel_npu { class CommandList; class CommandQueue; +struct CommandQueueDesc { + ze_command_queue_priority_t priority; + ze_command_queue_workload_type_t workload; + bool turbo; +}; + class EventPool { public: EventPool() = delete; @@ -61,7 +69,7 @@ class CommandList { public: friend class CommandQueue; CommandList() = delete; - CommandList(const std::shared_ptr& initStructs, + CommandList(const std::shared_ptr& init_structs, const uint32_t& group_ordinal, bool mtci_is_supported = false); CommandList(const CommandList&) = delete; @@ -85,7 +93,7 @@ class CommandList { } private: - std::shared_ptr _initStructs; + std::shared_ptr _init_structs; Logger _log; @@ -96,7 +104,7 @@ class CommandList { class Fence { public: Fence() = delete; - Fence(const CommandQueue& command_queue); + Fence(const std::shared_ptr& command_queue); Fence(const Fence&) = delete; Fence(Fence&&) = delete; Fence& operator=(const Fence&) = delete; @@ -110,6 +118,8 @@ class Fence { } private: + std::shared_ptr _command_queue; + ze_fence_handle_t _handle = nullptr; Logger _log; @@ -118,10 +128,9 @@ class Fence { class CommandQueue { public: CommandQueue() = delete; - CommandQueue(const std::shared_ptr& initStructs, - const ze_command_queue_priority_t& priority, - const uint32_t& group_ordinal, - bool turbo = false); + CommandQueue(const std::shared_ptr& init_structs, + const CommandQueueDesc desc, + const uint32_t& group_ordinal); CommandQueue(const CommandQueue&) = delete; CommandQueue(CommandQueue&&) = delete; CommandQueue& operator=(const CommandQueue&) = delete; @@ -129,18 +138,44 @@ class CommandQueue { void executeCommandList(CommandList& command_list) const; void executeCommandList(CommandList& command_list, Fence& fence) const; - void setWorkloadType(ze_command_queue_workload_type_t workloadType) const; + void setWorkloadType(ze_command_queue_workload_type_t workload_type) const; ~CommandQueue(); inline ze_command_queue_handle_t handle() const { return _handle; } private: - std::shared_ptr _initStructs; + std::shared_ptr _init_structs; Logger _log; ze_command_queue_handle_t _handle = nullptr; }; +class CommandQueuePool { +public: + CommandQueuePool(); + CommandQueuePool(const CommandQueuePool& other) = delete; + CommandQueuePool(CommandQueuePool&& other) = delete; + void operator=(const CommandQueuePool&) = delete; + void operator=(CommandQueuePool&&) = delete; + + static CommandQueuePool& getInstance(); + + std::shared_ptr getCommandQueue(const std::shared_ptr& init_structs, + const ze_command_queue_priority_t& priority, + const ze_command_queue_workload_type_t& workload_type, + const uint32_t& group_ordinal, + bool turbo); + +private: + int computeHash(CommandQueueDesc desc); + + std::unordered_map> _pool; + + Logger _log; + + std::mutex _mutex; +}; + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp index 4868d6326c5fe4..8cb67f50ff8955 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp @@ -14,105 +14,116 @@ EventPool::EventPool(ze_device_handle_t device_handle, const ze_context_handle_t nullptr, ZE_EVENT_POOL_FLAG_HOST_VISIBLE, event_count}; - THROW_ON_FAIL_FOR_LEVELZERO("zeEventPoolCreate", - zeEventPoolCreate(context, &event_pool_desc, 1, &device_handle, &_handle)); + auto result = zeEventPoolCreate(context, &event_pool_desc, 1, &device_handle, &_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeEventPoolCreate", result); } EventPool::~EventPool() { auto result = zeEventPoolDestroy(_handle); if (ZE_RESULT_SUCCESS != result) { _log.error("zeEventPoolDestroy failed %#X", uint64_t(result)); } + + _handle = nullptr; } Event::Event(const std::shared_ptr& event_pool, uint32_t event_index) : _event_pool(event_pool), _log("Event", Logger::global().level()) { ze_event_desc_t event_desc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, event_index, 0, 0}; - THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", zeEventCreate(_event_pool->handle(), &event_desc, &_handle)); + auto result = zeEventCreate(_event_pool->handle(), &event_desc, &_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeEventCreate", result); } void Event::AppendSignalEvent(CommandList& command_list) const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendSignalEvent", - zeCommandListAppendSignalEvent(command_list.handle(), _handle)); + auto result = zeCommandListAppendSignalEvent(command_list.handle(), _handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendSignalEvent", result); } void Event::AppendWaitOnEvent(CommandList& command_list) { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendWaitOnEvents", - zeCommandListAppendWaitOnEvents(command_list.handle(), 1, &_handle)); + auto result = zeCommandListAppendWaitOnEvents(command_list.handle(), 1, &_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendWaitOnEvents", result); } void Event::AppendEventReset(CommandList& command_list) const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendEventReset", - zeCommandListAppendEventReset(command_list.handle(), _handle)); + auto result = zeCommandListAppendEventReset(command_list.handle(), _handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendEventReset", result); } void Event::hostSynchronize() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeEventHostSynchronize", zeEventHostSynchronize(_handle, UINT64_MAX)); + auto result = zeEventHostSynchronize(_handle, UINT64_MAX); + THROW_ON_FAIL_FOR_LEVELZERO("zeEventHostSynchronize", result); } void Event::reset() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeEventHostReset", zeEventHostReset(_handle)); + auto result = zeEventHostReset(_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeEventHostReset", result); } Event::~Event() { auto result = zeEventDestroy(_handle); if (ZE_RESULT_SUCCESS != result) { _log.error("zeEventDestroy failed %#X", uint64_t(result)); } + + _handle = nullptr; } -CommandList::CommandList(const std::shared_ptr& initStructs, +CommandList::CommandList(const std::shared_ptr& init_structs, const uint32_t& group_ordinal, bool mtci_is_supported) - : _initStructs(initStructs), + : _init_structs(init_structs), _log("CommandList", Logger::global().level()) { ze_mutable_command_list_exp_desc_t mutable_desc = {ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_LIST_EXP_DESC, nullptr, 0}; ze_command_list_desc_t desc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, &mutable_desc, group_ordinal, 0}; - THROW_ON_FAIL_FOR_LEVELZERO( - "zeCommandListCreate", - zeCommandListCreate(_initStructs->getContext(), _initStructs->getDevice(), &desc, &_handle)); + auto result = zeCommandListCreate(_init_structs->getContext(), _init_structs->getDevice(), &desc, &_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListCreate", result); if (mtci_is_supported) { ze_mutable_command_id_exp_desc_t mutableCmdIdDesc = {ZE_STRUCTURE_TYPE_MUTABLE_COMMAND_ID_EXP_DESC, nullptr, ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT}; - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListGetNextCommandIdExp", - zeCommandListGetNextCommandIdExp(_handle, &mutableCmdIdDesc, &_command_id)); + result = zeCommandListGetNextCommandIdExp(_handle, &mutableCmdIdDesc, &_command_id); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListGetNextCommandIdExp", result); } } void CommandList::reset() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListReset", zeCommandListReset(_handle)); + auto result = zeCommandListReset(_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListReset", result); } void CommandList::appendMemoryCopy(void* dst, const void* src, const std::size_t size) const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendMemoryCopy", - zeCommandListAppendMemoryCopy(_handle, dst, src, size, nullptr, 0, nullptr)); + auto result = zeCommandListAppendMemoryCopy(_handle, dst, src, size, nullptr, 0, nullptr); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendMemoryCopy", result); } void CommandList::appendGraphInitialize(const ze_graph_handle_t& graph_handle) const { ze_result_t result = - _initStructs->getGraphDdiTable().pfnAppendGraphInitialize(_handle, graph_handle, nullptr, 0, nullptr); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnAppendGraphInitialize", result, _initStructs->getGraphDdiTable()); + _init_structs->getGraphDdiTable().pfnAppendGraphInitialize(_handle, graph_handle, nullptr, 0, nullptr); + THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnAppendGraphInitialize", result, _init_structs->getGraphDdiTable()); } void CommandList::appendGraphExecute(const ze_graph_handle_t& graph_handle, const ze_graph_profiling_query_handle_t& profiling_query_handle) const { - ze_result_t result = _initStructs->getGraphDdiTable() + ze_result_t result = _init_structs->getGraphDdiTable() .pfnAppendGraphExecute(_handle, graph_handle, profiling_query_handle, nullptr, 0, nullptr); - THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnAppendGraphExecute", result, _initStructs->getGraphDdiTable()); + THROW_ON_FAIL_FOR_LEVELZERO_EXT("pfnAppendGraphExecute", result, _init_structs->getGraphDdiTable()); } void CommandList::appendNpuTimestamp(uint64_t* timestamp_buff) const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendWriteGlobalTimestamp", - zeCommandListAppendWriteGlobalTimestamp(_handle, timestamp_buff, nullptr, 0, nullptr)); + auto result = zeCommandListAppendWriteGlobalTimestamp(_handle, timestamp_buff, nullptr, 0, nullptr); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendWriteGlobalTimestamp", result); } void CommandList::appendBarrier() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendBarrier", zeCommandListAppendBarrier(_handle, nullptr, 0, nullptr)); + auto result = zeCommandListAppendBarrier(_handle, nullptr, 0, nullptr); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListAppendBarrier", result); } void CommandList::close() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListClose", zeCommandListClose(_handle)); + auto result = zeCommandListClose(_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListClose", result); } CommandList::~CommandList() { auto result = zeCommandListDestroy(_handle); if (ZE_RESULT_SUCCESS != result) { _log.error("zeCommandListDestroy failed %#X", uint64_t(result)); } + + _handle = nullptr; } void CommandList::updateMutableCommandList(uint32_t arg_index, const void* arg_value) const { ze_mutable_graph_argument_exp_desc_t desc = { - (ZE_MAJOR_VERSION(_initStructs->getZeDrvApiVersion()) > 1 || - (ZE_MAJOR_VERSION(_initStructs->getZeDrvApiVersion()) == 1 && - ZE_MINOR_VERSION(_initStructs->getZeDrvApiVersion()) >= 11)) + (ZE_MAJOR_VERSION(_init_structs->getZeDrvApiVersion()) > 1 || + (ZE_MAJOR_VERSION(_init_structs->getZeDrvApiVersion()) == 1 && + ZE_MINOR_VERSION(_init_structs->getZeDrvApiVersion()) >= 11)) ? ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC : static_cast(ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC_DEPRECATED), nullptr, @@ -124,45 +135,55 @@ void CommandList::updateMutableCommandList(uint32_t arg_index, const void* arg_v &desc, 0}; - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListUpdateMutableCommandsExp", - zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t)); + auto result = zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListUpdateMutableCommandsExp", result); } -CommandQueue::CommandQueue(const std::shared_ptr& initStructs, - const ze_command_queue_priority_t& priority, - const uint32_t& group_ordinal, - bool turbo) - : _initStructs(initStructs), +CommandQueue::CommandQueue(const std::shared_ptr& init_structs, + const CommandQueueDesc desc, + const uint32_t& group_ordinal) + : _init_structs(init_structs), _log("CommandQueue", Logger::global().level()) { - ze_command_queue_desc_t queue_desc = - {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, group_ordinal, 0, 0, ZE_COMMAND_QUEUE_MODE_DEFAULT, priority}; + ze_command_queue_desc_t queue_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + nullptr, + group_ordinal, + 0, + 0, + ZE_COMMAND_QUEUE_MODE_DEFAULT, + desc.priority}; - if (turbo) { - if (_initStructs->getCommandQueueDdiTable().version()) { - ze_command_queue_desc_npu_ext_t turbo_cfg = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC_NPU_EXT, nullptr, turbo}; + if (desc.turbo) { + if (_init_structs->getCommandQueueDdiTable().version()) { + ze_command_queue_desc_npu_ext_t turbo_cfg = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC_NPU_EXT, + nullptr, + desc.turbo}; queue_desc.pNext = &turbo_cfg; } else { OPENVINO_THROW("Turbo is not supported by the current driver"); } } - THROW_ON_FAIL_FOR_LEVELZERO( - "zeCommandQueueCreate", - zeCommandQueueCreate(_initStructs->getContext(), _initStructs->getDevice(), &queue_desc, &_handle)); + auto result = zeCommandQueueCreate(_init_structs->getContext(), _init_structs->getDevice(), &queue_desc, &_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandQueueCreate", result); + + if (_init_structs->getCommandQueueDdiTable().version()) { + auto result = _init_structs->getCommandQueueDdiTable().pfnSetWorkloadType(_handle, desc.workload); + THROW_ON_FAIL_FOR_LEVELZERO("zeSetWorkloadType", result); + } } void CommandQueue::executeCommandList(CommandList& command_list) const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandQueueExecuteCommandLists", - zeCommandQueueExecuteCommandLists(_handle, 1, &command_list._handle, nullptr)); + auto result = zeCommandQueueExecuteCommandLists(_handle, 1, &command_list._handle, nullptr); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandQueueExecuteCommandLists", result); } void CommandQueue::executeCommandList(CommandList& command_list, Fence& fence) const { - THROW_ON_FAIL_FOR_LEVELZERO("zeCommandQueueExecuteCommandLists", - zeCommandQueueExecuteCommandLists(_handle, 1, &command_list._handle, fence.handle())); + auto result = zeCommandQueueExecuteCommandLists(_handle, 1, &command_list._handle, fence.handle()); + THROW_ON_FAIL_FOR_LEVELZERO("zeCommandQueueExecuteCommandLists", result); } -void CommandQueue::setWorkloadType(ze_command_queue_workload_type_t workloadType) const { - if (_initStructs->getCommandQueueDdiTable().version()) { - THROW_ON_FAIL_FOR_LEVELZERO("zeSetWorkloadType", - _initStructs->getCommandQueueDdiTable().pfnSetWorkloadType(_handle, workloadType)); +void CommandQueue::setWorkloadType(ze_command_queue_workload_type_t workload_type) const { + if (_init_structs->getCommandQueueDdiTable().version()) { + auto result = _init_structs->getCommandQueueDdiTable().pfnSetWorkloadType(_handle, workload_type); + THROW_ON_FAIL_FOR_LEVELZERO("zeSetWorkloadType", result); } else { OPENVINO_THROW("The WorkloadType property is not supported by the current Driver Version!"); } @@ -173,23 +194,71 @@ CommandQueue::~CommandQueue() { if (ZE_RESULT_SUCCESS != result) { _log.error("zeCommandQueueDestroy failed %#X", uint64_t(result)); } + + _handle = nullptr; } -Fence::Fence(const CommandQueue& command_queue) : _log("Fence", Logger::global().level()) { +Fence::Fence(const std::shared_ptr& command_queue) + : _command_queue(command_queue), + _log("Fence", Logger::global().level()) { ze_fence_desc_t fence_desc = {ZE_STRUCTURE_TYPE_FENCE_DESC, nullptr, 0}; - THROW_ON_FAIL_FOR_LEVELZERO("zeFenceCreate", zeFenceCreate(command_queue.handle(), &fence_desc, &_handle)); + auto result = zeFenceCreate(command_queue->handle(), &fence_desc, &_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeFenceCreate", result); } void Fence::reset() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeFenceReset", zeFenceReset(_handle)); + auto result = zeFenceReset(_handle); + THROW_ON_FAIL_FOR_LEVELZERO("zeFenceReset", result); } void Fence::hostSynchronize() const { - THROW_ON_FAIL_FOR_LEVELZERO("zeFenceHostSynchronize", zeFenceHostSynchronize(_handle, UINT64_MAX)); + auto result = zeFenceHostSynchronize(_handle, UINT64_MAX); + THROW_ON_FAIL_FOR_LEVELZERO("zeFenceHostSynchronize", result); } Fence::~Fence() { auto result = zeFenceDestroy(_handle); if (ZE_RESULT_SUCCESS != result) { _log.error("zeFenceDestroy failed %#X", uint64_t(result)); } + + _handle = nullptr; +} + +CommandQueuePool::CommandQueuePool() : _log("CommandQueue", Logger::global().level()) {} +int CommandQueuePool::computeHash(CommandQueueDesc desc) { + return (static_cast(desc.priority) & 0xFF) | (static_cast(desc.workload) & 0xFF) << 8 | + (desc.turbo << 16); +} +CommandQueuePool& CommandQueuePool::getInstance() { + static CommandQueuePool instance; + return instance; +} +std::shared_ptr CommandQueuePool::getCommandQueue( + const std::shared_ptr& init_structs, + const ze_command_queue_priority_t& priority, + const ze_command_queue_workload_type_t& workload_type, + const uint32_t& group_ordinal, + bool turbo) { + CommandQueueDesc desc = {priority, workload_type, turbo}; + + int hash = computeHash(desc); + + std::lock_guard lock(_mutex); + if (_pool.find(hash) != _pool.end()) { + // found one weak pointer in the pool + // is it valid? + auto obj = _pool.at(hash).lock(); + if (obj) { + _log.debug("Get Command Queue"); + return obj; + } + } // otherwise create a new object + + _log.debug("Create Command Queue"); + auto new_obj = std::make_shared(init_structs, desc, group_ordinal); + + auto pair = std::make_pair(hash, new_obj); + _pool.emplace(pair); + + return new_obj; } } // namespace intel_npu diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp index f45e30bb109849..50c49d883f1d0b 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.cpp @@ -37,6 +37,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, ::testing::ValuesIn(configsInferRequestRunTests)), InferRequestRunTests::getTestCaseName); +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTest, + InferRunTestsOnNewerDrivers, + ::testing::Combine(::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(configsInferRequestRunTests)), + InferRequestRunTests::getTestCaseName); + const std::vector batchingConfigs = { {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN)}, {ov::log::level(ov::log::Level::WARNING), ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::COMPILER)}, diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp index 31b55704757b01..7935f3d39c9157 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp @@ -962,6 +962,37 @@ TEST_P(SetShapeInferRunTests, checkResultsAfterIOBlobReallocation) { } } +using InferRunTestsOnNewerDrivers = InferRequestRunTests; + +TEST_P(InferRunTestsOnNewerDrivers, MultipleCompiledModelsTestsSyncInfers) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + const int no_of_iterations = 256; + std::array compiled_models; + + for (int i = 0; i < no_of_iterations; ++i) { + OV_ASSERT_NO_THROW(compiled_models[i] = core->compile_model(ov_model, target_device, configuration)); + } + + std::array infer_reqs; + std::array infer_reqs_threads; + for (int i = 0; i < no_of_iterations; ++i) { + OV_ASSERT_NO_THROW(infer_reqs[i] = compiled_models[i].create_infer_request()); + } + + for (int i = 0; i < no_of_iterations; ++i) { + infer_reqs_threads[i] = std::thread([&compiled_models, &infer_reqs, i]() -> void { + OV_ASSERT_NO_THROW(infer_reqs[i].infer()); + infer_reqs[i] = {}; + compiled_models[i] = {}; + }); + } + + for (int i = 0; i < no_of_iterations; ++i) { + infer_reqs_threads[i].join(); + } +} + } // namespace behavior } // namespace test } // namespace ov diff --git a/src/plugins/intel_npu/tests/functional/behavior/ov_infer_request/compile_and_infer.cpp b/src/plugins/intel_npu/tests/functional/behavior/ov_infer_request/compile_and_infer.cpp index 5a77908adabd0c..b0318d9b8f25f7 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/ov_infer_request/compile_and_infer.cpp +++ b/src/plugins/intel_npu/tests/functional/behavior/ov_infer_request/compile_and_infer.cpp @@ -31,4 +31,11 @@ INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTests, {ov::intel_npu::defer_weights_load(false)}})), ov::test::utils::appendPlatformTypeTestName); +INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTests, + OVCompileAndInferRequesOnNewerDrivers, + ::testing::Combine(::testing::Values(getConstantGraph(ov::element::f32)), + ::testing::Values(ov::test::utils::DEVICE_NPU), + ::testing::ValuesIn(configs)), + ov::test::utils::appendPlatformTypeTestName); + } // namespace diff --git a/src/plugins/intel_npu/tests/functional/internal/overload/compile_and_infer.hpp b/src/plugins/intel_npu/tests/functional/internal/overload/compile_and_infer.hpp index e44329c5de56c8..6066b4d42089bf 100644 --- a/src/plugins/intel_npu/tests/functional/internal/overload/compile_and_infer.hpp +++ b/src/plugins/intel_npu/tests/functional/internal/overload/compile_and_infer.hpp @@ -5,8 +5,11 @@ #include #include +#include #include +#include #include +#include #include "base/ov_behavior_test_utils.hpp" #include "intel_npu/config/common.hpp" @@ -163,7 +166,6 @@ TEST_P(OVCompileAndInferRequest, CompiledModelWorkloadTypeDelayedExecutor) { OV_ASSERT_NO_THROW(execNet = core->compile_model(function, target_device, configuration)); ov::AnyMap modelConfiguration; modelConfiguration[workload_type.name()] = WorkloadType::DEFAULT; - OV_ASSERT_NO_THROW(execNet.set_property(modelConfiguration)); if (isCommandQueueExtSupported()) { ov::InferRequest req; @@ -177,7 +179,7 @@ TEST_P(OVCompileAndInferRequest, CompiledModelWorkloadTypeDelayedExecutor) { OV_ASSERT_NO_THROW(req.wait()); ASSERT_TRUE(is_called); } else { - OV_EXPECT_THROW_HAS_SUBSTRING(execNet.create_infer_request(), + OV_EXPECT_THROW_HAS_SUBSTRING(execNet.set_property(modelConfiguration), ov::Exception, "WorkloadType property is not supported by the current Driver Version!"); } @@ -206,6 +208,60 @@ TEST_P(OVCompileAndInferRequest, CompiledModelWorkloadTypeUpdateAfterCompilation } } +TEST_P(OVCompileAndInferRequest, CompiledModelWorkloadTypeUpdateAfterCompilationWithMultipleInfers) { + if (isCommandQueueExtSupported()) { + OV_ASSERT_NO_THROW(execNet = core->compile_model(function, target_device, configuration)); + + auto secondCompiledModel = core->compile_model(function, target_device, configuration); + + ov::InferRequest req1, req2, req3; + OV_ASSERT_NO_THROW(req1 = execNet.create_infer_request()); + OV_ASSERT_NO_THROW(req3 = secondCompiledModel.create_infer_request()); + bool isCalled = false; + OV_ASSERT_NO_THROW(req1.set_callback([&](std::exception_ptr exception_ptr) { + ASSERT_EQ(exception_ptr, nullptr); + isCalled = true; + })); + OV_ASSERT_NO_THROW(req1.start_async()); + OV_ASSERT_NO_THROW(req1.wait()); + ASSERT_TRUE(isCalled); + + OV_ASSERT_NO_THROW(req3.infer()); + + req1 = {}; + + ov::AnyMap modelConfiguration; + modelConfiguration[workload_type.name()] = WorkloadType::EFFICIENT; + OV_ASSERT_NO_THROW(execNet.set_property(modelConfiguration)); + ASSERT_EQ(execNet.get_property(workload_type.name()).as(), WorkloadType::EFFICIENT); + OV_ASSERT_NO_THROW(req2 = execNet.create_infer_request()) + OV_ASSERT_NO_THROW(req2.infer()); + + modelConfiguration[workload_type.name()] = WorkloadType::DEFAULT; + OV_ASSERT_NO_THROW(execNet.set_property(modelConfiguration)); + ASSERT_EQ(execNet.get_property(workload_type.name()).as(), WorkloadType::DEFAULT); + isCalled = false; + OV_ASSERT_NO_THROW(req2.set_callback([&](std::exception_ptr exception_ptr) { + ASSERT_EQ(exception_ptr, nullptr); + isCalled = true; + })); + OV_ASSERT_NO_THROW(req2.start_async()); + OV_ASSERT_NO_THROW(req2.wait()); + ASSERT_TRUE(isCalled); + + req2 = {}; + req3 = {}; + + OV_ASSERT_NO_THROW(req1 = execNet.create_infer_request()); + OV_ASSERT_NO_THROW(req2 = secondCompiledModel.create_infer_request()); + OV_ASSERT_NO_THROW(req1.infer()); + OV_ASSERT_NO_THROW(req3 = execNet.create_infer_request()); + OV_ASSERT_NO_THROW(req2.infer()); + OV_ASSERT_NO_THROW(req3.infer()); + OV_ASSERT_NO_THROW(req3.infer()); + } +} + using OVCompileAndInferRequestTurbo = OVCompileAndInferRequest; TEST_P(OVCompileAndInferRequestTurbo, CompiledModelTurbo) { @@ -247,6 +303,86 @@ TEST_P(OVCompileAndInferRequestTurbo, CompiledModelTurbo) { } } +using OVCompileAndInferRequesOnNewerDrivers = OVCompileAndInferRequest; + +TEST_P(OVCompileAndInferRequesOnNewerDrivers, MultipleCompiledModelsTestsSyncInfers) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + auto supportedProperties = core->get_property("NPU", supported_properties.name()).as>(); + bool isTurboSupported = + std::any_of(supportedProperties.begin(), supportedProperties.end(), [](const PropertyName& property) { + return property == intel_npu::turbo.name(); + }); + + if (isCommandQueueExtSupported()) { + ASSERT_TRUE(isTurboSupported); + + const int no_of_iterations = 256; + std::array compiled_models; + + for (int i = 0; i < no_of_iterations; ++i) { + if (i % 4) { + configuration[intel_npu::turbo.name()] = false; + } else { + configuration[intel_npu::turbo.name()] = true; + } + + if (i % 5 == 1) { + configuration[workload_type.name()] = WorkloadType::DEFAULT; + } else if (i % 5 == 2) { + configuration[workload_type.name()] = WorkloadType::EFFICIENT; + } + + if (i % 3 == 0) { + configuration[ov::hint::model_priority.name()] = ov::hint::Priority::LOW; + } else if (i % 3 == 1) { + configuration[ov::hint::model_priority.name()] = ov::hint::Priority::MEDIUM; + } else if (i % 3 == 2) { + configuration[ov::hint::model_priority.name()] = ov::hint::Priority::HIGH; + } + + OV_ASSERT_NO_THROW(compiled_models[i] = core->compile_model(function, target_device, configuration)); + } + + std::array infer_reqs; + std::array infer_reqs_threads; + for (int i = 0; i < no_of_iterations; ++i) { + OV_ASSERT_NO_THROW(infer_reqs[i] = compiled_models[i].create_infer_request()); + } + + for (int i = 0; i < no_of_iterations; ++i) { + infer_reqs_threads[i] = std::thread([&compiled_models, &infer_reqs, i]() -> void { + OV_ASSERT_NO_THROW(infer_reqs[i].infer()); + + ov::AnyMap modelConfiguration; + if (i % 5 == 0) { + modelConfiguration[workload_type.name()] = WorkloadType::DEFAULT; + OV_ASSERT_NO_THROW(compiled_models[i].set_property(modelConfiguration)); + } else if (i % 5 == 1) { + modelConfiguration[workload_type.name()] = WorkloadType::EFFICIENT; + OV_ASSERT_NO_THROW(compiled_models[i].set_property(modelConfiguration)); + } else if (i % 5 == 2) { + modelConfiguration[workload_type.name()] = WorkloadType::DEFAULT; + OV_ASSERT_NO_THROW(compiled_models[i].set_property(modelConfiguration)); + } else if (i % 5 == 3) { + modelConfiguration[workload_type.name()] = WorkloadType::EFFICIENT; + OV_ASSERT_NO_THROW(compiled_models[i].set_property(modelConfiguration)); + } + + OV_ASSERT_NO_THROW(infer_reqs[i].infer()); + + infer_reqs[i] = {}; + compiled_models[i] = {}; + }); + } + + for (int i = 0; i < no_of_iterations; ++i) { + infer_reqs_threads[i].join(); + } + } +} + } // namespace behavior } // namespace test } // namespace ov