Adding support for handling batching on the plugin
pereanub committed Apr 8, 2024
1 parent edc30e6 commit 3facb5a
Showing 18 changed files with 373 additions and 93 deletions.
@@ -360,4 +360,25 @@ struct COMPILATION_NUM_THREADS final : OptionBase<COMPILATION_NUM_THREADS, int32
}
};

//
// BATCH_MODE
//
struct BATCH_MODE final : OptionBase<BATCH_MODE, ov::intel_npu::BatchMode> {
static std::string_view key() {
return ov::intel_npu::batch_mode.name();
}

static constexpr std::string_view getTypeName() {
return "ov::intel_npu::BatchMode";
}

static ov::intel_npu::BatchMode defaultValue() {
return ov::intel_npu::BatchMode::AUTO;
}

static ov::intel_npu::BatchMode parse(std::string_view val);

static std::string toString(const ov::intel_npu::BatchMode& val);
};

} // namespace intel_npu
45 changes: 45 additions & 0 deletions src/plugins/intel_npu/src/al/include/npu_private_properties.hpp
@@ -151,6 +151,43 @@ inline std::ostream& operator<<(std::ostream& out, const ElfCompilerBackend& fmt
return out;
}

/**
* @brief [Only for NPU Plugin]
* Type: String. Default is "AUTO".
* This option selects how batching is handled: automatically, by the compiler, or by the plugin.
* Possible values: "AUTO", "COMPILER", "PLUGIN".
*/
enum class BatchMode {
AUTO = 0,
COMPILER = 1,
PLUGIN = 2,
};

/**
* @brief Prints a string representation of ov::intel_npu::BatchMode to a stream
* @param out An output stream to send to
* @param fmt A BatchMode value to print to the stream
* @return A reference to the `out` stream
* @note Configuration API v 2.0
*/
inline std::ostream& operator<<(std::ostream& out, const BatchMode& fmt) {
switch (fmt) {
case BatchMode::AUTO: {
out << "AUTO";
} break;
case BatchMode::COMPILER: {
out << "COMPILER";
} break;
case BatchMode::PLUGIN: {
out << "PLUGIN";
} break;
default:
out << static_cast<uint32_t>(fmt);
break;
}
return out;
}

/**
* @brief [Only for NPU Plugin]
* Type: string, default is MODEL.
@@ -321,6 +358,14 @@ static constexpr ov::Property<ProfilingType> profiling_type{"NPU_PROFILING_TYPE"
*/
static constexpr ov::Property<ElfCompilerBackend> use_elf_compiler_backend{"NPU_USE_ELF_COMPILER_BACKEND"};

/**
* @brief [Only for NPU Plugin]
* Type: String. Default is "AUTO".
* This option enables handling of batching on the plugin; otherwise, batching is handled by the compiler.
* Possible values: "AUTO", "PLUGIN", "COMPILER".
*/
static constexpr ov::Property<BatchMode> batch_mode{"NPU_BATCH_MODE"};
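For illustration only (not part of this commit): a minimal sketch of how a caller could request plugin-side batching through the property above, assuming the standard OpenVINO 2.0 Core API and a hypothetical "model.xml" path.

#include <openvino/openvino.hpp>
#include "npu_private_properties.hpp"  // private header that declares ov::intel_npu::batch_mode

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // hypothetical model path
    // Ask the NPU plugin, rather than the compiler, to handle the batch dimension.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN));
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}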

/**
* @brief [Only for NPU Plugin]
* Type: integer, default is 1
25 changes: 25 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/compiler.cpp
@@ -24,6 +24,7 @@ void intel_npu::registerCompilerOptions(OptionsDesc& desc) {
desc.add<DMA_ENGINES>();
desc.add<USE_ELF_COMPILER_BACKEND>();
desc.add<DYNAMIC_SHAPE_TO_STATIC>();
desc.add<BATCH_MODE>();
}

//
@@ -119,3 +120,27 @@ std::string intel_npu::USE_ELF_COMPILER_BACKEND::toString(const ov::intel_npu::E

return strStream.str();
}

//
// BATCH_MODE
//

ov::intel_npu::BatchMode intel_npu::BATCH_MODE::parse(std::string_view val) {
if (val == "AUTO") {
return ov::intel_npu::BatchMode::AUTO;
} else if (val == "COMPILER") {
return ov::intel_npu::BatchMode::COMPILER;
} else if (val == "PLUGIN") {
return ov::intel_npu::BatchMode::PLUGIN;
}

OPENVINO_THROW("Value '{0}' is not a valid BATCH_TYPE option", val);
}

std::string intel_npu::BATCH_MODE::toString(const ov::intel_npu::BatchMode& val) {
std::stringstream strStream;

strStream << val;

return strStream.str();
}
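As a quick illustration (not part of this commit) of how the two helpers above are expected to behave, assuming <cassert> is available:

// Round-tripping a value through toString() and parse() should be lossless.
const auto mode = intel_npu::BATCH_MODE::parse("PLUGIN");   // ov::intel_npu::BatchMode::PLUGIN
assert(intel_npu::BATCH_MODE::toString(mode) == "PLUGIN");
assert(intel_npu::BATCH_MODE::parse(intel_npu::BATCH_MODE::toString(mode)) == mode);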
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/backend/CMakeLists.txt
@@ -24,7 +24,7 @@ target_include_directories(${TARGET_NAME}
target_link_libraries(${TARGET_NAME}
PRIVATE
openvino::npu_al
openvino_npu_zero_result_parser
openvino_npu_zero_utils
ze_loader
)

@@ -46,6 +46,10 @@ class ZeroInferRequest final : public SyncInferRequest {
zeroProfiling::ProfilingQuery _profiling_query;
std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
std::unique_ptr<Pipeline> _pipeline;

// If batching is handled on the compiler side, the plugin-side batch size is set to 1 and no
// batch-specific handling is done in the plugin.
size_t _batch_size = 1;
};

} // namespace intel_npu
12 changes: 6 additions & 6 deletions src/plugins/intel_npu/src/backend/include/zero_memory.hpp
@@ -88,16 +88,16 @@ class HostMemAllocator final {
static const std::size_t _alignment = STANDARD_PAGE_SIZE;
};

// For graph arguments (inputs and outputs) memory should be located on a host side. For discrete HW
// generation the arguments has to be moved to device side to make it accessible.
// MemoryManagementUnit allow to keeps device allocations in case of discrete HW.
// Usage: we should append graph arguments with corresponding names with `appendArgument` call
// to prepare size statistics and lookup table. To commit memory allocation we should call `allocate`
// Graph arguments (inputs and outputs) need to be allocated in the host memory.
// For discrete platforms, graph arguments need to be copied into the device memory.
// MemoryManagementUnit is used to allocate memory in the device memory.
// Usage: we should append graph arguments with their corresponding names via `appendArgument` to prepare size
// statistics and the lookup table. To commit the memory allocation we should call `allocate`.
struct MemoryManagementUnit {
MemoryManagementUnit() = default;

void appendArgument(const std::string& name, const ze_graph_argument_properties_t& argument);
/* Allocate Device memories */

void allocate(const ze_device_handle_t device_handle, const ze_context_handle_t context);

std::size_t getSize() const;
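The comment above describes the intended calling sequence; a short illustrative sketch (not part of this commit), assuming valid Level Zero device/context handles and argument properties queried from the graph:

zeroMemory::MemoryManagementUnit mmu;
// Register every graph argument first so the unit can accumulate size statistics
// and build its name-to-offset lookup table.
mmu.appendArgument("input_0", inputArgProperties);    // hypothetical ze_graph_argument_properties_t
mmu.appendArgument("output_0", outputArgProperties);  // hypothetical ze_graph_argument_properties_t
// Commit a single device-side allocation covering all registered arguments.
mmu.allocate(device_handle, context);                 // hypothetical ze_device_handle_t / ze_context_handle_t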
9 changes: 5 additions & 4 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -20,9 +20,9 @@ struct Pipeline {
Pipeline& operator=(Pipeline&&) = delete;
virtual ~Pipeline() = default;

virtual void push() = 0;
virtual void pull() = 0;
virtual void reset() const = 0;
virtual void push(const size_t batch_index) = 0;
virtual void pull(const size_t batch_index) = 0;
virtual void reset(const size_t batch_index) const = 0;

protected:
zeroMemory::MemoryManagementUnit _deviceInputs;
@@ -34,5 +34,6 @@ std::unique_ptr<Pipeline> makePipeline(const std::shared_ptr<const IExecutor>& e
zeroProfiling::ProfilingPool& profiling_pool,
zeroProfiling::ProfilingQuery& profiling_query,
std::shared_ptr<zeroProfiling::NpuInferProfiling> npu_profiling,
std::unordered_map<std::string, std::shared_ptr<ov::ITensor>>& tensors);
std::unordered_map<std::string, std::shared_ptr<ov::ITensor>>& tensors,
const size_t batch_size);
} // namespace intel_npu
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/backend/include/zero_utils.hpp
@@ -10,7 +10,7 @@

#include "intel_npu/al/config/runtime.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_result.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"

namespace intel_npu {

63 changes: 52 additions & 11 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -40,11 +40,6 @@ void check_level_zero_attributes_match(const IONodeDescriptor& nodeDescriptor,
"Given: " + std::to_string(ovDimensions.size()));
}

for (size_t index = 0; index < ovDimensions.size(); ++index) {
if (ovDimensions[index] != zeDescriptor.info.dims[index] && !nodeDescriptor.originalShape.is_dynamic()) {
OPENVINO_THROW("Shape mismatch for parameter " + name);
}
}
for (size_t index = ovDimensions.size(); index < ZE_MAX_GRAPH_ARGUMENT_DIMENSIONS_SIZE; ++index) {
if (zeDescriptor.info.dims[index] != 0 && zeDescriptor.info.dims[index] != 1) {
OPENVINO_THROW("Shape mismatch for parameter " + name);
@@ -91,12 +86,34 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
return std::find(container.begin(), container.end(), value) != container.end();
};

auto optionValue = config.get<BATCH_MODE>();
if (optionValue != ov::intel_npu::BatchMode::COMPILER) {
const std::vector<size_t>& ovDimensions =
_metadata.parameters.at(_metadata.inputNames[0]).originalShape.get_shape();
const ZeroExecutor::ArgumentDescriptor& zeExecutor = executorInputDescriptors.at(_metadata.inputNames[0]);
switch (zeExecutor.info.deviceLayout) {
case ZE_GRAPH_ARGUMENT_LAYOUT_NCHW:
case ZE_GRAPH_ARGUMENT_LAYOUT_NHWC:
case ZE_GRAPH_ARGUMENT_LAYOUT_NCDHW:
case ZE_GRAPH_ARGUMENT_LAYOUT_NDHWC:
case ZE_GRAPH_ARGUMENT_LAYOUT_NC:
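// If the compiled graph still reports the original batch size (and it is not 1), the
// compiler is handling batching; otherwise the plugin takes over and later runs the
// pipeline once per batch element on batch-1 buffers.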
if ((ovDimensions[0] == zeExecutor.info.dims[0]) && (ovDimensions[0] != 1)) {
_logger.warning("Batching on the plugin is not used, batching is handled by the compiler");
} else {
_batch_size = ovDimensions[0];
}
break;
default:
_logger.warning("Batching on the plugin is working only when batching is found on 0th dimension");
}
}

for (const std::string& inputName : _metadata.inputNames) {
if (!executorInputDescriptors.count(inputName)) {
OPENVINO_THROW("Invalid graph input descriptor key: " + inputName);
}

const IONodeDescriptor& parameterDescriptor = _metadata.parameters.at(inputName);
IONodeDescriptor parameterDescriptor = _metadata.parameters.at(inputName);
check_level_zero_attributes_match(parameterDescriptor, executorInputDescriptors.at(inputName), inputName);

ov::Allocator allocator;
@@ -106,6 +123,12 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
allocator = zeroMemory::HostMemAllocator(backendPtr);
}

// When batching is handled by the plugin, we need to restore the original batch size in the transposed shape,
// since it was forced to 1 at compilation time.
if (_batch_size > 1) {
parameterDescriptor.transposedShape[0] = _batch_size;
}

// The I/O buffers already allocated using the Level Zero API are being reused here
allocate_tensor(inputName, parameterDescriptor, TensorType::InputOrOutput, allocator);

@@ -127,11 +150,17 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
OPENVINO_THROW("Invalid graph output descriptor key: " + outputName);
}

const IONodeDescriptor& resultDescriptor = _metadata.results.at(outputName);
IONodeDescriptor resultDescriptor = _metadata.results.at(outputName);
check_level_zero_attributes_match(resultDescriptor, executorOutputDescriptors.at(outputName), outputName);

auto allocator = zeroMemory::HostMemAllocator(backendPtr);

// When batching is handled by the plugin, we need to restore the original batch size in the transposed shape,
// since it was forced to 1 at compilation time.
if (_batch_size > 1) {
resultDescriptor.transposedShape[0] = _batch_size;
}

allocate_tensor(outputName, resultDescriptor, TensorType::InputOrOutput, allocator);

if (contains(_metadata.shapeNames, outputName)) {
@@ -174,7 +203,13 @@
}

/// Construct pipeline
_pipeline = makePipeline(_executorPtr, _config, _profiling_pool, _profiling_query, _npu_profiling, _copyAllTensors);
_pipeline = makePipeline(_executorPtr,
_config,
_profiling_pool,
_profiling_query,
_npu_profiling,
_copyAllTensors,
_batch_size);
}

void ZeroInferRequest::infer() {
@@ -212,13 +247,17 @@ void ZeroInferRequest::infer_async() {
}
}

_pipeline->push();
for (size_t i = 0; i < _batch_size; i++) {
_pipeline->push(i);
}
}

void ZeroInferRequest::get_result() {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "get_result");

_pipeline->pull();
for (size_t i = 0; i < _batch_size; i++) {
_pipeline->pull(i);
}

for (const auto& name : _outputAndStateOutputNames) {
const auto& outputTensor = _allTensors.at(name);
@@ -249,7 +288,9 @@ }
}
}

_pipeline->reset();
for (size_t i = 0; i < _batch_size; i++) {
_pipeline->reset(i);
}
_logger.debug("InferRequest::get_result finished");
}
