Adding support for handling batching on the plugin
pereanub committed Apr 8, 2024
1 parent edc30e6 commit 3facb5a
Showing 18 changed files with 373 additions and 93 deletions.
@@ -360,4 +360,25 @@ struct COMPILATION_NUM_THREADS final : OptionBase<COMPILATION_NUM_THREADS, int32
}
};

//
// BATCH_MODE
//
struct BATCH_MODE final : OptionBase<BATCH_MODE, ov::intel_npu::BatchMode> {
static std::string_view key() {
return ov::intel_npu::batch_mode.name();
}

static constexpr std::string_view getTypeName() {
return "ov::intel_npu::BatchMode";
}

static ov::intel_npu::BatchMode defaultValue() {
return ov::intel_npu::BatchMode::AUTO;
}

static ov::intel_npu::BatchMode parse(std::string_view val);

static std::string toString(const ov::intel_npu::BatchMode& val);
};

} // namespace intel_npu
45 changes: 45 additions & 0 deletions src/plugins/intel_npu/src/al/include/npu_private_properties.hpp
@@ -151,6 +151,43 @@ inline std::ostream& operator<<(std::ostream& out, const ElfCompilerBackend& fmt
return out;
}

/**
* @brief [Only for NPU Plugin]
* Type: String. Default is "AUTO".
* This option selects how batching is handled: automatically, by the compiler, or by the plugin.
* Possible values: "AUTO", "COMPILER", "PLUGIN".
*/
enum class BatchMode {
AUTO = 0,
COMPILER = 1,
PLUGIN = 2,
};

/**
* @brief Prints a string representation of ov::intel_npu::BatchMode to a stream
* @param out An output stream to send to
* @param fmt A BatchMode value to print to the stream
* @return A reference to the `out` stream
* @note Configuration API v 2.0
*/
inline std::ostream& operator<<(std::ostream& out, const BatchMode& fmt) {
switch (fmt) {
case BatchMode::AUTO: {
out << "AUTO";
} break;
case BatchMode::COMPILER: {
out << "COMPILER";
} break;
case BatchMode::PLUGIN: {
out << "PLUGIN";
} break;
default:
out << static_cast<uint32_t>(fmt);
break;
}
return out;
}

/**
* @brief [Only for NPU Plugin]
* Type: string, default is MODEL.
@@ -321,6 +358,14 @@ static constexpr ov::Property<ProfilingType> profiling_type{"NPU_PROFILING_TYPE"
*/
static constexpr ov::Property<ElfCompilerBackend> use_elf_compiler_backend{"NPU_USE_ELF_COMPILER_BACKEND"};

/**
* @brief [Only for NPU Plugin]
* Type: String. Default is "AUTO".
* This option enables handling of batching on the plugin; otherwise, batching is handled by the compiler.
* Possible values: "AUTO", "PLUGIN", "COMPILER".
*/
static constexpr ov::Property<BatchMode> batch_mode{"NPU_BATCH_MODE"};
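For illustration only (not part of this commit): a minimal sketch of how a caller could request plugin-side batching through the property above, assuming the standard OpenVINO 2.0 Core API and a hypothetical "model.xml" path.

#include <openvino/openvino.hpp>
#include "npu_private_properties.hpp"  // private header that declares ov::intel_npu::batch_mode

int main() {
    ov::Core core;
    const auto model = core.read_model("model.xml");  // hypothetical model path
    // Ask the NPU plugin, rather than the compiler, to handle the batch dimension.
    auto compiled = core.compile_model(model, "NPU",
                                       ov::intel_npu::batch_mode(ov::intel_npu::BatchMode::PLUGIN));
    auto request = compiled.create_infer_request();
    request.infer();
    return 0;
}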

/**
* @brief [Only for NPU Plugin]
* Type: integer, default is 1
25 changes: 25 additions & 0 deletions src/plugins/intel_npu/src/al/src/config/compiler.cpp
@@ -24,6 +24,7 @@ void intel_npu::registerCompilerOptions(OptionsDesc& desc) {
desc.add<DMA_ENGINES>();
desc.add<USE_ELF_COMPILER_BACKEND>();
desc.add<DYNAMIC_SHAPE_TO_STATIC>();
desc.add<BATCH_MODE>();
}

//
@@ -119,3 +120,27 @@ std::string intel_npu::USE_ELF_COMPILER_BACKEND::toString(const ov::intel_npu::E

return strStream.str();
}

//
// BATCH_MODE
//

ov::intel_npu::BatchMode intel_npu::BATCH_MODE::parse(std::string_view val) {
if (val == "AUTO") {
return ov::intel_npu::BatchMode::AUTO;
} else if (val == "COMPILER") {
return ov::intel_npu::BatchMode::COMPILER;
} else if (val == "PLUGIN") {
return ov::intel_npu::BatchMode::PLUGIN;
}

OPENVINO_THROW("Value '{0}' is not a valid BATCH_TYPE option", val);
}

std::string intel_npu::BATCH_MODE::toString(const ov::intel_npu::BatchMode& val) {
std::stringstream strStream;

strStream << val;

return strStream.str();
}
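As a quick illustration (not part of this commit) of how the two helpers above are expected to behave, assuming <cassert> is available:

// Round-tripping a value through toString() and parse() should be lossless.
const auto mode = intel_npu::BATCH_MODE::parse("PLUGIN");   // ov::intel_npu::BatchMode::PLUGIN
assert(intel_npu::BATCH_MODE::toString(mode) == "PLUGIN");
assert(intel_npu::BATCH_MODE::parse(intel_npu::BATCH_MODE::toString(mode)) == mode);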
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/backend/CMakeLists.txt
@@ -24,7 +24,7 @@ target_include_directories(${TARGET_NAME}
target_link_libraries(${TARGET_NAME}
PRIVATE
openvino::npu_al
openvino_npu_zero_result_parser
openvino_npu_zero_utils
ze_loader
)

@@ -46,6 +46,10 @@ class ZeroInferRequest final : public SyncInferRequest {
zeroProfiling::ProfilingQuery _profiling_query;
std::shared_ptr<zeroProfiling::NpuInferProfiling> _npu_profiling;
std::unique_ptr<Pipeline> _pipeline;

// If batching is handled on the compiler side, the plugin-side batch size is set to 1 and no
// batch-specific handling is done in the plugin.
size_t _batch_size = 1;
};

} // namespace intel_npu
12 changes: 6 additions & 6 deletions src/plugins/intel_npu/src/backend/include/zero_memory.hpp
@@ -88,16 +88,16 @@ class HostMemAllocator final {
static const std::size_t _alignment = STANDARD_PAGE_SIZE;
};

// For graph arguments (inputs and outputs) memory should be located on a host side. For discrete HW
// generation the arguments has to be moved to device side to make it accessible.
// MemoryManagementUnit allow to keeps device allocations in case of discrete HW.
// Usage: we should append graph arguments with corresponding names with `appendArgument` call
// to prepare size statistics and lookup table. To commit memory allocation we should call `allocate`
// Graph arguments (inputs and outputs) need to be allocated in the host memory.
// For discrete platforms, graph arguments need to be copied into the device memory.
// MemoryManagementUnit is used to allocate memory in the device memory.
// Usage: we should append graph arguments with their corresponding names via `appendArgument` to prepare size
// statistics and the lookup table. To commit the memory allocation we should call `allocate`.
struct MemoryManagementUnit {
MemoryManagementUnit() = default;

void appendArgument(const std::string& name, const ze_graph_argument_properties_t& argument);
/* Allocate Device memories */

void allocate(const ze_device_handle_t device_handle, const ze_context_handle_t context);

std::size_t getSize() const;
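The comment above describes the intended calling sequence; a short illustrative sketch (not part of this commit), assuming valid Level Zero device/context handles and argument properties queried from the graph:

zeroMemory::MemoryManagementUnit mmu;
// Register every graph argument first so the unit can accumulate size statistics
// and build its name-to-offset lookup table.
mmu.appendArgument("input_0", inputArgProperties);    // hypothetical ze_graph_argument_properties_t
mmu.appendArgument("output_0", outputArgProperties);  // hypothetical ze_graph_argument_properties_t
// Commit a single device-side allocation covering all registered arguments.
mmu.allocate(device_handle, context);                 // hypothetical ze_device_handle_t / ze_context_handle_t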
9 changes: 5 additions & 4 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -20,9 +20,9 @@ struct Pipeline {
Pipeline& operator=(Pipeline&&) = delete;
virtual ~Pipeline() = default;

virtual void push() = 0;
virtual void pull() = 0;
virtual void reset() const = 0;
virtual void push(const size_t batch_index) = 0;
virtual void pull(const size_t batch_index) = 0;
virtual void reset(const size_t batch_index) const = 0;

protected:
zeroMemory::MemoryManagementUnit _deviceInputs;
@@ -34,5 +34,6 @@ std::unique_ptr<Pipeline> makePipeline(const std::shared_ptr<const IExecutor>& e
zeroProfiling::ProfilingPool& profiling_pool,
zeroProfiling::ProfilingQuery& profiling_query,
std::shared_ptr<zeroProfiling::NpuInferProfiling> npu_profiling,
std::unordered_map<std::string, std::shared_ptr<ov::ITensor>>& tensors);
std::unordered_map<std::string, std::shared_ptr<ov::ITensor>>& tensors,
const size_t batch_size);
} // namespace intel_npu
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/backend/include/zero_utils.hpp
@@ -10,7 +10,7 @@

#include "intel_npu/al/config/runtime.hpp"
#include "intel_npu/utils/logger/logger.hpp"
#include "intel_npu/utils/zero/zero_result.hpp"
#include "intel_npu/utils/zero/zero_utils.hpp"

namespace intel_npu {

63 changes: 52 additions & 11 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
@@ -40,11 +40,6 @@ void check_level_zero_attributes_match(const IONodeDescriptor& nodeDescriptor,
"Given: " + std::to_string(ovDimensions.size()));
}

for (size_t index = 0; index < ovDimensions.size(); ++index) {
if (ovDimensions[index] != zeDescriptor.info.dims[index] && !nodeDescriptor.originalShape.is_dynamic()) {
OPENVINO_THROW("Shape mismatch for parameter " + name);
}
}
for (size_t index = ovDimensions.size(); index < ZE_MAX_GRAPH_ARGUMENT_DIMENSIONS_SIZE; ++index) {
if (zeDescriptor.info.dims[index] != 0 && zeDescriptor.info.dims[index] != 1) {
OPENVINO_THROW("Shape mismatch for parameter " + name);
@@ -91,12 +86,34 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
return std::find(container.begin(), container.end(), value) != container.end();
};

auto optionValue = config.get<BATCH_MODE>();
if (optionValue != ov::intel_npu::BatchMode::COMPILER) {
const std::vector<size_t>& ovDimensions =
_metadata.parameters.at(_metadata.inputNames[0]).originalShape.get_shape();
const ZeroExecutor::ArgumentDescriptor& zeExecutor = executorInputDescriptors.at(_metadata.inputNames[0]);
switch (zeExecutor.info.deviceLayout) {
case ZE_GRAPH_ARGUMENT_LAYOUT_NCHW:
case ZE_GRAPH_ARGUMENT_LAYOUT_NHWC:
case ZE_GRAPH_ARGUMENT_LAYOUT_NCDHW:
case ZE_GRAPH_ARGUMENT_LAYOUT_NDHWC:
case ZE_GRAPH_ARGUMENT_LAYOUT_NC:
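// If the compiled graph still reports the original batch size (and it is not 1), the
// compiler is handling batching; otherwise the plugin takes over and later runs the
// pipeline once per batch element on batch-1 buffers.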
if ((ovDimensions[0] == zeExecutor.info.dims[0]) && (ovDimensions[0] != 1)) {
_logger.warning("Batching on the plugin is not used, batching is handled by the compiler");
} else {
_batch_size = ovDimensions[0];
}
break;
default:
_logger.warning("Batching on the plugin is working only when batching is found on 0th dimension");
}
}

for (const std::string& inputName : _metadata.inputNames) {
if (!executorInputDescriptors.count(inputName)) {
OPENVINO_THROW("Invalid graph input descriptor key: " + inputName);
}

const IONodeDescriptor& parameterDescriptor = _metadata.parameters.at(inputName);
IONodeDescriptor parameterDescriptor = _metadata.parameters.at(inputName);
check_level_zero_attributes_match(parameterDescriptor, executorInputDescriptors.at(inputName), inputName);

ov::Allocator allocator;
@@ -106,6 +123,12 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
allocator = zeroMemory::HostMemAllocator(backendPtr);
}

// When batching is handled by the plugin, we need to restore the original batch size in the transposed shape,
// since it was forced to 1 at compilation time.
if (_batch_size > 1) {
parameterDescriptor.transposedShape[0] = _batch_size;
}

// The I/O buffers already allocated using the Level Zero API are being reused here
allocate_tensor(inputName, parameterDescriptor, TensorType::InputOrOutput, allocator);

@@ -127,11 +150,17 @@ ZeroInferRequest::ZeroInferRequest(const std::shared_ptr<ZeroInitStructsHolder>&
OPENVINO_THROW("Invalid graph output descriptor key: " + outputName);
}

const IONodeDescriptor& resultDescriptor = _metadata.results.at(outputName);
IONodeDescriptor resultDescriptor = _metadata.results.at(outputName);
check_level_zero_attributes_match(resultDescriptor, executorOutputDescriptors.at(outputName), outputName);

auto allocator = zeroMemory::HostMemAllocator(backendPtr);

// When batching is handled by the plugin, we need to restore the original batch size in the transposed shape,
// since it was forced to 1 at compilation time.
if (_batch_size > 1) {
resultDescriptor.transposedShape[0] = _batch_size;
}

allocate_tensor(outputName, resultDescriptor, TensorType::InputOrOutput, allocator);

if (contains(_metadata.shapeNames, outputName)) {
@@ -174,7 +203,13 @@
}

/// Construct pipeline
_pipeline = makePipeline(_executorPtr, _config, _profiling_pool, _profiling_query, _npu_profiling, _copyAllTensors);
_pipeline = makePipeline(_executorPtr,
_config,
_profiling_pool,
_profiling_query,
_npu_profiling,
_copyAllTensors,
_batch_size);
}

void ZeroInferRequest::infer() {
@@ -212,13 +247,17 @@ void ZeroInferRequest::infer_async() {
}
}

_pipeline->push();
for (size_t i = 0; i < _batch_size; i++) {
_pipeline->push(i);
}
}

void ZeroInferRequest::get_result() {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "get_result");

_pipeline->pull();
for (size_t i = 0; i < _batch_size; i++) {
_pipeline->pull(i);
}

for (const auto& name : _outputAndStateOutputNames) {
const auto& outputTensor = _allTensors.at(name);
@@ -249,7 +288,9 @@ }
}
}

_pipeline->reset();
for (size_t i = 0; i < _batch_size; i++) {
_pipeline->reset(i);
}
_logger.debug("InferRequest::get_result finished");
}
