Merge branch 'master' into mateuszm/op/emb/poc
mlukasze authored May 21, 2024
2 parents 3698088 + 415ba28 commit 02cc178
Showing 5 changed files with 140 additions and 9 deletions.
16 changes: 15 additions & 1 deletion src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
@@ -111,7 +111,21 @@ inline ov::Shape predict_shape(const std::string& name, const cldnn::layout layo
return layout.get_shape();
}

void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::stream& stream);
/// WA: Force exit. Any OpenCL API call can hang after CL_OUT_OF_RESOURCES.
inline void ForceExit() {
    std::cerr << "[GPU] force exit.\n"
              << "\tDue to the driver bug, any subsequent OpenCL API call will cause the application to hang, "
              << "so the GPU plugin can't finish correctly.\n"
              << "\tPlease try to update the driver or reduce memory consumption "
              << "(use a smaller batch size, fewer streams, lower precision, etc.) "
              << "to avoid the CL_OUT_OF_RESOURCES exception" << std::endl;
    std::_Exit(-1);
}

void convert_and_copy(const ov::ITensor* src,
                      cldnn::memory::ptr dst,
                      cldnn::stream& stream,
                      const cldnn::layout& src_layout = cldnn::layout({}, ov::element::undefined, cldnn::format::bfyx, cldnn::padding()));
void convert_and_copy(const cldnn::memory::ptr src, ov::ITensor const* dst, const cldnn::stream& stream);
void convert_and_copy(const ov::ITensor* src, ov::ITensor const* dst, const cldnn::stream& stream);
void convert_and_copy(const cldnn::memory::ptr src, cldnn::memory::ptr dst, cldnn::stream& stream);
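
For context, a minimal sketch of where a guard like ForceExit() might be invoked. The call site below is hypothetical and only illustrates the intent: do_gpu_work() is a placeholder, and catching cl::Error assumes the OpenCL C++ bindings are built with exceptions enabled. It is not part of this commit.

// Hypothetical call site, illustration only.
void do_gpu_work();  // stands in for any OpenCL-backed plugin call

void guarded_gpu_call() {
    try {
        do_gpu_work();
    } catch (const cl::Error& err) {
        // After CL_OUT_OF_RESOURCES the driver may hang on any further OpenCL
        // call, so terminate instead of attempting a normal shutdown.
        if (err.err() == CL_OUT_OF_RESOURCES) {
            ForceExit();  // helper from common_utils.hpp above
        }
        throw;
    }
}
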
4 changes: 2 additions & 2 deletions src/plugins/intel_gpu/src/plugin/common_utils.cpp
@@ -141,7 +141,7 @@ bool data_types_are_supported(const ov::Node* node) {
return true;
}

void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::stream& stream) {
void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::stream& stream, const cldnn::layout& src_layout) {
const bool blocking = true;
auto src_et = src->get_element_type();
auto dst_et = dst->get_layout().data_type;
@@ -158,7 +158,7 @@ void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::str

size_t size = ov::shape_size(src->get_shape());
ov::Tensor tmp_tensor(dst_et, src->get_shape());
::convert_and_copy(src->data(), src_et, tmp_tensor.data(), dst_et, size, cldnn::layout({}, ov::element::undefined, cldnn::format::bfyx, cldnn::padding()));
::convert_and_copy(src->data(), src_et, tmp_tensor.data(), dst_et, size, src_layout);
dst->copy_from(stream, tmp_tensor.data(), blocking);
}

42 changes: 37 additions & 5 deletions src/plugins/intel_gpu/src/plugin/variable_state.cpp
@@ -11,7 +11,6 @@
#include "intel_gpu/runtime/memory_caps.hpp"
#include "intel_gpu/runtime/layout.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"

#include <memory>

namespace ov {
@@ -57,11 +56,44 @@ void VariableState::set_layout(const cldnn::layout& new_layout) {
}

void VariableState::set_state(const ov::SoPtr<ov::ITensor>& state) {
m_layout.set_partial_shape(state->get_shape());
size_t rank = state->get_shape().size();
m_layout.data_padding = cldnn::padding(std::vector<int32_t>(rank, 0), std::vector<int32_t>(rank, 0), 0, m_layout.data_padding.get_dynamic_pad_dims());
auto src_shape = state->get_shape();
size_t src_rank = src_shape.size();
m_layout.data_padding = cldnn::padding(std::vector<int32_t>(src_rank, 0),
                                       std::vector<int32_t>(src_rank, 0),
                                       0,
                                       m_layout.data_padding.get_dynamic_pad_dims());
auto src_stride = state->get_strides();
for (size_t i = 0; i < src_rank; ++i) {
    src_stride[i] = src_stride[i] / (state->get_element_type().bitwidth() / 8);
}
m_layout.set_partial_shape(src_shape);
update_device_buffer();
convert_and_copy(state._ptr.get(), m_memory, m_context->get_engine().get_service_stream());

// check whether the src tensor is padded
std::vector<size_t> src_stride_no_pad(src_rank, 1);
std::vector<int32_t> upper_pad(std::max<size_t>(src_rank, 4), 0);
std::vector<int32_t> lower_pad(std::max<size_t>(src_rank, 4), 0);
for (int32_t i = static_cast<int32_t>(src_stride.size()) - 1; i >= 0; --i) {
    if (i <= static_cast<int32_t>(src_stride.size()) - 2)
        src_stride_no_pad[i] = src_stride_no_pad[i + 1] * src_shape[i + 1];
    if (src_stride[i] != src_stride_no_pad[i]) {
        OPENVINO_ASSERT(src_stride[i] > src_stride_no_pad[i]);
        size_t padded_size = src_stride[i] / src_stride[i + 1];
        size_t non_padded_size = src_stride_no_pad[i] / src_stride_no_pad[i + 1];
        int32_t pad_dim_legacy = i + 1;
        if (pad_dim_legacy >= 2) {
            int32_t spatial_axis = pad_dim_legacy - 2;
            int32_t spatial_size = std::max<int32_t>(static_cast<int32_t>(src_rank), 4) - 2;
            pad_dim_legacy = spatial_size - spatial_axis - 1 + 2;
        }
        upper_pad[pad_dim_legacy] = static_cast<int32_t>(padded_size) - static_cast<int32_t>(non_padded_size);
    }
}
cldnn::padding src_padd = cldnn::padding(lower_pad, upper_pad, 0.f);
auto src_fmt = cldnn::format::get_default_format(src_rank);
auto src_layout = cldnn::layout(ov::PartialShape(src_shape), state->get_element_type(), src_fmt, src_padd);

convert_and_copy(state._ptr.get(), m_memory, m_context->get_engine().get_service_stream(), src_layout);
set();
}
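
The stride-based padding detection above is easier to see with concrete numbers. Below is a self-contained sketch of the same arithmetic for the shapes used in the functional test further down (a {1, 3, 2, 4} view cut out of a {1, 3, 4, 4} f32 buffer); it is an illustration only, not plugin code.

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    // f32 view of shape {1, 3, 2, 4} taken from a {1, 3, 4, 4} parent buffer:
    // strides reported in bytes are {192, 64, 16, 4}, i.e. {48, 16, 4, 1} in elements.
    std::vector<std::size_t> shape  = {1, 3, 2, 4};
    std::vector<std::size_t> stride = {48, 16, 4, 1};

    // dense (no-pad) strides computed from the shape, innermost dimension first
    std::vector<std::size_t> dense(shape.size(), 1);
    for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
        dense[i] = dense[i + 1] * shape[i + 1];
    // dense == {24, 8, 4, 1}

    // the mismatch at axis 1 (16 vs 8) means the parent holds 16 / 4 = 4 rows
    // where only 8 / 4 = 2 are visible, i.e. an upper pad of 2 on axis 2
    assert(stride[1] / stride[2] - dense[1] / dense[2] == 2);
    return 0;
}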

25 changes: 24 additions & 1 deletion src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
@@ -131,10 +131,33 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}

// This is a temporary workaround to avoid a severe performance drop.
// It will be removed after the OpenCL compiler is updated.
auto need_separate_batch = [&](std::string& unique_kernel_name) -> bool {
    const std::vector<std::string> special_kernels = {"gemm_tiled_opt"};

    // check if the current kernel name is one of the special kernels
    for (auto& special_kernel : special_kernels) {
        if (entry_point.find(special_kernel) != std::string::npos)
            return true;
    }

    // check if the current batch already contains one of the special kernels
    if (current_bucket.back().kernels_counter == 1) {
        auto& kernel_in_current_batch = current_bucket.back().entry_point_to_id.begin()->first;
        for (auto& special_kernel : special_kernels) {
            if (kernel_in_current_batch.find(special_kernel) != std::string::npos)
                return true;
        }
    }
    return false;
};

// Create new kernels batch when the limit is reached
// and current kernel's entry_point is duplicated in this kernels batch
if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()
|| current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end()) {
|| current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end()
|| need_separate_batch(entry_point)) {
const auto& batch_id = static_cast<int32_t>(current_bucket.size());
current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
}
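
As a rough, standalone illustration of the batching policy this introduces (a toy model only: the kernel names are made up, and the real code keys off entry_point_to_id, kernels_counter, and the per-batch kernel limit):

#include <iostream>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> kernels = {
        "fully_connected_gpu_bf_tiled_1", "gemm_tiled_opt_2", "eltwise_blocked_3"};
    const std::string special = "gemm_tiled_opt";

    // start a new batch whenever a "special" kernel would be mixed with others
    std::vector<std::vector<std::string>> batches{{}};
    for (const auto& k : kernels) {
        const bool k_is_special = k.find(special) != std::string::npos;
        const bool batch_has_special =
            !batches.back().empty() &&
            batches.back().front().find(special) != std::string::npos;
        if (!batches.back().empty() && (k_is_special || batch_has_special))
            batches.emplace_back();
        batches.back().push_back(k);
    }
    std::cout << "number of batches: " << batches.size() << "\n";  // prints 3
    return 0;
}
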
62 changes: 62 additions & 0 deletions src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp
@@ -274,6 +274,68 @@ TEST(VariablesTest, smoke_set_get_state_with_convert) {
ov::test::utils::compare(tensor_to_set, state_tensor, 1e-5f, 1e-5f);
}

TEST(VariablesTest, smoke_padded_tensor_set_get_state_with_convert) {
auto build_model = [](ov::element::Type type, const ov::PartialShape& shape) {
auto param = std::make_shared<ov::op::v0::Parameter>(type, shape);
const ov::op::util::VariableInfo variable_info { shape, type, "v0" };
auto variable = std::make_shared<ov::op::util::Variable>(variable_info);
auto read_value = std::make_shared<ov::op::v6::ReadValue>(param, variable);
auto add = std::make_shared<ov::op::v1::Add>(read_value, param);
auto assign = std::make_shared<ov::op::v6::Assign>(add, variable);
auto res = std::make_shared<ov::op::v0::Result>(add);
return std::make_shared<ov::Model>(ov::ResultVector { res }, ov::SinkVector { assign }, ov::ParameterVector{param}, "StateTestModel");
};

auto ov = ov::Core();
const ov::Shape virable_shape_padded = {1, 3, 4, 4};
const ov::Shape virable_shape = {1, 3, 2, 4};
const ov::Shape input_shape = {1, 3, 2, 4};
const ov::element::Type et = ov::element::f32;
auto model = build_model(et, input_shape);
auto compiled_model = ov.compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16));
auto request = compiled_model.create_infer_request();

auto variables = request.query_state();
ASSERT_EQ(variables.size(), 1);
auto variable = variables.front();
ASSERT_EQ(variable.get_name(), "v0");
auto state_tensor = variable.get_state();
ASSERT_EQ(state_tensor.get_shape(), virable_shape);
ASSERT_EQ(state_tensor.get_element_type(), et);

auto tensor_to_set_padded = ov::test::utils::create_and_fill_tensor(et, virable_shape_padded);

// trim original tensor
auto tensor_to_set =
ov::Tensor(tensor_to_set_padded, ov::Coordinate{0, 0, 0, 0}, ov::Coordinate(virable_shape));

variable.set_state(tensor_to_set);
state_tensor = variable.get_state();

auto res_tensor_ptr = static_cast<float*>(state_tensor.data());
auto ref_tensor_ptr = static_cast<float*>(tensor_to_set.data());
auto ref_stride = tensor_to_set.get_strides();
auto res_stride = state_tensor.get_strides();
for (size_t i = 0; i < ref_stride.size(); ++i) {
    ref_stride[i] /= (tensor_to_set.get_element_type().bitwidth() / 8);
    res_stride[i] /= (state_tensor.get_element_type().bitwidth() / 8);
}
// ref stride: [48, 16, 4, 1]
// res stride: [24, 8, 4, 1]
// compare actual tensor w/o pad
for (size_t b = 0; b < virable_shape[0]; ++b) {
    for (size_t f = 0; f < virable_shape[1]; ++f) {
        for (size_t y = 0; y < virable_shape[2]; ++y) {
            for (size_t x = 0; x < virable_shape[3]; ++x) {
                auto ref_idx = b * ref_stride[0] + f * ref_stride[1] + y * ref_stride[2] + x * ref_stride[3];
                auto res_idx = b * res_stride[0] + f * res_stride[1] + y * res_stride[2] + x * res_stride[3];
                ASSERT_EQ(res_tensor_ptr[res_idx], ref_tensor_ptr[ref_idx]);
            }
        }
    }
}
}
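
For reference, the stride values in the comments above ([48, 16, 4, 1] for the padded view vs [24, 8, 4, 1] for the dense state) can be reproduced with public ov::Tensor APIs alone; a standalone sketch (strides are reported in bytes, hence the division by sizeof(float)):

#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    ov::Tensor parent(ov::element::f32, ov::Shape{1, 3, 4, 4});
    // take the {1, 3, 2, 4} region of interest; it keeps the parent's strides
    ov::Tensor view(parent, ov::Coordinate{0, 0, 0, 0}, ov::Coordinate{1, 3, 2, 4});

    for (auto s : view.get_strides())
        std::cout << s / sizeof(float) << ' ';  // prints: 48 16 4 1
    std::cout << std::endl;
    return 0;
}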

TEST(TensorTest, smoke_outputTensorShapesForDynamicInput) {
auto core = ov::Core();
using namespace ov::preprocess;