diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
index 79806c8298a82e..80fac9d10d9f5b 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/common_utils.hpp
@@ -111,7 +111,21 @@ inline ov::Shape predict_shape(const std::string& name, const cldnn::layout layo
     return layout.get_shape();
 }
 
-void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::stream& stream);
+/// WA: Force exit. Any OpenCL API call may hang after CL_OUT_OF_RESOURCES.
+inline void ForceExit() {
+    std::cerr << "[GPU] force exit.\n"
+              << "\tDue to the driver bug any subsequent OpenCL API call will cause application hang, "
+              << "so GPU plugin can't finish correctly.\n"
+              << "\tPlease try to update the driver or reduce memory consumption "
+              << "(use smaller batch size, fewer streams, lower precision, etc.) "
+              << "to avoid CL_OUT_OF_RESOURCES exception" << std::endl;
+    std::_Exit(-1);
+}
+
+void convert_and_copy(const ov::ITensor* src,
+                      cldnn::memory::ptr dst,
+                      cldnn::stream& stream,
+                      const cldnn::layout& src_layout = cldnn::layout({}, ov::element::undefined, cldnn::format::bfyx, cldnn::padding()));
 void convert_and_copy(const cldnn::memory::ptr src, ov::ITensor const* dst, const cldnn::stream& stream);
 void convert_and_copy(const ov::ITensor* src, ov::ITensor const* dst, const cldnn::stream& stream);
 void convert_and_copy(const cldnn::memory::ptr src, cldnn::memory::ptr dst, cldnn::stream& stream);
diff --git a/src/plugins/intel_gpu/src/plugin/common_utils.cpp b/src/plugins/intel_gpu/src/plugin/common_utils.cpp
index 6064b70e6f07d6..9069c8570eadf8 100644
--- a/src/plugins/intel_gpu/src/plugin/common_utils.cpp
+++ b/src/plugins/intel_gpu/src/plugin/common_utils.cpp
@@ -141,7 +141,7 @@ bool data_types_are_supported(const ov::Node* node) {
     return true;
 }
 
-void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::stream& stream) {
+void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::stream& stream, const cldnn::layout& src_layout) {
     const bool blocking = true;
     auto src_et = src->get_element_type();
     auto dst_et = dst->get_layout().data_type;
@@ -158,7 +158,7 @@ void convert_and_copy(const ov::ITensor* src, cldnn::memory::ptr dst, cldnn::str
     size_t size = ov::shape_size(src->get_shape());
     ov::Tensor tmp_tensor(dst_et, src->get_shape());
-    ::convert_and_copy(src->data(), src_et, tmp_tensor.data(), dst_et, size, cldnn::layout({}, ov::element::undefined, cldnn::format::bfyx, cldnn::padding()));
+    ::convert_and_copy(src->data(), src_et, tmp_tensor.data(), dst_et, size, src_layout);
 
     dst->copy_from(stream, tmp_tensor.data(), blocking);
 }
diff --git a/src/plugins/intel_gpu/src/plugin/variable_state.cpp b/src/plugins/intel_gpu/src/plugin/variable_state.cpp
index 81871bbf8fa2fb..4d8cdae95e4a8a 100644
--- a/src/plugins/intel_gpu/src/plugin/variable_state.cpp
+++ b/src/plugins/intel_gpu/src/plugin/variable_state.cpp
@@ -11,7 +11,6 @@
 #include "intel_gpu/runtime/memory_caps.hpp"
 #include "intel_gpu/runtime/layout.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
-
 #include <memory>
 
 namespace ov {
@@ -57,11 +56,44 @@ void VariableState::set_layout(const cldnn::layout& new_layout) {
 }
 
 void VariableState::set_state(const ov::SoPtr<ov::ITensor>& state) {
-    m_layout.set_partial_shape(state->get_shape());
-    size_t rank = state->get_shape().size();
-    m_layout.data_padding = cldnn::padding(std::vector<int32_t>(rank, 0), std::vector<int32_t>(rank, 0), 0, m_layout.data_padding.get_dynamic_pad_dims());
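+    // The incoming state tensor may be a strided (ROI) view over a larger, padded buffer.
+    // Build a source layout that carries this padding so convert_and_copy can read the data correctly.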
+    auto src_shape = state->get_shape();
+    size_t src_rank = src_shape.size();
+    m_layout.data_padding = cldnn::padding(std::vector<int32_t>(src_rank, 0),
+                                           std::vector<int32_t>(src_rank, 0),
+                                           0,
+                                           m_layout.data_padding.get_dynamic_pad_dims());
+    auto src_stride = state->get_strides();
+    for (size_t i = 0; i < src_rank; ++i) {
+        src_stride[i] = src_stride[i] / (state->get_element_type().bitwidth() / 8);
+    }
+    m_layout.set_partial_shape(src_shape);
     update_device_buffer();
-    convert_and_copy(state._ptr.get(), m_memory, m_context->get_engine().get_service_stream());
+
+    // check whether the src tensor is padded, i.e. its strides are larger than the dense strides
+    std::vector<size_t> src_stride_no_pad(src_rank, 1);
+    std::vector<int32_t> upper_pad(std::max<size_t>(src_rank, 4), 0);
+    std::vector<int32_t> lower_pad(std::max<size_t>(src_rank, 4), 0);
+    for (int32_t i = static_cast<int32_t>(src_stride.size()) - 1; i >= 0; --i) {
+        if (i <= static_cast<int32_t>(src_stride.size()) - 2)
+            src_stride_no_pad[i] = src_stride_no_pad[i + 1] * src_shape[i + 1];
+        if (src_stride[i] != src_stride_no_pad[i]) {
+            OPENVINO_ASSERT(src_stride[i] > src_stride_no_pad[i]);
+            size_t padded_size = src_stride[i] / src_stride[i + 1];
+            size_t non_padded_size = src_stride_no_pad[i] / src_stride_no_pad[i + 1];
+            int32_t pad_dim_legacy = i + 1;
+            if (pad_dim_legacy >= 2) {
+                int32_t spatial_axis = pad_dim_legacy - 2;
+                int32_t spatial_size = std::max(static_cast<int32_t>(src_rank), 4) - 2;
+                pad_dim_legacy = spatial_size - spatial_axis - 1 + 2;
+            }
+            upper_pad[pad_dim_legacy] = static_cast<int32_t>(padded_size) - static_cast<int32_t>(non_padded_size);
+        }
+    }
+    cldnn::padding src_padd = cldnn::padding(lower_pad, upper_pad, 0.f);
+    auto src_fmt = cldnn::format::get_default_format(src_rank);
+    auto src_layout = cldnn::layout(ov::PartialShape(src_shape), state->get_element_type(), src_fmt, src_padd);
+
+    convert_and_copy(state._ptr.get(), m_memory, m_context->get_engine().get_service_stream(), src_layout);
     set();
 }
 
diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
index 6d738b63212f6d..47483c6a1a0192 100644
--- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
+++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp
@@ -131,10 +131,33 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
             current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
         }
 
+        // This is a temporary workaround to avoid a severe performance drop.
+        // It will be removed once the OpenCL compiler is updated.
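+        // Kernels whose entry point matches one of the "special" names below are kept in a
+        // dedicated batch, so they never share an OpenCL program with other kernels.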
+        auto need_separate_batch = [&](std::string& unique_kernel_name) -> bool {
+            const std::vector<std::string> special_kernels = {"gemm_tiled_opt"};
+
+            // check if the current kernel name is in special_kernels
+            for (auto& special_kernel : special_kernels) {
+                if (entry_point.find(special_kernel) != std::string::npos)
+                    return true;
+            }
+
+            // check if the current_batch has one of special_kernels
+            if (current_bucket.back().kernels_counter == 1) {
+                auto& kernel_in_current_batch = current_bucket.back().entry_point_to_id.begin()->first;
+                for (auto& special_kernel : special_kernels) {
+                    if (kernel_in_current_batch.find(special_kernel) != std::string::npos)
+                        return true;
+                }
+            }
+            return false;
+        };
+
         // Create new kernels batch when the limit is reached
         // and current kernel's entry_point is duplicated in this kernels batch
         if (current_bucket.back().kernels_counter >= get_max_kernels_per_batch()
-            || current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end()) {
+            || current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end()
+            || need_separate_batch(entry_point)) {
             const auto& batch_id = static_cast<int32_t>(current_bucket.size());
             current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str));
         }
diff --git a/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp b/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp
index bdd02c1ec822eb..2a29775c6ebd87 100644
--- a/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp
+++ b/src/plugins/intel_gpu/tests/functional/behavior/infer_request.cpp
@@ -274,6 +274,68 @@ TEST(VariablesTest, smoke_set_get_state_with_convert) {
     ov::test::utils::compare(tensor_to_set, state_tensor, 1e-5f, 1e-5f);
 }
 
+TEST(VariablesTest, smoke_padded_tensor_set_get_state_with_convert) {
+    auto build_model = [](ov::element::Type type, const ov::PartialShape& shape) {
+        auto param = std::make_shared<ov::op::v0::Parameter>(type, shape);
+        const ov::op::util::VariableInfo variable_info { shape, type, "v0" };
+        auto variable = std::make_shared<ov::op::util::Variable>(variable_info);
+        auto read_value = std::make_shared<ov::op::v6::ReadValue>(param, variable);
+        auto add = std::make_shared<ov::op::v1::Add>(read_value, param);
+        auto assign = std::make_shared<ov::op::v6::Assign>(add, variable);
+        auto res = std::make_shared<ov::op::v0::Result>(add);
+        return std::make_shared<ov::Model>(ov::ResultVector { res }, ov::SinkVector { assign }, ov::ParameterVector{param}, "StateTestModel");
+    };
+
+    auto ov = ov::Core();
+    const ov::Shape variable_shape_padded = {1, 3, 4, 4};
+    const ov::Shape variable_shape = {1, 3, 2, 4};
+    const ov::Shape input_shape = {1, 3, 2, 4};
+    const ov::element::Type et = ov::element::f32;
+    auto model = build_model(et, input_shape);
+    auto compiled_model = ov.compile_model(model, ov::test::utils::DEVICE_GPU, ov::hint::inference_precision(ov::element::f16));
+    auto request = compiled_model.create_infer_request();
+
+    auto variables = request.query_state();
+    ASSERT_EQ(variables.size(), 1);
+    auto variable = variables.front();
+    ASSERT_EQ(variable.get_name(), "v0");
+    auto state_tensor = variable.get_state();
+    ASSERT_EQ(state_tensor.get_shape(), variable_shape);
+    ASSERT_EQ(state_tensor.get_element_type(), et);
+
+    auto tensor_to_set_padded = ov::test::utils::create_and_fill_tensor(et, variable_shape_padded);
+
+    // trim the original padded tensor: the resulting ROI view keeps the padded strides
+    auto tensor_to_set =
+        ov::Tensor(tensor_to_set_padded, ov::Coordinate{0, 0, 0, 0}, ov::Coordinate(variable_shape));
+
+    variable.set_state(tensor_to_set);
+    state_tensor = variable.get_state();
+
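+    // The returned state is dense while the reference is a strided ROI view, so walk both
+    // buffers with their own element strides instead of comparing memory byte-for-byte.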
+    auto res_tensor_ptr = static_cast<float*>(state_tensor.data());
+    auto ref_tensor_ptr = static_cast<float*>(tensor_to_set.data());
+    auto ref_stride = tensor_to_set.get_strides();
+    auto res_stride = state_tensor.get_strides();
+    for (size_t i = 0; i < ref_stride.size(); ++i) {
+        ref_stride[i] /= (tensor_to_set.get_element_type().bitwidth() / 8);
+        res_stride[i] /= (state_tensor.get_element_type().bitwidth() / 8);
+    }
+    // ref stride: [48, 16, 4, 1]
+    // res stride: [24, 8, 4, 1]
+    // compare the actual (unpadded) region element by element
+    for (size_t b = 0; b < variable_shape[0]; ++b) {
+        for (size_t f = 0; f < variable_shape[1]; ++f) {
+            for (size_t y = 0; y < variable_shape[2]; ++y) {
+                for (size_t x = 0; x < variable_shape[3]; ++x) {
+                    auto ref_idx = b * ref_stride[0] + f * ref_stride[1] + y * ref_stride[2] + x * ref_stride[3];
+                    auto res_idx = b * res_stride[0] + f * res_stride[1] + y * res_stride[2] + x * res_stride[3];
+                    ASSERT_EQ(res_tensor_ptr[res_idx], ref_tensor_ptr[ref_idx]);
+                }
+            }
+        }
+    }
+}
+
 TEST(TensorTest, smoke_outputTensorShapesForDynamicInput) {
     auto core = ov::Core();
     using namespace ov::preprocess;