[GPU] Fix crop primitive execution with dynamic paddings input (#28672)
### Details:
- This change fixes the issue with incorrect `runtime_offset` calculation when the crop can't be optimized out
- Added related tests
sshlyapn authored Feb 4, 2025
1 parent 7e83770 commit e50d722
Showing 2 changed files with 114 additions and 2 deletions.
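
For context, a minimal sketch of the kind of graph that exercises the fixed path: a `VariadicSplit` whose data input carries non-zero padding, so the GPU plugin's resulting crop primitives must actually execute rather than be optimized out. This reproducer is an assumption for illustration — the builder function, names, and shapes below are not taken from this commit; the shapes mirror the example in the code comment in `crop.cpp`.

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/parameter.hpp"
#include "openvino/op/variadic_split.hpp"

// Hypothetical reproducer: VariadicSplit over axis 2 with a dynamic batch
// dimension. When the data input ends up padded at runtime, each split output
// is lowered to a GPU crop primitive that cannot be optimized out and needs
// its runtime_offset computed at execution time.
std::shared_ptr<ov::Model> make_variadic_split_model() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32,
                                                        ov::PartialShape{-1, 32, 128, 8});
    auto axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2});
    auto split_lengths = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {64, 64});
    auto split = std::make_shared<ov::op::v1::VariadicSplit>(data, axis, split_lengths);
    return std::make_shared<ov::Model>(split->outputs(), ov::ParameterVector{data});
}
```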
25 changes: 24 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/crop.cpp
@@ -8,6 +8,8 @@
#include "eltwise/eltwise_kernel_selector.h"
#include "eltwise/eltwise_kernel_base.h"

#include "openvino/core/validation_util.hpp"

namespace cldnn {
namespace ocl {

@@ -55,7 +57,28 @@ struct crop_impl : typed_primitive_impl_ocl<crop> {
        }

        update_shapes(*_kernel_data.params, impl_param);
-        auto runtime_offset = convert_data_tensor(impl_param.get_input_layout(), impl_param.input_offsets[0]).GetFirstElementOffset();
+
+        // The padding sizes are reset to 0 for all dimensions up to and including crop_axis, as the kernel
+        // reads data using "input[GET_INDEX(INPUT, order) + runtime_offset]", where GET_INDEX handles all
+        // paddings before the specified axis. However, the paddings after crop_axis still have to be taken
+        // into account for a correct runtime offset, which requires subtracting input_offset from the
+        // computed value, since the padding of the first element is already covered by the GET_INDEX() call.
+        // For example, for an input shape like [1, 32, 128 (pad_before=512, pad_after=0), 8 (pad_before=4, pad_after=4)]
+        // with crop_axis=2 and split_lengths = {64, 64}, runtime_offset should be set in terms of the
+        // [1, 32, 128 (pad_before=0, pad_after=0), 8 (pad_before=4, pad_after=4)] shape,
+        // so crop.out0's runtime_offset=0 and crop.out1's runtime_offset=1024.
+
+        auto input_layout = impl_param.get_input_layout();
+        auto crop_axis = ov::util::normalize(impl_param.typed_desc<crop>()->axis, static_cast<int64_t>(input_layout.get_partial_shape().size()));
+
+        input_layout.data_padding._dynamic_dims_mask = padding::EMPTY_MASK;
+        for (size_t i = 0; i <= static_cast<size_t>(crop_axis); i++) {
+            input_layout.data_padding._lower_size[i] = 0;
+            input_layout.data_padding._upper_size[i] = 0;
+        }
+
+        auto input_offset = convert_data_tensor(input_layout).GetFirstElementOffset();
+        auto runtime_offset = convert_data_tensor(input_layout, impl_param.input_offsets[0]).GetFirstElementOffset() - input_offset;
        kernel_selector::ScalarDescriptor s;
        s.t = kernel_selector::ScalarDescriptor::Types::UINT32;
        s.v.u32 = static_cast<uint32_t>(runtime_offset);
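
As a sanity check on the arithmetic described in the comment above, here is a small standalone sketch of the offset computation. The `linear_offset` helper is a hypothetical stand-in for `convert_data_tensor(...).GetFirstElementOffset()`, not the actual kernel-selector API; sizes and paddings are taken from the example in the comment.

```cpp
#include <array>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for convert_data_tensor(...).GetFirstElementOffset():
// linear offset of a logical bfyx coordinate inside a padded buffer.
static uint64_t linear_offset(const std::array<int64_t, 4>& coord,
                              const std::array<int64_t, 4>& size,
                              const std::array<int64_t, 4>& pad_before,
                              const std::array<int64_t, 4>& pad_after) {
    uint64_t offset = 0;
    uint64_t stride = 1;
    for (int d = 3; d >= 0; --d) {  // x is innermost, batch is outermost
        offset += static_cast<uint64_t>(coord[d] + pad_before[d]) * stride;
        stride *= static_cast<uint64_t>(pad_before[d] + size[d] + pad_after[d]);
    }
    return offset;
}

int main() {
    // Shape [1, 32, 128, 8] with crop_axis=2: paddings up to and including
    // axis 2 have already been reset, only the x-axis padding (4, 4) remains.
    const std::array<int64_t, 4> size = {1, 32, 128, 8};
    const std::array<int64_t, 4> pad_before = {0, 0, 0, 4};
    const std::array<int64_t, 4> pad_after = {0, 0, 0, 4};

    // Padding of the first element, already handled by GET_INDEX() in the kernel.
    const uint64_t input_offset = linear_offset({0, 0, 0, 0}, size, pad_before, pad_after);

    // crop.out0 starts at y=0, crop.out1 at y=64 (split_lengths = {64, 64}).
    const uint64_t out0 = linear_offset({0, 0, 0, 0}, size, pad_before, pad_after) - input_offset;
    const uint64_t out1 = linear_offset({0, 0, 64, 0}, size, pad_before, pad_after) - input_offset;

    std::printf("out0 runtime_offset = %llu\n", static_cast<unsigned long long>(out0));  // 0
    std::printf("out1 runtime_offset = %llu\n", static_cast<unsigned long long>(out1));  // 1024
    return 0;
}
```

Running this yields `out0 = 0` and `out1 = 1024`, matching the values stated in the comment.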
91 changes: 90 additions & 1 deletion src/plugins/intel_gpu/tests/unit/test_cases/crop_gpu_test.cpp
@@ -97,7 +97,6 @@ TEST(crop_gpu, basic_in2x2x2x3_crop_all) {
auto output = outputs.at("crop").get_memory();
cldnn::mem_lock<float> output_ptr(output, get_test_stream());

printf("Results:\n");
for (int b = 0; b < crop_batch_num; ++b) { //B
for (int f = 0; f < crop_feature_num; ++f) { //F
for (int y = 0; y < crop_y_size; ++y) { //Y
@@ -1477,6 +1476,96 @@ TEST(crop_gpu, dynamic_in1x4x1x1_varaidic_split) {
        ASSERT_EQ(output_ptr_2[i], out2[i]);
}

TEST(crop_gpu, dynamic_input_padding_variadic_split) {
    tests::random_generator rg(GET_SUITE_NAME);
    auto& engine = get_test_engine();

    auto batch_num = 1;
    auto feature_num = 4;
    auto y_size = 128;
    auto x_size = 4;

    auto split_axis = 2;
    auto data_y_pad_axis = 2;
    auto data_x_pad_axis = 3;
    auto input_y_pad_before = 64;
    auto input_y_pad_after = 32;
    auto input_x_pad_before = 8;
    auto input_x_pad_after = 2;

    auto input_dyn_layout = layout{ ov::PartialShape{-1, feature_num, y_size, x_size}, data_types::f32, format::bfyx };
    input_dyn_layout.data_padding._dynamic_dims_mask[data_y_pad_axis] = 1;
    input_dyn_layout.data_padding._lower_size[data_x_pad_axis] = input_x_pad_before;
    input_dyn_layout.data_padding._upper_size[data_x_pad_axis] = input_x_pad_after;

    auto input_actual_layout = layout{ ov::PartialShape{batch_num, feature_num, y_size, x_size}, data_types::f32, format::bfyx };
    input_actual_layout.data_padding._lower_size[data_y_pad_axis] = input_y_pad_before;
    input_actual_layout.data_padding._upper_size[data_y_pad_axis] = input_y_pad_after;
    input_actual_layout.data_padding._lower_size[data_x_pad_axis] = input_x_pad_before;
    input_actual_layout.data_padding._upper_size[data_x_pad_axis] = input_x_pad_after;

    auto input_mem = engine.allocate_memory(input_actual_layout);
    auto axis_mem = engine.allocate_memory({ {}, data_types::i64, format::bfyx });
    auto splits_length_mem = engine.allocate_memory({ {2}, data_types::i64, format::bfyx });

    auto elements_count = input_mem->size() / sizeof(float);
    auto input_data = rg.generate_random_1d<float>(elements_count, -10, 10);

    cldnn::crop_ngraph_op_mode op_mode = cldnn::crop_ngraph_op_mode::variadic_split;
    topology topology;
    topology.add(input_layout("input", input_dyn_layout));
    topology.add(data("split_axis", axis_mem));
    topology.add(data("splits_length", splits_length_mem));
    topology.add(crop("variadic_split.out0", { input_info("input"), input_info("split_axis"), input_info("splits_length") }, tensor(1), tensor(0), op_mode, 0, split_axis));
    topology.add(crop("variadic_split.out1", { input_info("input"), input_info("split_axis"), input_info("splits_length") }, tensor(1), tensor(0), op_mode, 1, split_axis));

    std::vector<int64_t> splits_vec = { 64, 64 };

    set_values(input_mem, input_data);
    set_values(splits_length_mem, splits_vec);
    set_values<int64_t>(axis_mem, {split_axis});

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::custom_outputs(topology.get_primitives_ids()));

    network network(engine, topology, config);
    network.set_input_data("input", input_mem);

    auto check_output = [&](size_t output_idx, cldnn::network_output output) {
        auto y_start = std::accumulate(splits_vec.begin(), splits_vec.begin() + output_idx, 0);
        auto y_size_output = splits_vec[output_idx];

        auto output_layout = output.get_layout();
        auto output_mem = output.get_memory();
        cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
        for (size_t b = 0; b < static_cast<size_t>(batch_num); b++) {
            for (size_t f = 0; f < static_cast<size_t>(feature_num); f++) {
                for (size_t y = 0; y < static_cast<size_t>(y_size_output); y++) {
                    for (size_t x = 0; x < static_cast<size_t>(x_size); x++) {
                        auto input_offset = input_actual_layout.get_linear_offset(cldnn::tensor(static_cast<int32_t>(b),
                                                                                                static_cast<int32_t>(f),
                                                                                                static_cast<int32_t>(x),
                                                                                                static_cast<int32_t>(y + y_start), 0, 0));
                        auto output_offset = output_layout.get_linear_offset(cldnn::tensor(static_cast<int32_t>(b),
                                                                                           static_cast<int32_t>(f),
                                                                                           static_cast<int32_t>(x),
                                                                                           static_cast<int32_t>(y), 0, 0));

                        ASSERT_EQ(input_data[input_offset], output_ptr[output_offset]);
                    }
                }
            }
        }
    };

    auto outputs = network.execute();

    check_output(0, outputs.at("variadic_split.out0"));
    check_output(1, outputs.at("variadic_split.out1"));
}

TEST(crop_gpu, static_split_batch) {
auto& engine = get_test_engine();

