diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp
index 69756b02e2d9ff..deb25d17618347 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/permute/permute_kernel_f_y_axes.cpp
@@ -214,6 +214,8 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {
     const auto& params = dynamic_cast<const permute_params&>(p);
     const auto& in = params.inputs[0];
     const auto in_layout = in.GetLayout();
+    const auto& out = params.outputs[0];
+    const auto& out_layout = out.GetLayout();
 
     const auto feature_div = GetDivisor(in.Feature().v);
     const auto y_div = GetDivisor(in.Y().v);
@@ -227,6 +229,10 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {
         return false;
     }
 
+    if (in_layout != out_layout) {
+        return false;
+    }
+
     // Accept only supported blocked layouts and SIMD sizes.
     if (!SimpleLayout(in_layout)) {
         const auto feature_block_size = GetFeatureBlockSize(params);
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp
index 9688ebe227071f..db9777a3a613d9 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp
@@ -2175,6 +2175,76 @@ TEST(permute_gpu_f32_dynamic, bfyx_0_2_3_1) {
     }
 }
 
+TEST(permute_f_y_axes_fallback, b_fs_yx_fsv16) {
+    constexpr size_t array_size = 128;
+
+    auto& engine = get_test_engine();
+    if (!engine.get_device_info().supports_immad)
+        return;
+
+    auto input_layout_static = layout{ov::PartialShape{1, 8, 16, 1}, data_types::f32, format::bfyx};
+    auto input = engine.allocate_memory(input_layout_static);
+
+    std::vector<float> input_data;
+    input_data.reserve(array_size);
+    for (size_t i = 0; i < array_size; ++i)
+        input_data.push_back(static_cast<float>(i));
+
+    auto weights = engine.allocate_memory({ data_types::f32, format::bfyx, { 8, 16, 1, 1 } });
+
+    std::vector<float> weights_data;
+    weights_data.reserve(array_size);
+    for (size_t i = 0; i < array_size; ++i)
+        weights_data.push_back(static_cast<float>(1.0));
+
+    set_values(weights, weights_data);
+    set_values(input, input_data);
+
+    auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::b_fs_yx_fsv16, "", impl_types::onednn};
+    auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}};
+
+    topology topology;
+    topology.add(input_layout("input", input_layout_static));
+    topology.add(permute("permute", input_info("input"), { 0, 2, 1, 3 }));
+    topology.add(data("weights", weights));
+    topology.add(convolution("conv", input_info("permute"), "weights", "", 1, {1,1}, {1,1}, {0,0}, {0,0}, false));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));
+
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "conv");
+
+    auto output = outputs.begin()->second.get_memory();
+
+    float answers[] = {
+        120.f, 120.f, 120.f, 120.f, 120.f, 120.f, 120.f, 120.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        376.f, 376.f, 376.f, 376.f, 376.f, 376.f, 376.f, 376.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        632.f, 632.f, 632.f, 632.f, 632.f, 632.f, 632.f, 632.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        888.f, 888.f, 888.f, 888.f, 888.f, 888.f, 888.f, 888.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        1144.f, 1144.f, 1144.f, 1144.f, 1144.f, 1144.f, 1144.f, 1144.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        1400.f, 1400.f, 1400.f, 1400.f, 1400.f, 1400.f, 1400.f, 1400.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        1656.f, 1656.f, 1656.f, 1656.f, 1656.f, 1656.f, 1656.f, 1656.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+        1912.f, 1912.f, 1912.f, 1912.f, 1912.f, 1912.f, 1912.f, 1912.f,
+        0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
+    };
+
+    cldnn::mem_lock<float> output_ptr(output, get_test_stream());
+    for (size_t i = 0; i < array_size; i++) {
+        ASSERT_FLOAT_EQ(answers[i], output_ptr[i]);
+    }
+}
+
 class permute_bfzyx_to_bfyxz: public TiledPermuteTest {};
 
 INSTANTIATE_TEST_SUITE_P(, permute_bfzyx_to_bfyxz,