Skip to content

Commit

Permalink
[GPU] Add check to fall back to permute_ref if the format differs for oneDNN (openvinotoolkit#27402)
Browse files Browse the repository at this point in the history

### Details:
If the input and output formats of a permute preceding a oneDNN
convolution are not the same, permute_kernel_f_y_axes does not support
it, so we need to fall back to permute_ref.


### Tickets:
 - *CVS-155933*
  • Loading branch information
clee30 authored Nov 8, 2024
1 parent b416fb0 commit fb5b5ed
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {
const auto& params = dynamic_cast<const permute_params&>(p);
const auto& in = params.inputs[0];
const auto in_layout = in.GetLayout();
const auto& out = params.outputs[0];
const auto& out_layout = out.GetLayout();

const auto feature_div = GetDivisor(in.Feature().v);
const auto y_div = GetDivisor(in.Y().v);
Expand All @@ -227,6 +229,10 @@ bool PermuteKernel_f_y_axes::Validate(const Params& p) const {
return false;
}

if (in_layout != out_layout) {
return false;
}

// Accept only supported blocked layouts and SIMD sizes.
if (!SimpleLayout(in_layout)) {
const auto feature_block_size = GetFeatureBlockSize(params);
Expand Down
70 changes: 70 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/permute_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2175,6 +2175,76 @@ TEST(permute_gpu_f32_dynamic, bfyx_0_2_3_1) {
}
}

TEST(permute_f_y_axes_fallback, b_fs_yx_fsv16) {
constexpr size_t array_size = 128;

auto& engine = get_test_engine();
if (!engine.get_device_info().supports_immad)
return;

auto input_layout_static = layout{ov::PartialShape{1, 8, 16, 1}, data_types::f32, format::bfyx};
auto input = engine.allocate_memory(input_layout_static);

std::vector<float> input_data;
input_data.reserve(array_size);
for (size_t i = 0; i < array_size; ++i)
input_data.push_back(static_cast<float>(i));

auto weights = engine.allocate_memory({ data_types::f32, format::bfyx, { 8, 16, 1, 1 } });

std::vector<float> weights_data;
weights_data.reserve(array_size);
for (size_t i = 0; i < array_size; ++i)
weights_data.push_back(static_cast<float>(1.0));

set_values(weights, weights_data);
set_values(input, input_data);

auto impl_desc_onednn = ov::intel_gpu::ImplementationDesc{format::b_fs_yx_fsv16, "", impl_types::onednn};
auto impl_forcing_map = ov::intel_gpu::ImplForcingMap{{"conv", impl_desc_onednn}};

topology topology;
topology.add(input_layout("input", input_layout_static));
topology.add(permute("permute", input_info("input"), { 0, 2, 1, 3 }));
topology.add(data("weights", weights));
topology.add(convolution("conv", input_info("permute"), "weights", "", 1, {1,1}, {1,1}, {0,0}, {0,0}, false));

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::force_implementations(impl_forcing_map));

network network(engine, topology, config);
network.set_input_data("input", input);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "conv");

auto output = outputs.begin()->second.get_memory();

float answers[] = {
120.f, 120.f, 120.f, 120.f, 120.f, 120.f, 120.f, 120.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
376.f, 376.f, 376.f, 376.f, 376.f, 376.f, 376.f, 376.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
632.f, 632.f, 632.f, 632.f, 632.f, 632.f, 632.f, 632.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
888.f, 888.f, 888.f, 888.f, 888.f, 888.f, 888.f, 888.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
1144.f, 1144.f, 1144.f, 1144.f, 1144.f, 1144.f, 1144.f, 1144.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
1400.f, 1400.f, 1400.f, 1400.f, 1400.f, 1400.f, 1400.f, 1400.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
1656.f, 1656.f, 1656.f, 1656.f, 1656.f, 1656.f, 1656.f, 1656.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
1912.f, 1912.f, 1912.f, 1912.f, 1912.f, 1912.f, 1912.f, 1912.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
};

cldnn::mem_lock<float> output_ptr(output, get_test_stream());
for (size_t i = 0; i < array_size; i++) {
ASSERT_FLOAT_EQ(answers[i], output_ptr[i]);
}
}

class permute_bfzyx_to_bfyxz: public TiledPermuteTest {};

INSTANTIATE_TEST_SUITE_P(, permute_bfzyx_to_bfyxz,
Expand Down

0 comments on commit fb5b5ed

Please sign in to comment.