From c2f8a3cc72fd2bee4ba48915b06ffa83f1ba28b8 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee <taylor.lee@intel.com>
Date: Tue, 14 Jan 2025 00:40:26 -0800
Subject: [PATCH] [GPU] Fix conv kernel to select opt kernel even though filter
 size is large (#28421)

### Details:
- Since https://github.com/openvinotoolkit/openvino/pull/23621, conv has been
selecting the ref kernel when the filter size is large
- Reverted the change from 23621 and instead fixed the opt kernel impl so that
it does not use manual unrolling when the filter size is large

### Tickets:
 -  CVS-157998
---
 .../convolution_gpu_bfyx_os_iyx_osv16.cl      | 21 +++++++++++++++++--
 .../convolution_kernel_bfyx_os_iyx_osv16.cpp  | 14 ++++++-------
 .../unit/test_cases/convolution_gpu_test.cpp  |  6 +++---
 3 files changed, 28 insertions(+), 13 deletions(-)
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl
index 1ccaf35cf46fd0..576e05dfa91d0f 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_os_iyx_osv16.cl
@@ -171,13 +171,24 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
 
         uint wi = 0;
         uint kr = 0; // kr = Kernel Row
+#ifdef DISABLE_MANUAL_UNROLL
+        unroll_for (; kr < FILTER_SIZE_Y; ++kr)
+#else
         LOOP(FILTER_SIZE_Y, kr,  // LOOP is a macro that unrolls the loop.
+#endif
         {
             uint kc = 0; // kc = Kernel Column
+#ifdef DISABLE_MANUAL_UNROLL
+        unroll_for (; kc < FILTER_SIZE_X; ++kc)
+            {
+                unroll_for (uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++) {
+                    unroll_for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++) {
+#else
             LOOP(FILTER_SIZE_X, kc,
             {
-                for(uint br=0; br<OUTPUT_BLOCK_HEIGHT; br++) {
-                    for(uint bc=0; bc<OUTPUT_BLOCK_WIDTH; bc++) {
+                for (uint br = 0; br < OUTPUT_BLOCK_HEIGHT; br++) {
+                    for(uint bc = 0; bc < OUTPUT_BLOCK_WIDTH; bc++) {
+#endif
 
 #if IN_BLOCK_WIDTH != SUB_GROUP_SIZE
                         //if we fix the programming model, then we could use a nice simple 2d array: val = in[br * STRIDE_SIZE_Y + kr][bc * STRIDE_SIZE_X + kc];
@@ -193,11 +204,17 @@ KERNEL(convolution_gpu_bfyx_os_iyx_osv16)(
                 w[wi % PREFETCH] = weights[weight_addr_safe];
                 weight_addr += OSV_SIZE; // weights must be stored in just the right SIMD swizzled format for this to work, see host code for details.
                 wi++;
+#ifdef DISABLE_MANUAL_UNROLL
+            }
+        }
+#else
             });
         });
+#endif
         // addr went beyond due to prefetch so move it back to correct location.
         weight_addr -= PREFETCH * OSV_SIZE;
     }
+    
 
     uint out_split_offset = g * OUTPUT_FEATURE_PITCH * FILTER_OFM_NUM;
     uint out_addr = OUTPUT_OFFSET;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp
index 58bbed8210df86..f3e13a263465d1 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp
@@ -208,14 +208,6 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p) const {
         return false;
     }
 
-    // To prevent big sized filter which causes lots of CL build time.
-    const size_t acceptable_filter_size = 1024;     // This acceptable size was decided by heuristics
-    const auto& params = static_cast<const convolution_params&>(p);
-    auto filter_size = params.filterSize.x * params.filterSize.y;
-    if (filter_size >= acceptable_filter_size) {
-        return false;
-    }
-
     return true;
 }
 
@@ -245,6 +237,12 @@ JitConstants ConvolutionKernel_bfyx_os_iyx_osv16::GetJitConstants(const convolut
     jit.AddConstant(MakeJitConstant("IN_BLOCK_WIDTH", dispatchData.cldnnStyle.inputBlockWidth));
     jit.AddConstant(MakeJitConstant("PREFETCH", dispatchData.cldnnStyle.prefetch));
 
+    const size_t large_filter_size = 1024;     // Filter sizes at or above this threshold disable manual unroll; the value was decided by heuristics
+    auto filter_size = params.filterSize.x * params.filterSize.y;
+    if (filter_size >= large_filter_size) {
+        jit.AddConstant(MakeJitConstant("DISABLE_MANUAL_UNROLL", 1));
+    }
+
     if (leftovers) {
         jit.AddConstant(MakeJitConstant("LEFTOVERS", leftovers));
     }
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
index b7cac423cd16b0..bbfe7224b4a328 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp
@@ -1689,7 +1689,7 @@ TEST(convolution_f32_fw_gpu, convolution_big_size_weights) {
     };
 
     const std::vector<std::string> impl_kernel_data = {
-        "convolution_gpu_ref__f32"
+        "convolution_gpu_bfyx_os_iyx_osv16__f32"
     };
 
     for (size_t m = 0 ; m < filter_size_data.size() / 2; m++) {
@@ -1767,8 +1767,8 @@ TEST(convolution_f16_fw_gpu, convolution_big_size_weights) {
     };
 
     const std::vector<std::string> impl_kernel_data = {
-        "convolution_gpu_ref__f16",
-        "convolution_gpu_bfyx_gemm_like__f16",
+        "convolution_gpu_bfyx_os_iyx_osv16__f16",
+        "convolution_gpu_bfyx_os_iyx_osv16__f16",
     };
 
     for (size_t m = 0 ; m < filter_size_data.size() / 2; m++) {