[GPU] support 64 alignment for 8bit weights layout (#28864)
### Details:
- *In an int8 model, when the FC layer's batch size is 1025, the fake alignment function rounds it up to 1040, and the FC layer cannot use SLM because 1040 is not a multiple of 64. The misalignment happens because the 64-element alignment base is applied only to 4-bit (i4/u4) weights in the fake alignment function.*
- *Modified the function to apply the 64-element alignment base to i8/u8 weights as well, so SLM can be used for a batch size of 1025 (see the arithmetic sketch below).*
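As a quick illustration of the arithmetic above, here is a minimal standalone sketch; the `align_to` helper is a generic round-up written only for this example and merely stands in for the plugin's fake-alignment rounding.

```cpp
#include <cstdint>
#include <iostream>

// Generic round-up helper used only for this illustration.
static int64_t align_to(int64_t value, int64_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    const int64_t batch = 1025;
    const int64_t old_base = 16;  // base previously used for i8/u8 weights
    const int64_t new_base = 64;  // base now shared with i4/u4 weights

    std::cout << align_to(batch, old_base) << "\n";  // 1040 -> 1040 % 64 == 16, SLM path rejected
    std::cout << align_to(batch, new_base) << "\n";  // 1088 -> 1088 % 64 == 0,  SLM path usable
    return 0;
}
```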

### Tickets:
 - *161596*
ahnyoung-paul authored Feb 10, 2025
1 parent 56607df commit 70ec531
Showing 2 changed files with 11 additions and 2 deletions.
src/plugins/intel_gpu/src/graph/fully_connected.cpp (2 additions, 1 deletion)

@@ -270,8 +270,9 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
     if (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) {
         auto weights_layout_dt = orig_impl_param.weights_layout.value().data_type;
         auto is_4bit = weights_layout_dt == data_types::i4 || weights_layout_dt == data_types::u4;
+        auto is_8bit = weights_layout_dt == data_types::i8 || weights_layout_dt == data_types::u8;
         auto is_extra_alignment_needed = batch_size >= 256;
-        fake_align_base = is_4bit && is_extra_alignment_needed ? 64 : 16;
+        fake_align_base = (is_4bit || is_8bit) && is_extra_alignment_needed ? 64 : 16;
     }
 
     std::fill(input_shape.begin(), input_shape.end() - 1, 1);
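To make the effect of the change above concrete, the rough self-contained sketch below mirrors the patched selection logic and then applies the chosen base to the batch dimension. The `weight_dtype` enum, the free function `fake_align_base`, and the `align_to` rounding step are illustrative stand-ins inferred from the commit description, not the plugin's actual APIs; in the plugin the logic lives inside `fully_connected_inst::get_fake_aligned_params`.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative subset of the weight data types involved in the change.
enum class weight_dtype { i4, u4, i8, u8 };

// Round `value` up to the nearest multiple of `alignment`.
static int64_t align_to(int64_t value, int64_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

// Mirrors the patched selection: a 64-element base for 4-bit and (now) 8-bit
// weights once the batch is large enough, otherwise the default 16.
static int64_t fake_align_base(weight_dtype dt, int64_t batch_size) {
    const bool is_4bit = dt == weight_dtype::i4 || dt == weight_dtype::u4;
    const bool is_8bit = dt == weight_dtype::i8 || dt == weight_dtype::u8;
    const bool is_extra_alignment_needed = batch_size >= 256;
    return (is_4bit || is_8bit) && is_extra_alignment_needed ? 64 : 16;
}

int main() {
    const int64_t batch = 1025;
    // u8 weights with a large batch now get the 64-element base ...
    assert(fake_align_base(weight_dtype::u8, batch) == 64);
    // ... so the fake-aligned batch becomes a multiple of 64 and SLM is usable.
    assert(align_to(batch, 64) == 1088);
    return 0;
}
```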
Second changed file (fully connected GPU unit tests):

@@ -3078,6 +3078,10 @@ class fully_connected_gpu_tests: public ::testing::Test {
         ASSERT_EQ(outputs.begin()->first, "fc_prim");
 
         auto output_mem = outputs.begin()->second.get_memory();
+        const int batch_alignment = 64;
+        if ((batch > batch_alignment) && (batch % batch_alignment != 0)) {
+            ASSERT_EQ(output_mem->get_layout().batch(), align_to(batch, batch_alignment));
+        }
         cldnn::mem_lock<ov::float16> output_ptr (output_mem, get_test_stream());
 
         auto ref_output_mem = get_ref_results();
@@ -4195,7 +4199,11 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_sta
     this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true);
 }
 
-// Test weight zp for INT8 ASYM
+// Test weight zp for INT8 ASYM
+TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large_input_1025) {
+    this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 1025, 3584, 4608, 128, 128, true);
+}
+
 TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large) {
     this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 4096, 4096, 128, 128, true);
 }
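For reference, a short check of which test batches above actually exercise the new layout assertion; the `align_to` below is a local stand-in with the usual round-up semantics, assumed to match the helper referenced in the test.

```cpp
#include <cassert>
#include <cstdint>

// Local stand-in for the align_to helper referenced in the test.
static int64_t align_to(int64_t value, int64_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    const int64_t batch_alignment = 64;
    // batch = 320 (existing test): already a multiple of 64, so the new
    // layout check is skipped by the (batch % batch_alignment != 0) guard.
    assert(320 % batch_alignment == 0);
    // batch = 1025 (new test): not a multiple of 64, so the check runs and
    // expects the fake-aligned output batch of 1088.
    assert(1025 > batch_alignment && 1025 % batch_alignment != 0);
    assert(align_to(1025, batch_alignment) == 1088);
    return 0;
}
```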
