[GPU] support 64 alignment for 8bit weights layout (#28864)
### Details:
- *In an int8 model, when the FC layer's batch size is 1025, the fake alignment function rounds it up to 1040, and the FC layer cannot use SLM because 1040 is not a multiple of 64. The misalignment happens because the 64-element alignment base is applied only to 4-bit (i4/u4) weights in the fake alignment function.*
- *Modified the function to apply the 64-element alignment base to i8/u8 weights as well, so SLM can be used for a batch size of 1025 (see the arithmetic sketch below).*
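As a quick illustration of the arithmetic above, here is a minimal standalone sketch; the `align_to` helper is a generic round-up written only for this example and merely stands in for the plugin's fake-alignment rounding.

```cpp
#include <cstdint>
#include <iostream>

// Generic round-up helper used only for this illustration.
static int64_t align_to(int64_t value, int64_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    const int64_t batch = 1025;
    const int64_t old_base = 16;  // base previously used for i8/u8 weights
    const int64_t new_base = 64;  // base now shared with i4/u4 weights

    std::cout << align_to(batch, old_base) << "\n";  // 1040 -> 1040 % 64 == 16, SLM path rejected
    std::cout << align_to(batch, new_base) << "\n";  // 1088 -> 1088 % 64 == 0,  SLM path usable
    return 0;
}
```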

### Tickets:
 - *161596*
ahnyoung-paul authored Feb 10, 2025
1 parent 56607df commit 70ec531
Showing 2 changed files with 11 additions and 2 deletions.
src/plugins/intel_gpu/src/graph/fully_connected.cpp (2 additions, 1 deletion)

@@ -270,8 +270,9 @@ kernel_impl_params fully_connected_inst::get_fake_aligned_params(kernel_impl_par
     if (orig_impl_param.dev_type == cldnn::device_type::integrated_gpu) {
         auto weights_layout_dt = orig_impl_param.weights_layout.value().data_type;
         auto is_4bit = weights_layout_dt == data_types::i4 || weights_layout_dt == data_types::u4;
+        auto is_8bit = weights_layout_dt == data_types::i8 || weights_layout_dt == data_types::u8;
         auto is_extra_alignment_needed = batch_size >= 256;
-        fake_align_base = is_4bit && is_extra_alignment_needed ? 64 : 16;
+        fake_align_base = (is_4bit || is_8bit) && is_extra_alignment_needed ? 64 : 16;
     }
 
     std::fill(input_shape.begin(), input_shape.end() - 1, 1);
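To make the effect of the change above concrete, the rough self-contained sketch below mirrors the patched selection logic and then applies the chosen base to the batch dimension. The `weight_dtype` enum, the free function `fake_align_base`, and the `align_to` rounding step are illustrative stand-ins inferred from the commit description, not the plugin's actual APIs; in the plugin the logic lives inside `fully_connected_inst::get_fake_aligned_params`.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative subset of the weight data types involved in the change.
enum class weight_dtype { i4, u4, i8, u8 };

// Round `value` up to the nearest multiple of `alignment`.
static int64_t align_to(int64_t value, int64_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

// Mirrors the patched selection: a 64-element base for 4-bit and (now) 8-bit
// weights once the batch is large enough, otherwise the default 16.
static int64_t fake_align_base(weight_dtype dt, int64_t batch_size) {
    const bool is_4bit = dt == weight_dtype::i4 || dt == weight_dtype::u4;
    const bool is_8bit = dt == weight_dtype::i8 || dt == weight_dtype::u8;
    const bool is_extra_alignment_needed = batch_size >= 256;
    return (is_4bit || is_8bit) && is_extra_alignment_needed ? 64 : 16;
}

int main() {
    const int64_t batch = 1025;
    // u8 weights with a large batch now get the 64-element base ...
    assert(fake_align_base(weight_dtype::u8, batch) == 64);
    // ... so the fake-aligned batch becomes a multiple of 64 and SLM is usable.
    assert(align_to(batch, 64) == 1088);
    return 0;
}
```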
Second changed file (fully connected GPU unit tests):

@@ -3078,6 +3078,10 @@ class fully_connected_gpu_tests: public ::testing::Test {
         ASSERT_EQ(outputs.begin()->first, "fc_prim");
 
         auto output_mem = outputs.begin()->second.get_memory();
+        const int batch_alignment = 64;
+        if ((batch > batch_alignment) && (batch % batch_alignment != 0)) {
+            ASSERT_EQ(output_mem->get_layout().batch(), align_to(batch, batch_alignment));
+        }
         cldnn::mem_lock<ov::float16> output_ptr (output_mem, get_test_stream());
 
         auto ref_output_mem = get_ref_results();
@@ -4195,7 +4199,11 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_sta
     this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true);
 }
 
-// Test weight zp for INT8 ASYM
+// Test weight zp for INT8 ASYM
+TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large_input_1025) {
+    this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 1025, 3584, 4608, 128, 128, true);
+}
+
 TEST_F(fully_connected_gpu_tests, compressed_int8_scale_dynamic_quantize_wzp_128_large) {
     this->test_compressed_int8_scale_dyn_quan_weight_u8(true, 320, 4096, 4096, 128, 128, true);
 }
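For reference, a short check of which test batches above actually exercise the new layout assertion; the `align_to` below is a local stand-in with the usual round-up semantics, assumed to match the helper referenced in the test.

```cpp
#include <cassert>
#include <cstdint>

// Local stand-in for the align_to helper referenced in the test.
static int64_t align_to(int64_t value, int64_t alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    const int64_t batch_alignment = 64;
    // batch = 320 (existing test): already a multiple of 64, so the new
    // layout check is skipped by the (batch % batch_alignment != 0) guard.
    assert(320 % batch_alignment == 0);
    // batch = 1025 (new test): not a multiple of 64, so the check runs and
    // expects the fake-aligned output batch of 1088.
    assert(1025 > batch_alignment && 1025 % batch_alignment != 0);
    assert(align_to(1025, batch_alignment) == 1088);
    return 0;
}
```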
