Skip to content

Commit

Permalink
+coalesced
Browse files Browse the repository at this point in the history
  • Loading branch information
koufesser committed Jan 20, 2025
1 parent 7c98a02 commit d4005d1
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 61 deletions.
62 changes: 33 additions & 29 deletions src/cl/matrix_multiplication.cl
Original file line number Diff line number Diff line change
Expand Up @@ -29,54 +29,58 @@ __kernel void matrix_multiplication_local(__global float *a, __global float *b,

float sum = 0.0f;
for (int tileK = 0; tileK * TILE_SIZE < K; tileK++) {
tileA[local_i][local_j] = a[i * K + local_j + tileK * TILE_SIZE];
tileB[local_i][local_j] = b[(local_i + tileK * TILE_SIZE) * N + j];
tileA[local_j][local_i] = a[j * K + local_i + tileK * TILE_SIZE];
tileB[local_j][local_i] = b[(local_j + tileK * TILE_SIZE) * N + i];

barrier(CLK_LOCAL_MEM_FENCE);

for (int i = 0; i < TILE_SIZE; i++) {
sum += tileA[local_i][i] * tileB[i][local_j];
for (int l = 0; l < TILE_SIZE; l++) {
sum += tileA[local_j][l] * tileB[l][local_i];
}

barrier(CLK_LOCAL_MEM_FENCE);
}
c[i * N + j] = sum;
c[j * N + i] = sum;
}
#endif

#if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
__kernel void matrix_multiplication_local_wpt(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, unsigned int N) {
int i = get_global_id(0);
int j = get_global_id(1);
int local_i = get_local_id(0);
int local_j = get_local_id(1);

__local float tileA[TILE_SIZE][TILE_SIZE];
__local float tileB[TILE_SIZE][TILE_SIZE];
int i = get_global_id(0);
int j = get_global_id(1);
int local_i = get_local_id(0);
int local_j = get_local_id(1);

__local float tileA[TILE_SIZE][TILE_SIZE];
__local float tileB[TILE_SIZE][TILE_SIZE];

float sum[WORK_PER_THREAD];
for (int i = 0; i < WORK_PER_THREAD; i++) {
sum[i] = 0;
for (int l = 0; l < WORK_PER_THREAD; l++) {
sum[l] = 0;
}

for (int tileK = 0; tileK * TILE_SIZE < K; tileK++) {
for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
tileA[local_i * WORK_PER_THREAD + thread][local_j] = a[(i * WORK_PER_THREAD + thread) * K + local_j + tileK * TILE_SIZE];
tileB[local_i * WORK_PER_THREAD + thread][local_j] = b[((local_i * WORK_PER_THREAD + thread) + tileK * TILE_SIZE) * N + j];
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int tileK = 0; tileK * TILE_SIZE < K; tileK++) {

for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
tileA[local_j * WORK_PER_THREAD + thread][local_i] = a[(j * WORK_PER_THREAD + thread) * K + local_i + tileK * TILE_SIZE];
tileB[local_j * WORK_PER_THREAD + thread][local_i] = b[(local_j * WORK_PER_THREAD + thread + tileK * TILE_SIZE) * N + i];
}

for (int i = 0; i < TILE_SIZE; i++) {
float tileb = tileB[i][local_j];
barrier(CLK_LOCAL_MEM_FENCE);

for (int l = 0; l < TILE_SIZE; l++) {
for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
sum[thread] += tileA[local_i * WORK_PER_THREAD + thread][i] * tileb;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
sum[thread] += tileA[local_j * WORK_PER_THREAD + thread][l] * tileB[l][local_i];
}
}

barrier(CLK_LOCAL_MEM_FENCE);
}

for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
c[(j * WORK_PER_THREAD + thread) * N + i] = sum[thread];
}

for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
c[(i * WORK_PER_THREAD + thread) * N + j] = sum[thread];
}
}
#endif
51 changes: 29 additions & 22 deletions src/cl/matrix_transpose.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,53 +5,60 @@
#define TILE_SIZE 16
#line 6

__kernel void matrix_transpose_naive( __global float *a, __global float *at, unsigned int m, unsigned int k)
__kernel void matrix_transpose_naive( __global float *a, __global float *at, unsigned int h, unsigned int w)
{
int i = get_global_id(0);
int j = get_global_id(1);
if (i >= k)
if (i >= w)
return;
if (j >= m)
if (j >= h)
return;
float x = a[i * k + j];
at[j * m + i] = x;
float x = a[j * w + i];
at[i * h + j] = x;
}


__kernel void matrix_transpose_local_bad_banks( __global float *a, __global float *at, unsigned int m, unsigned int k)
__kernel void matrix_transpose_local_bad_banks( __global float *a, __global float *at, unsigned int h, unsigned int w)
{
int i = get_global_id(0);
int j = get_global_id(1);
__local float tile[TILE_SIZE][TILE_SIZE];
int local_i = get_local_id(0);
int local_j = get_local_id(1);

if (i >= k)
return;
if (j >= m)
return;

tile[local_i][local_j] = a[i * k + j];
int start_i = i - local_i;
int start_j = j - local_j;
int transposed_i = start_j + local_i;
int transposed_j = start_i + local_j;

tile[local_j][local_i] = a[j * w + i];
barrier(CLK_LOCAL_MEM_FENCE);
at[j * m + i] = tile[local_i][local_j];
at[transposed_j * w + transposed_i] = tile[local_i][local_j];
}

__kernel void matrix_transpose_local_good_banks( __global float *a, __global float *at, unsigned int m, unsigned int k)
__kernel void matrix_transpose_local_good_banks( __global float *a, __global float *at, unsigned int h, unsigned int w)
{
int i = get_global_id(0);
int j = get_global_id(1);
__local float tile[TILE_SIZE][TILE_SIZE + 1];
int local_i = get_local_id(0);
int local_j = get_local_id(1);
int indx = local_i * TILE_SIZE + local_j;
int biased_i= indx / (TILE_SIZE + 1);
int biased_j = indx % (TILE_SIZE + 1);
if (i >= k)
return;
if (j >= m)
return;

tile[biased_i][biased_j] = a[i * k + j];
int start_i = i - local_i;
int start_j = j - local_j;
int transposed_i = start_j + local_i;
int transposed_j = start_i + local_j;

int indx = local_j * TILE_SIZE + local_i;
int biased_j = indx / (TILE_SIZE + 1);
int biased_i = indx % (TILE_SIZE + 1);

tile[biased_j][biased_i] = a[j*w + i];
barrier(CLK_LOCAL_MEM_FENCE);
at[j * m + i] = tile[biased_i][biased_j];

indx = local_i * TILE_SIZE + local_j;
biased_j = indx / (TILE_SIZE + 1);
biased_i = indx % (TILE_SIZE + 1);
at[transposed_j * w + transposed_i] = tile[biased_j][biased_i];
}
20 changes: 10 additions & 10 deletions src/main_matrix_multiplication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ struct KernelConfig {
KernelConfig makeNaiveConfig(unsigned int tile_size)
{
std::string kernel_name = "matrix_multiplication_naive";
gpu::WorkSize work_size(tile_size, tile_size, M, N);
gpu::WorkSize work_size(tile_size, tile_size, K, N);
std::string defines;
std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
Expand All @@ -60,7 +60,7 @@ KernelConfig makeNaiveConfig(unsigned int tile_size)
KernelConfig makeLocalConfig(unsigned int tile_size)
{
std::string kernel_name = "matrix_multiplication_local";
gpu::WorkSize work_size(tile_size, tile_size, M, N);
gpu::WorkSize work_size(tile_size, tile_size, K, N);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
Expand All @@ -69,7 +69,7 @@ KernelConfig makeLocalConfig(unsigned int tile_size)
KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
{
std::string kernel_name = "matrix_multiplication_local_wpt";
gpu::WorkSize work_size(tile_size / wpt, tile_size, M /wpt, N);
gpu::WorkSize work_size(tile_size, tile_size / wpt, K, N / wpt);
std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
return KernelConfig{kernel_name, work_size, defines, prefix};
Expand Down Expand Up @@ -141,13 +141,13 @@ int main(int argc, char **argv)
const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());


runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());

runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
// runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
// runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
// runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
//
// runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
// runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
// runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());

for (unsigned int tile_size : {4, 8, 16})
for (unsigned int wpt : {2, 4, 8, 16})
Expand Down

0 comments on commit d4005d1

Please sign in to comment.