+coalesced

GPGPUCourse · Jan 20, 2025 · d4005d1 · d4005d1
1 parent 7c98a02
commit d4005d1
Show file tree

Hide file tree

Showing 3 changed files with 72 additions and 61 deletions.
diff --git a/src/cl/matrix_multiplication.cl b/src/cl/matrix_multiplication.cl
@@ -29,54 +29,58 @@ __kernel void matrix_multiplication_local(__global float *a, __global float *b,
 
     float sum = 0.0f;
     for (int tileK = 0; tileK * TILE_SIZE < K; tileK++) {
-        tileA[local_i][local_j] = a[i * K + local_j + tileK * TILE_SIZE];
-        tileB[local_i][local_j] = b[(local_i + tileK * TILE_SIZE) * N + j];
+        tileA[local_j][local_i] = a[j * K + local_i + tileK * TILE_SIZE];
+        tileB[local_j][local_i] = b[(local_j + tileK * TILE_SIZE) * N + i];
 
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        for (int i = 0; i < TILE_SIZE; i++) {
-            sum += tileA[local_i][i] * tileB[i][local_j];
+        for (int l = 0; l < TILE_SIZE; l++) {
+            sum += tileA[local_j][l] * tileB[l][local_i];
         }
 
         barrier(CLK_LOCAL_MEM_FENCE);
     }
-    c[i * N + j] = sum;
+    c[j * N + i] = sum;
    }
 #endif
 
 #if defined(TILE_SIZE) && defined(WORK_PER_THREAD)
 __kernel void matrix_multiplication_local_wpt(__global float *a, __global float *b, __global float *c, unsigned int M, unsigned int K, unsigned int N) {
-     int i = get_global_id(0);
-     int j = get_global_id(1);
-     int local_i = get_local_id(0);
-     int local_j = get_local_id(1);
 
-     __local float tileA[TILE_SIZE][TILE_SIZE];
-     __local float tileB[TILE_SIZE][TILE_SIZE];
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    int local_i = get_local_id(0);
+    int local_j = get_local_id(1);
+
+    __local float tileA[TILE_SIZE][TILE_SIZE];
+    __local float tileB[TILE_SIZE][TILE_SIZE];
 
      float sum[WORK_PER_THREAD];
-     for (int i = 0; i < WORK_PER_THREAD; i++) {
-        sum[i] = 0;
+     for (int l = 0; l < WORK_PER_THREAD; l++) {
+        sum[l] = 0;
      }
 
-     for (int tileK = 0; tileK * TILE_SIZE < K; tileK++) {
-         for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
-             tileA[local_i * WORK_PER_THREAD + thread][local_j] = a[(i * WORK_PER_THREAD + thread) * K + local_j + tileK * TILE_SIZE];
-             tileB[local_i * WORK_PER_THREAD + thread][local_j] = b[((local_i * WORK_PER_THREAD + thread) + tileK * TILE_SIZE) * N + j];
-         }
-         barrier(CLK_LOCAL_MEM_FENCE);
+    for (int tileK = 0; tileK * TILE_SIZE < K; tileK++) {
+
+        for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
+            tileA[local_j * WORK_PER_THREAD + thread][local_i] = a[(j * WORK_PER_THREAD + thread) * K + local_i + tileK * TILE_SIZE];
+            tileB[local_j * WORK_PER_THREAD + thread][local_i] = b[(local_j * WORK_PER_THREAD + thread + tileK * TILE_SIZE) * N + i];
+        }
 
-         for (int i = 0; i < TILE_SIZE; i++) {
-            float tileb = tileB[i][local_j];
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        for (int l = 0; l < TILE_SIZE; l++) {
             for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
-                sum[thread] += tileA[local_i * WORK_PER_THREAD + thread][i] * tileb;
-             }
-         }
-         barrier(CLK_LOCAL_MEM_FENCE);
-     }
+                sum[thread] += tileA[local_j * WORK_PER_THREAD + thread][l] * tileB[l][local_i];
+            }
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
+        c[(j * WORK_PER_THREAD + thread) * N + i] = sum[thread];
+    }
 
-   for (int thread = 0; thread < WORK_PER_THREAD; thread++) {
-      c[(i * WORK_PER_THREAD + thread) * N + j] = sum[thread];
-   }
 }
 #endif
diff --git a/src/cl/matrix_transpose.cl b/src/cl/matrix_transpose.cl
@@ -5,53 +5,60 @@
 #define TILE_SIZE 16
 #line 6
 
-__kernel void matrix_transpose_naive( __global float *a, __global float *at, unsigned int m, unsigned int k)
+__kernel void matrix_transpose_naive( __global float *a, __global float *at, unsigned int h, unsigned int w)
 {
     int i = get_global_id(0);
     int j = get_global_id(1);
-    if (i >= k)
+    if (i >= w)
         return;
-    if (j >= m)
+    if (j >= h)
         return;
-    float x = a[i * k + j];
-    at[j * m + i] = x;
+    float x = a[j * w + i];
+    at[i * h + j] = x;
 }
 
 
-__kernel void matrix_transpose_local_bad_banks( __global float *a, __global float *at, unsigned int m, unsigned int k)
+__kernel void matrix_transpose_local_bad_banks( __global float *a, __global float *at, unsigned int h, unsigned int w)
 {
     int i = get_global_id(0);
     int j = get_global_id(1);
     __local float tile[TILE_SIZE][TILE_SIZE];
     int local_i = get_local_id(0);
     int local_j = get_local_id(1);
 
-    if (i >= k)
-        return;
-    if (j >= m)
-        return;
 
-    tile[local_i][local_j] = a[i * k + j];
+    int start_i = i - local_i;
+    int start_j = j - local_j;
+    int transposed_i = start_j + local_i;
+    int transposed_j = start_i + local_j;
+
+    tile[local_j][local_i] = a[j * w + i];
     barrier(CLK_LOCAL_MEM_FENCE);
-    at[j * m + i] = tile[local_i][local_j];
+    at[transposed_j * w + transposed_i] = tile[local_i][local_j];
 }
 
-__kernel void matrix_transpose_local_good_banks( __global float *a, __global float *at, unsigned int m, unsigned int k)
+__kernel void matrix_transpose_local_good_banks( __global float *a, __global float *at, unsigned int h, unsigned int w)
 {
     int i = get_global_id(0);
     int j = get_global_id(1);
     __local float tile[TILE_SIZE][TILE_SIZE + 1];
     int local_i = get_local_id(0);
     int local_j = get_local_id(1);
-    int indx = local_i * TILE_SIZE + local_j;
-    int biased_i= indx / (TILE_SIZE + 1);
-    int biased_j = indx % (TILE_SIZE + 1);
-    if (i >= k)
-        return;
-    if (j >= m)
-        return;
 
-    tile[biased_i][biased_j] = a[i * k + j];
+    int start_i = i - local_i;
+    int start_j = j - local_j;
+    int transposed_i = start_j + local_i;
+    int transposed_j = start_i + local_j;
+
+    int indx = local_j * TILE_SIZE + local_i;
+    int biased_j = indx / (TILE_SIZE + 1);
+    int biased_i = indx % (TILE_SIZE + 1);
+
+    tile[biased_j][biased_i] = a[j*w + i];
     barrier(CLK_LOCAL_MEM_FENCE);
-    at[j * m + i] = tile[biased_i][biased_j];
+
+    indx = local_i * TILE_SIZE + local_j;
+    biased_j = indx / (TILE_SIZE + 1);
+    biased_i = indx % (TILE_SIZE + 1);
+    at[transposed_j * w + transposed_i] = tile[biased_j][biased_i];
 }
diff --git a/src/main_matrix_multiplication.cpp b/src/main_matrix_multiplication.cpp
@@ -51,7 +51,7 @@ struct KernelConfig {
 KernelConfig makeNaiveConfig(unsigned int tile_size)
 {
     std::string kernel_name = "matrix_multiplication_naive";
-    gpu::WorkSize work_size(tile_size, tile_size, M, N);
+    gpu::WorkSize work_size(tile_size, tile_size, K, N);
     std::string defines;
     std::string prefix = "[naive, ts=" + std::to_string(tile_size) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -60,7 +60,7 @@ KernelConfig makeNaiveConfig(unsigned int tile_size)
 KernelConfig makeLocalConfig(unsigned int tile_size)
 {
     std::string kernel_name = "matrix_multiplication_local";
-    gpu::WorkSize work_size(tile_size, tile_size, M, N);
+    gpu::WorkSize work_size(tile_size, tile_size, K, N);
     std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size);
     std::string prefix = "[local, ts=" + std::to_string(tile_size) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -69,7 +69,7 @@ KernelConfig makeLocalConfig(unsigned int tile_size)
 KernelConfig makeLocalWPTConfig(unsigned int tile_size, unsigned int wpt)
 {
     std::string kernel_name = "matrix_multiplication_local_wpt";
-    gpu::WorkSize work_size(tile_size / wpt, tile_size, M /wpt, N);
+    gpu::WorkSize work_size(tile_size, tile_size / wpt, K, N / wpt);
     std::string defines = "-DTILE_SIZE=" + std::to_string(tile_size) + " -DWORK_PER_THREAD=" + std::to_string(wpt);
     std::string prefix = "[local wpt, ts=" + std::to_string(tile_size) + ", wpt=" + std::to_string(wpt) + "]";
     return KernelConfig{kernel_name, work_size, defines, prefix};
@@ -141,13 +141,13 @@ int main(int argc, char **argv)
     const std::vector<float> cs_cpu_reference = computeCPU(as.data(), bs.data());
 
 
-    runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
-
-    runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
-    runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
+    // runTest(makeNaiveConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
+    // runTest(makeNaiveConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
+    // runTest(makeNaiveConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
+    //
+    // runTest(makeLocalConfig(4), as.data(), bs.data(), cs_cpu_reference.data());
+    // runTest(makeLocalConfig(8), as.data(), bs.data(), cs_cpu_reference.data());
+    // runTest(makeLocalConfig(16), as.data(), bs.data(), cs_cpu_reference.data());
 
     for (unsigned int tile_size : {4, 8, 16})
         for (unsigned int wpt : {2, 4, 8, 16})