GPGPUCourse · Smart781 · Nov 10, 2024 · Nov 10, 2024 · Nov 10, 2024 · Nov 10, 2024
diff --git a/src/cl/radix.cl b/src/cl/radix.cl
@@ -1 +1,105 @@
-// TODO
+#ifdef __CLION_IDE__
+
+#include <libgpu/opencl/cl/clion_defines.cl>
+
+#endif
+
+#line 6
+
+#define SIZE 16
+
+__kernel void matrix_transpose(__global unsigned int* a, __global unsigned int* at, const unsigned int m, const unsigned int k)
+{
+    unsigned int i = get_global_id(0);
+    unsigned int j = get_global_id(1);
+
+    unsigned int local_i = get_local_id(0);
+    unsigned int local_j = get_local_id(1);
+
+    __local unsigned int buf[SIZE][SIZE + 1];
+
+    unsigned int i1 = j - local_j + local_i;
+    unsigned int j1 = i - local_i + local_j;
+
+    if (j < k && i < m) {
+        buf[local_j][local_i] = a[j * m + i];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if (j1 < m && i1 < k) {
+        at[j1 * k + i1] = buf[local_i][local_j];
+    }
+}
+
+__kernel void prefix_sum_up(__global unsigned int* s, unsigned int n, unsigned int p)
+{
+	unsigned int index = get_global_id(0);
+	unsigned int id2 = 2 * (index + 1) * p - 1;
+	unsigned int id1 = id2 - p;
+	if (id2 < n) {
+            s[id2] += s[id1];
+        }
+}
+
+
+__kernel void prefix_sum_down(__global unsigned int* s, unsigned int n, unsigned int p)
+{
+	unsigned int index = get_global_id(0);
+	unsigned int id2 = 2 * (index + 1) * p - 1 + p;
+	unsigned int id1 = id2 - p;
+	if (id2 < n) {
+	    s[id2] += s[id1];
+        }
+}
+
+__kernel void count(__global unsigned int *ar, __global unsigned int *counters, unsigned int bit_shift, unsigned int n_bits)
+{
+    unsigned int gid = get_global_id(0);
+    unsigned int grid = get_group_id(0);
+
+    unsigned int t = (ar[gid] >> bit_shift) & ((1 << n_bits) - 1);
+    atomic_inc(&counters[grid * (1 << n_bits) + t]);
+}
+
+__kernel void zero(__global unsigned int *as)
+{
+    unsigned int gid = get_global_id(0);
+    as[gid] = 0;
+}
+
+
+
+__kernel void radix_sort(__global unsigned int *as, __global unsigned int *bs, __global unsigned int *counters, unsigned int bit_shift, unsigned int n_bits, unsigned int n)
+{
+    unsigned int gid = get_global_id(0);
+    unsigned int grid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+
+    __local unsigned int buf[128];
+
+    buf[lid] = (as[gid] >> bit_shift) & ((1 << n_bits) - 1);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    unsigned int ind = buf[lid] * get_num_groups(0) + grid;
+    unsigned int lidx;
+
+    if (ind > 0 && ind < n) {
+        lidx = counters[ind - 1];
+    } else {
+        lidx = 0;
+    }
+
+    unsigned int sh = 0;
+    for (int i = 0; i < lid; ++i) {
+        if (buf[i] == buf[lid]) {
+            sh += 1;
+        }
+    }
+
+    unsigned int target_index = sh + lidx;
+    if (gid < n && target_index < n) {
+        bs[target_index] = as[gid];
+    }
+}
diff --git a/src/main_radix.cpp b/src/main_radix.cpp
@@ -58,17 +58,69 @@ int main(int argc, char **argv) {
 
     const std::vector<unsigned int> cpu_reference = computeCPU(as);
 
-    // remove me
-    return 0;
-
     {
+
+        ocl::Kernel count(radix_kernel, radix_kernel_length, "count");
+        ocl::Kernel prefix_sum_up(radix_kernel, radix_kernel_length, "prefix_sum_up");
+        ocl::Kernel prefix_sum_down(radix_kernel, radix_kernel_length, "prefix_sum_down");
+        ocl::Kernel matrix_transpose(radix_kernel, radix_kernel_length, "matrix_transpose");
+        ocl::Kernel zero(radix_kernel, radix_kernel_length, "zero");
+        ocl::Kernel radix_sort(radix_kernel, radix_kernel_length, "radix_sort");
+
+        count.compile();
+        prefix_sum_up.compile();
+        prefix_sum_down.compile();
+        matrix_transpose.compile();
+        zero.compile();
+        radix_sort.compile();
+
+        constexpr unsigned int n_bits = 4;
+        constexpr unsigned int work_size = 128;
+        constexpr unsigned int transpose_work_group_size = 16;
+        constexpr unsigned int nd = 1 << n_bits;
+        constexpr unsigned int wg = (n + work_size - 1) / work_size;
+        constexpr unsigned int count_size = wg * nd;
+
+        gpu::gpu_mem_32u as_gpu;
+        as_gpu.resizeN(n);
+        gpu::gpu_mem_32u bs_gpu;
+        bs_gpu.resizeN(n);
+        gpu::gpu_mem_32u counters;
+        counters.resizeN(count_size);
+        gpu::gpu_mem_32u counters_tr;
+        counters_tr.resizeN(count_size);
+
         timer t;
         for (int iter = 0; iter < benchmarkingIters; ++iter) {
             // Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных
 
-            // TODO
+            as_gpu.writeN(as.data(), n);
+            t.restart();
+
+            for (int i = 0; i < 32; i += n_bits) {
+                zero.exec(gpu::WorkSize(work_size, count_size), counters);
+                count.exec(gpu::WorkSize(work_size, n), as_gpu, counters, i, n_bits);
+                matrix_transpose.exec(gpu::WorkSize(16, 16, nd, wg), counters, counters_tr, nd, wg);
+
+                unsigned int j;
+
+                for (j = 1; j < count_size; j *= 2) {
+                    prefix_sum_up.exec(gpu::WorkSize(work_size, count_size / j / 2), counters_tr,
+                                                    count_size, j);
+                }
+                for (j = count_size / 2; j > 0; j /= 2) {
+                    prefix_sum_down.exec(gpu::WorkSize(work_size, count_size / j / 2), counters_tr,
+                                                    count_size, j);
+                }
+
+                radix_sort.exec(gpu::WorkSize(work_size, n), as_gpu, bs_gpu, counters_tr, i,
+                                             n_bits, n);
+                as_gpu.swap(bs_gpu);
+            }
+            t.nextLap();
         }
         t.stop();
+        as_gpu.readN(as.data(), n);
 
         std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
         std::cout << "GPU: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;