Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Task08 Артём Смородский HSE #266

Closed
wants to merge 18 commits into from
106 changes: 105 additions & 1 deletion src/cl/radix.cl
Original file line number Diff line number Diff line change
@@ -1 +1,105 @@
// TODO
#ifdef __CLION_IDE__

#include <libgpu/opencl/cl/clion_defines.cl>

#endif

#line 6

#define SIZE 16

__kernel void matrix_transpose(__global unsigned int* a, __global unsigned int* at, const unsigned int m, const unsigned int k)
{
unsigned int i = get_global_id(0);
unsigned int j = get_global_id(1);

unsigned int local_i = get_local_id(0);
unsigned int local_j = get_local_id(1);

__local unsigned int buf[SIZE][SIZE + 1];

unsigned int i1 = j - local_j + local_i;
unsigned int j1 = i - local_i + local_j;

if (j < k && i < m) {
buf[local_j][local_i] = a[j * m + i];
}

barrier(CLK_LOCAL_MEM_FENCE);

if (j1 < m && i1 < k) {
at[j1 * k + i1] = buf[local_i][local_j];
}
}

__kernel void prefix_sum_up(__global unsigned int* s, unsigned int n, unsigned int p)
{
unsigned int index = get_global_id(0);
unsigned int id2 = 2 * (index + 1) * p - 1;
unsigned int id1 = id2 - p;
if (id2 < n) {
s[id2] += s[id1];
}
}


__kernel void prefix_sum_down(__global unsigned int* s, unsigned int n, unsigned int p)
{
unsigned int index = get_global_id(0);
unsigned int id2 = 2 * (index + 1) * p - 1 + p;
unsigned int id1 = id2 - p;
if (id2 < n) {
s[id2] += s[id1];
}
}

__kernel void count(__global unsigned int *ar, __global unsigned int *counters, unsigned int bit_shift, unsigned int n_bits)
{
unsigned int gid = get_global_id(0);
unsigned int grid = get_group_id(0);

unsigned int t = (ar[gid] >> bit_shift) & ((1 << n_bits) - 1);
atomic_inc(&counters[grid * (1 << n_bits) + t]);
}

__kernel void zero(__global unsigned int *as)
{
unsigned int gid = get_global_id(0);
as[gid] = 0;
}



__kernel void radix_sort(__global unsigned int *as, __global unsigned int *bs, __global unsigned int *counters, unsigned int bit_shift, unsigned int n_bits, unsigned int n)
{
unsigned int gid = get_global_id(0);
unsigned int grid = get_group_id(0);
unsigned int lid = get_local_id(0);

__local unsigned int buf[128];

buf[lid] = (as[gid] >> bit_shift) & ((1 << n_bits) - 1);

barrier(CLK_LOCAL_MEM_FENCE);

unsigned int ind = buf[lid] * get_num_groups(0) + grid;
unsigned int lidx;

if (ind > 0 && ind < n) {
lidx = counters[ind - 1];
} else {
lidx = 0;
}

unsigned int sh = 0;
for (int i = 0; i < lid; ++i) {
if (buf[i] == buf[lid]) {
sh += 1;
}
}

unsigned int target_index = sh + lidx;
if (gid < n && target_index < n) {
bs[target_index] = as[gid];
}
}
60 changes: 56 additions & 4 deletions src/main_radix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,69 @@ int main(int argc, char **argv) {

const std::vector<unsigned int> cpu_reference = computeCPU(as);

// remove me
return 0;

{

ocl::Kernel count(radix_kernel, radix_kernel_length, "count");
ocl::Kernel prefix_sum_up(radix_kernel, radix_kernel_length, "prefix_sum_up");
ocl::Kernel prefix_sum_down(radix_kernel, radix_kernel_length, "prefix_sum_down");
ocl::Kernel matrix_transpose(radix_kernel, radix_kernel_length, "matrix_transpose");
ocl::Kernel zero(radix_kernel, radix_kernel_length, "zero");
ocl::Kernel radix_sort(radix_kernel, radix_kernel_length, "radix_sort");

count.compile();
prefix_sum_up.compile();
prefix_sum_down.compile();
matrix_transpose.compile();
zero.compile();
radix_sort.compile();

constexpr unsigned int n_bits = 4;
constexpr unsigned int work_size = 128;
constexpr unsigned int transpose_work_group_size = 16;
constexpr unsigned int nd = 1 << n_bits;
constexpr unsigned int wg = (n + work_size - 1) / work_size;
constexpr unsigned int count_size = wg * nd;

gpu::gpu_mem_32u as_gpu;
as_gpu.resizeN(n);
gpu::gpu_mem_32u bs_gpu;
bs_gpu.resizeN(n);
gpu::gpu_mem_32u counters;
counters.resizeN(count_size);
gpu::gpu_mem_32u counters_tr;
counters_tr.resizeN(count_size);

timer t;
for (int iter = 0; iter < benchmarkingIters; ++iter) {
// Запускаем секундомер после прогрузки данных, чтобы замерять время работы кернела, а не трансфер данных

// TODO
as_gpu.writeN(as.data(), n);
t.restart();

for (int i = 0; i < 32; i += n_bits) {
zero.exec(gpu::WorkSize(work_size, count_size), counters);
count.exec(gpu::WorkSize(work_size, n), as_gpu, counters, i, n_bits);
matrix_transpose.exec(gpu::WorkSize(16, 16, nd, wg), counters, counters_tr, nd, wg);

unsigned int j;

for (j = 1; j < count_size; j *= 2) {
prefix_sum_up.exec(gpu::WorkSize(work_size, count_size / j / 2), counters_tr,
count_size, j);
}
for (j = count_size / 2; j > 0; j /= 2) {
prefix_sum_down.exec(gpu::WorkSize(work_size, count_size / j / 2), counters_tr,
count_size, j);
}

radix_sort.exec(gpu::WorkSize(work_size, n), as_gpu, bs_gpu, counters_tr, i,
n_bits, n);
as_gpu.swap(bs_gpu);
}
t.nextLap();
}
t.stop();
as_gpu.readN(as.data(), n);

std::cout << "GPU: " << t.lapAvg() << "+-" << t.lapStd() << " s" << std::endl;
std::cout << "GPU: " << (n / 1000.0 / 1000.0) / t.lapAvg() << " millions/s" << std::endl;
Expand Down
Loading