Commit

add topk cuda kernel

wendy12022 committed May 6, 2024
1 parent 2b698b3 commit cf4e92c
Showing 12 changed files with 314 additions and 40 deletions.
19 changes: 19 additions & 0 deletions src/04kernel/cuda/include/kernel/cuda/topk.cuh
@@ -0,0 +1,19 @@
#ifndef KERNEL_CUDA_TOPK_CUH
#define KERNEL_CUDA_TOPK_CUH

#include "threads_distributer.cuh"

namespace refactor::kernel::cuda {

void launchTopK(
KernelLaunchParameters const &params,
float const *data, float *dstVal, unsigned int *dstIdx,
unsigned int topk,
unsigned int stride_axis,
unsigned int stride_in_pre,
unsigned int stride_out_pre,
unsigned int size_axis);

}// namespace refactor::kernel::cuda

#endif// KERNEL_CUDA_TOPK_CUH
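
A reading aid, not part of the commit: for the row-major test case at the bottom of this page (input shape {3, 4}, axis = 1, topk = 3), the parameters work out as follows.

    // Hypothetical values, derived from TopKInfo's stride math:
    // size_axis      = 4   extent of the sorted axis (input.shape[1])
    // stride_axis    = 1   the axis is innermost, so its elements are contiguous
    // stride_in_pre  = 4   input stride of the dimension preceding the axis
    // stride_out_pre = 3   4 / 4 * 3: the output keeps only topk values per slice
    // dstVal and dstIdx then receive the top-3 values and their axis indices
    // for each of the 3 independent slices.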
103 changes: 103 additions & 0 deletions src/04kernel/cuda/src/topk.cu
@@ -0,0 +1,103 @@
#include "kernel/cuda/topk.cuh"
#include "macro.cuh"
#include <cstdint>
#include <thrust/device_vector.h>
#include <thrust/sort.h>

namespace refactor::kernel::cuda {

using PairType = thrust::pair<float, uint32_t>;

struct ComparePair {
__host__ __device__
bool operator()(const PairType& a, const PairType& b) const {
return a.first > b.first;
}
};

/*
__device__
void process_element(unsigned int n, float *__restrict__ dstVal,
uint32_t *__restrict__ dstIdx,
PairType *list,
uint32_t stride_axis,
uint32_t init_offset){
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
step = blockDim.x * gridDim.x;
tid < n;
tid += step) {
uint32_t offset = init_offset + stride_axis * tid;
dstVal[offset] = list[tid].first;
dstIdx[offset] = list[tid].second;
}
}
*/



__global__ static void TopKKernel(
unsigned long long n,
float const *__restrict__ data,
float *__restrict__ dstVal,
uint32_t *__restrict__ dstIdx,
uint32_t topk,
uint32_t stride_axis,
uint32_t stride_in_pre,
uint32_t stride_out_pre,
unsigned int size) {
for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
step = blockDim.x * gridDim.x;
tid < n;
tid += step) {
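// per-thread scratch buffer; device-side new allocates from the CUDA device heap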
PairType *list = new PairType[size];

for (uint32_t i = 0; i < size; i++) {
    uint32_t srcIdx = tid / stride_axis * stride_in_pre + tid % stride_axis + i * stride_axis;
    list[i] = PairType(data[srcIdx], i);
}
// thrust provides no partial_sort; a possible optimization: split the axis
// into size/topk groups and take one maximum from each group
thrust::sort(thrust::device, list, list + size, ComparePair());


uint32_t init_offset = tid / stride_axis * stride_out_pre + tid % stride_axis;
for (uint32_t i = 0; i < topk; i++) {
    uint32_t offset = init_offset + stride_axis * i;
    dstVal[offset] = list[i].first;
    dstIdx[offset] = list[i].second;
}

delete[] list;
}
}



void launchTopK(
KernelLaunchParameters const &params,
float const *data, float *dstVal, uint32_t *dstIdx,
uint32_t topk,
uint32_t stride_axis,
uint32_t stride_in_pre,
uint32_t stride_out_pre,
unsigned int size_axis) {

TopKKernel<<<
params.gridSize,
params.blockSize,
0,
reinterpret_cast<cudaStream_t>(params.stream)>>>(
params.n,
data,
dstVal,
dstIdx,
topk,
stride_axis,
stride_in_pre,
stride_out_pre,
size_axis);

}

}// namespace refactor::kernel::cuda
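
A caveat worth flagging: TopKKernel allocates its scratch list with device-side new, which draws from the CUDA device heap (8 MB by default). A minimal host-side sketch (an assumption about how callers would size the heap, not code in this commit):

    #include <cuda_runtime.h>

    // Reserve device-heap space so every resident thread can hold its
    // size_axis pairs at once; must run before the first launch of any
    // kernel that uses device-side new/delete.
    void reserveDeviceHeap(size_t bytes) {
        cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes);
    }

A rough sizing would be gridSize * blockSize * size_axis * sizeof(thrust::pair<float, uint32_t>) bytes.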
21 changes: 10 additions & 11 deletions src/04kernel/include/kernel/attributes/topk_info.h
@@ -6,18 +6,17 @@
 namespace refactor::kernel {
 
     struct TopKInfo {
-        int64_t topk;
-        int64_t axis;
-        size_t in_stride, in_stride_pre_axis, out_stride_pre_axis;
-        size_t elem_size, axis_elem_size;
-
-        TopKInfo(int64_t topk, int64_t axis, Tensor const &input);
-        size_t getElementSize() const {return elem_size;}
-        size_t getAxisElementSize()const { return axis_elem_size;}
-        size_t getInStride()const{return in_stride;}
-        size_t getInStridePreAxis()const{return in_stride_pre_axis;}
-        size_t getOutStridePreAxis()const {return out_stride_pre_axis;}
+        struct Stride {
+            dim_t axis, in_pre, out_pre;
+        };
+        struct Size {
+            dim_t axis, except_axis;
+        };
+        uint32_t topk;
+        Stride stride;
+        Size size;
+
+        TopKInfo(uint32_t topk, uint32_t axis, Tensor const &input);
     };
 
 }// namespace refactor::kernel
4 changes: 2 additions & 2 deletions src/04kernel/include/kernel/collectors/topk.h
@@ -6,9 +6,9 @@
 namespace refactor::kernel {
 
     struct TopKCollector final : public InfoCollector {
-        int64_t topk, axis;
+        uint32_t topk, axis;
 
-        constexpr TopKCollector(decltype(_target) target, int64_t topk, int64_t axis_) noexcept
+        constexpr TopKCollector(decltype(_target) target, uint32_t topk, uint32_t axis_) noexcept
             : InfoCollector(target), topk(topk), axis(axis_) {}
 
         std::vector<KernelBox>
16 changes: 9 additions & 7 deletions src/04kernel/src/attributes/topk_info.cc
@@ -3,12 +3,14 @@
 
 namespace refactor::kernel {
 
-    TopKInfo::TopKInfo(int64_t topk, int64_t axis, Tensor const &input):topk(topk),
-        axis(axis),
-        in_stride(input.strides()[axis]),
-        in_stride_pre_axis(axis == 0 ? 0 : input.strides()[axis - 1]),
-        out_stride_pre_axis(in_stride_pre_axis/input.shape[axis]*topk),
-        elem_size(input.elementsSize()),
-        axis_elem_size(input.shape[axis]){}
+    TopKInfo::TopKInfo(uint32_t topk, uint32_t axis, Tensor const &input) {
+        this->topk = topk;
+        auto tmpStride = axis == 0 ? 0 : input.strides()[axis - 1];
+        this->stride = {input.strides()[axis],
+                        tmpStride,
+                        tmpStride / input.shape[axis] * topk};
+        this->size = {input.shape[axis],
+                      input.elementsSize() / input.shape[axis]};
+    }
 
}
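
To restate the constructor's arithmetic: with row-major strides, stride.out_pre = stride.in_pre / shape[axis] * topk, since the output shrinks the sorted axis from shape[axis] to topk elements, and size.except_axis = elementsSize() / shape[axis] counts the independent slices to be sorted.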
30 changes: 15 additions & 15 deletions src/04kernel/src/kernels/topk/cpu_kernel.cc
@@ -1,6 +1,6 @@
 #include "cpu_kernel.hh"
 #include <execution>
-#include <list>
+#include <vector>
 
 namespace refactor::kernel {
     using K = TopKCpu;
@@ -29,31 +29,31 @@ namespace refactor::kernel {
         auto src = reinterpret_cast<float const *>(inputs[0]);
 
         auto dstVal = reinterpret_cast<float *>(outputs[0]);//T
-        auto dstIndex = reinterpret_cast<int64_t*>(outputs[1]);
+        auto dstIndex = reinterpret_cast<uint32_t*>(outputs[1]);
 
-        size_t M = info.getElementSize() / info.getAxisElementSize();
-        size_t N = info.getAxisElementSize();
-        auto inStride1 = info.getInStridePreAxis();
-        auto inStride2 = info.getInStride();
-        auto outStride1 = info.getOutStridePreAxis();
-        auto outStride2 = inStride2;
+        size_t M = info.size.except_axis;
+        size_t N = info.size.axis;
 
         for (size_t m = 0; m < M; m++) {
-            using PairType = std::pair<float, int64_t>;
-            std::list<PairType> list;
+            using PairType = std::pair<float, uint32_t>;
+            std::vector<PairType> list;
             for (size_t n = 0; n < N; n++) {
-                auto srcIdx = m /inStride2 * inStride1 + m % inStride2 + n * inStride2;
+                auto srcIdx = m / info.stride.axis * info.stride.in_pre + m % info.stride.axis + n * info.stride.axis;
                 list.push_back({src[srcIdx], n});
             }
-            list.sort([](const PairType &a, const PairType &b)->bool{return a.first > b.first;});
+            // partial_sort orders only the first topk elements, which is all the output needs
+            std::partial_sort(list.begin(),
+                              list.begin() + info.topk,
+                              list.end(),
+                              [](const PairType &a, const PairType &b) -> bool { return a.first > b.first; });
 
-            size_t offset = m /inStride2 * outStride1 + m % inStride2;
-            std::for_each_n(list.begin(), (int64_t)info.topk,
+            size_t offset = m / info.stride.axis * info.stride.out_pre + m % info.stride.axis;
+            std::for_each_n(list.begin(), (uint32_t)info.topk,
                             [&](auto &elem) {
                                 dstVal[offset] = elem.first;
                                 dstIndex[offset] = elem.second;
-                                offset += outStride2;
+                                offset += info.stride.axis;
                             });
         }
     };
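
A note on the index arithmetic above, illustrated with the test case at the bottom of this page (shape {3, 4}, axis = 1, so stride.axis = 1 and stride.in_pre = 4): m / stride.axis * stride.in_pre selects the block of dimensions before the axis and m % stride.axis the offset after it, so slice m = 2 reads srcIdx = 8 + n, i.e. row 2 of the input, and writes its top-3 results at output offsets 6, 7, 8.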
57 changes: 57 additions & 0 deletions src/04kernel/src/kernels/topk/cuda_kernel.cc
@@ -0,0 +1,57 @@
#include "cuda_kernel.hh"

#ifdef USE_CUDA
#include "kernel/cuda/threads_distributer.cuh"
#include "kernel/cuda/topk.cuh"
#include <cuda_runtime.h>
#include <sstream>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#endif

namespace refactor::kernel {
using K = TopKCuda;

K::TopKCuda(TopKInfo info_) noexcept
: Kernel(), info(std::move(info_)) {}

auto K::build(TopKInfo info) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif

return std::make_unique<K>(std::move(info));
}
auto K::typeId() noexcept -> size_t {
static uint8_t ID = 1;
return reinterpret_cast<size_t>(&ID);
}

auto K::kernelTypeId() const noexcept -> size_t {
return typeId();
}
auto K::description() const noexcept -> std::string_view {
return "Performing topk operation using CUDA";
}

#ifdef USE_CUDA
auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
return [info = this->info, params = cuda::ThreadsDistributer()(info.size.except_axis)]
(Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
cuda::launchTopK(
params,
reinterpret_cast<float const *>(inputs[0]),
reinterpret_cast<float *>(outputs[0]),
reinterpret_cast<uint32_t *>(outputs[1]),
info.topk,
info.stride.axis,
info.stride.in_pre,
info.stride.out_pre,
info.size.axis);
};
}
#endif
}// namespace refactor::kernel
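
Note that lower() launches one GPU thread per slice: ThreadsDistributer()(info.size.except_axis) sets params.n to the number of independent slices, so each thread sorts one full run along the axis, mirroring the CPU kernel's outer loop over m.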
26 changes: 26 additions & 0 deletions src/04kernel/src/kernels/topk/cuda_kernel.hh
@@ -0,0 +1,26 @@
#ifndef KERNEL_TOPK_CUDA_KERNEL_HH
#define KERNEL_TOPK_CUDA_KERNEL_HH

#include "kernel/attributes/topk_info.h"
#include "kernel/kernel.h"

namespace refactor::kernel {

struct TopKCuda final : public Kernel {
TopKInfo info;

explicit TopKCuda(TopKInfo) noexcept;

static KernelBox build(TopKInfo) noexcept;
static size_t typeId() noexcept;

size_t kernelTypeId() const noexcept final;
std::string_view description() const noexcept final;
#ifdef USE_CUDA
RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_TOPK_CUDA_KERNEL_HH
4 changes: 2 additions & 2 deletions src/04kernel/test/kernels/topk/test_cpu.cpp
@@ -9,7 +9,7 @@ TEST(kernel, TopKCpu) {
     // build routine
     auto inputTensor = Tensor::share(DataType::F32, Shape{3, 4});
     auto outputTensor0 = Tensor::share(DataType::F32, Shape{3, 3});
-    auto outputTensor1 = Tensor::share(DataType::I64, Shape{3, 3});
+    auto outputTensor1 = Tensor::share(DataType::U32, Shape{3, 3});
 
     auto kernel = TopKCpu::build(TopKInfo(3, 1, *inputTensor));
     ASSERT_TRUE(kernel);
@@ -28,7 +28,7 @@
 
     // check
     std::vector<float> expectVal = {3,2,1,7,6,5,11,10,9};
-    std::vector<int64_t> expectIdx = {3,2,1,3,2,1,3,2,1};
+    std::vector<uint32_t> expectIdx = {3,2,1,3,2,1,3,2,1};
     std::for_each(out0.begin(), out0.end(), [](const float &val) { std::cout << val << " "; });
 
     for (size_t i = 0; i < expectVal.size(); ++i) {