Skip to content

Commit

Permalink
feat(kernel): add Clip CUDA kernel and unit tests
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <ydrml@hotmail.com>
  • Loading branch information
YdrMaster committed Dec 1, 2023
1 parent c7e594f commit 9b52e37
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 1 deletion.
4 changes: 4 additions & 0 deletions src/04kernel/src/collectors/clip.cc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "kernel/collectors/clip.h"
#include "../kernels/clip/cpu_kernel.hh"
#include "../kernels/clip/cuda_kernel.hh"

namespace refactor::kernel {

Expand All @@ -19,6 +20,9 @@ namespace refactor::kernel {
}
break;
case decltype(_target)::Nvidia:
if (auto ptr = ClipCuda::build(data, hasMax); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
Expand Down
2 changes: 1 addition & 1 deletion src/04kernel/src/kernels/clip/cpu_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace refactor::kernel {
}

template<class T>
auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
using namespace runtime;
return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
auto data = reinterpret_cast<T const *>(inputs[0]);
Expand Down
29 changes: 29 additions & 0 deletions src/04kernel/src/kernels/clip/cuda_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include "cuda_kernel.hh"

namespace refactor::kernel {
    using K = ClipCuda;

    /// Records the element type, the element count and whether the optional
    /// upper-bound input is present; the lowering itself lives in cuda_kernel.cu.
    K::ClipCuda(decltype(dataType) dataType_,
                decltype(size) elementCount,
                decltype(hasMax) withMax) noexcept
        : dataType(dataType_), size(elementCount), hasMax(withMax) {
    }

    /// Builds a ClipCuda kernel for `data`, or nullptr when the element type
    /// is not a plain CPU-representable numeric type.
    auto K::build(Tensor const &data, bool hasMax) noexcept -> KernelBox {
        if (!data.dataType.isCpuNumberic()) {
            return nullptr;
        }
        return std::make_unique<K>(data.dataType, data.elementsSize(), hasMax);
    }

    auto K::typeId() noexcept -> size_t {
        // The address of this function-local static is unique per kernel type.
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }

    auto K::kernelTypeId() const noexcept -> size_t {
        return typeId();
    }
    auto K::description() const noexcept -> std::string_view {
        return "Performing clip operation on Nvidia GPU";
    }

}// namespace refactor::kernel
80 changes: 80 additions & 0 deletions src/04kernel/src/kernels/clip/cuda_kernel.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include "cuda_kernel.hh"
#include <cfloat>
#include <climits>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>

namespace refactor::kernel {
    using K = ClipCuda;

    /// Specializes Max<T>::value to the numeric maximum of T; used as the
    /// upper bound when the optional "max" input is absent.
#define MAX(T, V)                     \
    template<> struct Max<T> {        \
        constexpr static T value = V; \
    }

    template<class T> struct Max {};
    MAX(float, FLT_MAX);
    MAX(uint8_t, UCHAR_MAX);
    // SCHAR_MAX, not CHAR_MAX: int8_t is always signed, while the signedness
    // of plain char (and hence the value of CHAR_MAX) is implementation-defined.
    MAX(int8_t, SCHAR_MAX);
    MAX(uint16_t, USHRT_MAX);
    MAX(int16_t, SHRT_MAX);
    MAX(int32_t, INT_MAX);
    MAX(int64_t, LLONG_MAX);
    // see <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0192r0.pdf>
    // how to define a constexpr half?
    // MAX(half, static_cast<half>(65504));
    MAX(double, DBL_MAX);
    MAX(uint32_t, UINT_MAX);
    MAX(uint64_t, ULLONG_MAX);
#undef MAX

    /// Clamps each element to [*min, *max]. `max` may be null, in which case
    /// the type's numeric maximum is used as the upper bound.
    template<class T>
    struct ClipFunctor {
        T const *min, *max;// device pointers to the scalar bounds

        __device__ T operator()(T x) const noexcept {
            T min_ = *min, max_ = max ? *max : Max<T>::value;
            return x < min_ ? min_ : x > max_ ? max_
                                              : x;
        }
    };

    /// Builds the type-erased routine for element type T.
    /// inputs[0] = data, inputs[1] = min, inputs[2] = max (present iff hasMax).
    template<class T>
    static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
        using namespace runtime;
        return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
            auto data = reinterpret_cast<T const *>(inputs[0]);
            auto min = reinterpret_cast<T const *>(inputs[1]),
                 max = hasMax
                           ? reinterpret_cast<T const *>(inputs[2])
                           : nullptr;
            auto output = reinterpret_cast<T *>(outputs[0]);

            thrust::transform(thrust::device,
                              data, data + size,
                              output,
                              ClipFunctor<T>{min, max});
        };
    }

    /// Dispatches to lowerTyped<T> according to the stored element type.
    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
#define CASE(DT)           \
    case DataType::DT:     \
        return lowerTyped<primitive<DataType::DT>::type>(size, hasMax)

        switch (dataType) {
            CASE(F32);
            CASE(U8);
            CASE(I8);
            CASE(U16);
            CASE(I16);
            CASE(I32);
            CASE(I64);
            CASE(F64);
            CASE(U32);
            CASE(U64);
            default:
                UNREACHABLE();
        }
#undef CASE
    }

}// namespace refactor::kernel
28 changes: 28 additions & 0 deletions src/04kernel/src/kernels/clip/cuda_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef KERNEL_CLIP_CUDA_KERNEL_HH
#define KERNEL_CLIP_CUDA_KERNEL_HH

#include "kernel/kernel.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

/// CUDA implementation of the ONNX-style Clip operator: clamps every element
/// of a tensor to [min, max], where max is an optional second bound.
struct ClipCuda final : public Kernel {
    DataType dataType;// element type of the tensor being clipped
    size_t size;      // number of elements to process
    bool hasMax;      // whether the optional upper-bound input is present

    ClipCuda(decltype(dataType), decltype(size), decltype(hasMax)) noexcept;

    /// Returns a kernel for `data`, or nullptr when the element type is unsupported.
    static KernelBox build(Tensor const &, bool hasMax) noexcept;
    static size_t typeId() noexcept;

    size_t kernelTypeId() const noexcept final;
    std::string_view description() const noexcept final;
#ifdef USE_CUDA
    // Only available in CUDA builds; implemented in cuda_kernel.cu.
    RoutineWorkspace lower(Resources &) const noexcept final;
#endif
};

}// namespace refactor::kernel

#endif// KERNEL_CLIP_CUDA_KERNEL_HH
51 changes: 51 additions & 0 deletions src/04kernel/test/kernels/clip/test_cuda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#ifdef USE_CUDA

#include "../../../src/kernels/clip/cpu_kernel.hh"
#include "../../../src/kernels/clip/cuda_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, ClipCuda) {
    // Build both the CUDA kernel under test and the CPU reference kernel.
    auto tensor = Tensor::share(DataType::F32, Shape{2, 3, 4, 5});
    auto kCuda = ClipCuda::build(*tensor, true);
    auto kCpu = ClipCpu::build(*tensor, true);
    ASSERT_TRUE(kCuda && kCpu);
    auto res = runtime::Resources();
    auto rCuda = kCuda->lower(res).routine;
    auto rCpu = kCpu->lower(res).routine;
    // Allocate device buffers for the data and the two scalar bounds.
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    auto devData = dev.malloc(tensor->bytesSize());
    auto devMin = dev.malloc(sizeof(float));
    auto devMax = dev.malloc(sizeof(float));
    // Fill the input with 0,1,2,... and upload it together with the bounds.
    float min = 30, max = 80;
    std::vector<float> host(tensor->elementsSize());
    std::iota(host.begin(), host.end(), 0);
    devData->copyFromHost(host.data(), tensor->bytesSize());
    devMin->copyFromHost(&min, sizeof(float));
    devMax->copyFromHost(&max, sizeof(float));
    // Run the CUDA kernel in place on the device buffer.
    {
        void const *inputs[]{*devData, *devMin, *devMax};
        void *outputs[]{*devData};
        rCuda(res, nullptr, inputs, outputs);
    }
    // Run the CPU reference in place on the host buffer.
    {
        void const *inputs[]{host.data(), &min, &max};
        void *outputs[]{host.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // The device result must match the CPU reference exactly.
    std::vector<float> result(tensor->elementsSize());
    devData->copyToHost(result.data(), tensor->bytesSize());
    EXPECT_EQ(result, host);
}

#endif

0 comments on commit 9b52e37

Please sign in to comment.