From 9b52e372dec255f85001191987b929989efdda5f Mon Sep 17 00:00:00 2001
From: YdrMaster <ydrml@hotmail.com>
Date: Fri, 1 Dec 2023 15:39:48 +0800
Subject: [PATCH] =?UTF-8?q?feat(kernel):=20=E6=B7=BB=E5=8A=A0=20Clip=20cud?=
 =?UTF-8?q?a=20kernel=20=E5=8F=8A=E5=8D=95=E5=85=83=E6=B5=8B=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: YdrMaster <ydrml@hotmail.com>
---
 src/04kernel/src/collectors/clip.cc          |  4 ++
 src/04kernel/src/kernels/clip/cpu_kernel.cc  |  2 +-
 src/04kernel/src/kernels/clip/cuda_kernel.cc | 29 +++++++
 src/04kernel/src/kernels/clip/cuda_kernel.cu | 80 ++++++++++++++++++++
 src/04kernel/src/kernels/clip/cuda_kernel.hh | 28 +++++++
 src/04kernel/test/kernels/clip/test_cuda.cpp | 51 +++++++++++++
 6 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 src/04kernel/src/kernels/clip/cuda_kernel.cc
 create mode 100644 src/04kernel/src/kernels/clip/cuda_kernel.cu
 create mode 100644 src/04kernel/src/kernels/clip/cuda_kernel.hh
 create mode 100644 src/04kernel/test/kernels/clip/test_cuda.cpp

diff --git a/src/04kernel/src/collectors/clip.cc b/src/04kernel/src/collectors/clip.cc
index 57c2b612..06ccd020 100644
--- a/src/04kernel/src/collectors/clip.cc
+++ b/src/04kernel/src/collectors/clip.cc
@@ -1,5 +1,6 @@
 #include "kernel/collectors/clip.h"
 #include "../kernels/clip/cpu_kernel.hh"
+#include "../kernels/clip/cuda_kernel.hh"
 
 namespace refactor::kernel {
 
@@ -19,6 +20,9 @@ namespace refactor::kernel {
             }
             break;
         case decltype(_target)::Nvidia:
+            if (auto ptr = ClipCuda::build(data, hasMax); ptr) {
+                ans.emplace_back(std::move(ptr));
+            }
             break;
         default:
             UNREACHABLEX(void, "Unknown target");
diff --git a/src/04kernel/src/kernels/clip/cpu_kernel.cc b/src/04kernel/src/kernels/clip/cpu_kernel.cc
index bb4a0e15..a4e7d36c 100644
--- a/src/04kernel/src/kernels/clip/cpu_kernel.cc
+++ b/src/04kernel/src/kernels/clip/cpu_kernel.cc
@@ -27,7 +27,7 @@
     }
 
     template<class T>
-    auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
+    static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
         using namespace runtime;
         return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
             auto data = reinterpret_cast<T const *>(inputs[0]);
diff --git a/src/04kernel/src/kernels/clip/cuda_kernel.cc b/src/04kernel/src/kernels/clip/cuda_kernel.cc
new file mode 100644
index 00000000..0a976211
--- /dev/null
+++ b/src/04kernel/src/kernels/clip/cuda_kernel.cc
@@ -0,0 +1,29 @@
+#include "cuda_kernel.hh"
+
+namespace refactor::kernel {
+    using K = ClipCuda;
+
+    K::ClipCuda(decltype(dataType) dt,
+                decltype(size) size_,
+                decltype(hasMax) hasMax_) noexcept
+        : dataType(dt), size(size_), hasMax(hasMax_) {
+    }
+
+    auto K::build(Tensor const &data, bool hasMax) noexcept -> KernelBox {
+        return data.dataType.isCpuNumberic()
+                   ? std::make_unique<K>(data.dataType, data.elementsSize(), hasMax)
+                   : nullptr;
+    }
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing clip operation on Nvidia GPU";
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/clip/cuda_kernel.cu b/src/04kernel/src/kernels/clip/cuda_kernel.cu
new file mode 100644
index 00000000..13219019
--- /dev/null
+++ b/src/04kernel/src/kernels/clip/cuda_kernel.cu
@@ -0,0 +1,80 @@
+#include "cuda_kernel.hh"
+#include <thrust/execution_policy.h>
+#include <thrust/transform.h>
+
+namespace refactor::kernel {
+    using K = ClipCuda;
+
+#define MAX(T, V)                         \
+    template<> struct Max<T> {            \
+        constexpr static T value = V;     \
+    }
+
+    template<class T> struct Max {};
+    MAX(float, FLT_MAX);
+    MAX(uint8_t, UCHAR_MAX);
+    MAX(int8_t, CHAR_MAX);
+    MAX(uint16_t, USHRT_MAX);
+    MAX(int16_t, SHRT_MAX);
+    MAX(int32_t, INT_MAX);
+    MAX(int64_t, LLONG_MAX);
+    // see <link lost in formatting - TODO restore original reference>
+    // how to define a constexpr half?
+    // MAX(half, static_cast<half>(65504));
+    MAX(double, DBL_MAX);
+    MAX(uint32_t, UINT_MAX);
+    MAX(uint64_t, ULLONG_MAX);
+#undef MAX
+
+    template<class T>
+    struct ClipFunctor {
+        T const *min, *max;
+
+        __device__ T operator()(T x) const noexcept {
+            T min_ = *min, max_ = max ? *max : Max<T>::value;
+            return x < min_ ? min_ : x > max_ ? max_
+                                              : x;
+        }
+    };
+
+    template<class T>
+    static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
+        fmt::println("lowering clip cuda kernel for");
+        using namespace runtime;
+        return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+            auto data = reinterpret_cast<T const *>(inputs[0]);
+            auto min = reinterpret_cast<T const *>(inputs[1]),
+                 max = hasMax
+                           ? reinterpret_cast<T const *>(inputs[2])
+                           : nullptr;
+            auto output = reinterpret_cast<T *>(outputs[0]);
+
+            thrust::transform(thrust::device,
+                              data, data + size,
+                              output,
+                              ClipFunctor<T>{min, max});
+        };
+    }
+
+    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
+#define CASE(DT)             \
+    case DataType::DT:       \
+        return lowerTyped<typename primitive<DataType::DT>::type>(size, hasMax)
+
+        switch (dataType) {
+            CASE(F32);
+            CASE(U8);
+            CASE(I8);
+            CASE(U16);
+            CASE(I16);
+            CASE(I32);
+            CASE(I64);
+            CASE(F64);
+            CASE(U32);
+            CASE(U64);
+            default:
+                UNREACHABLE();
+        }
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/clip/cuda_kernel.hh b/src/04kernel/src/kernels/clip/cuda_kernel.hh
new file mode 100644
index 00000000..08edf841
--- /dev/null
+++ b/src/04kernel/src/kernels/clip/cuda_kernel.hh
@@ -0,0 +1,28 @@
+#ifndef KERNEL_CLIP_CUDA_KERNEL_HH
+#define KERNEL_CLIP_CUDA_KERNEL_HH
+
+#include "kernel/kernel.h"
+#include "kernel/tensor.h"
+
+namespace refactor::kernel {
+
+    struct ClipCuda final : public Kernel {
+        DataType dataType;
+        size_t size;
+        bool hasMax;
+
+        ClipCuda(decltype(dataType), decltype(size), decltype(hasMax)) noexcept;
+
+        static KernelBox build(Tensor const &, bool hasMax) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+#ifdef USE_CUDA
+        RoutineWorkspace lower(Resources &) const noexcept final;
+#endif
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_CLIP_CUDA_KERNEL_HH
diff --git a/src/04kernel/test/kernels/clip/test_cuda.cpp b/src/04kernel/test/kernels/clip/test_cuda.cpp
new file mode 100644
index 00000000..9200a059
--- /dev/null
+++ b/src/04kernel/test/kernels/clip/test_cuda.cpp
@@ -0,0 +1,51 @@
+#ifdef USE_CUDA
+
+#include "../../../src/kernels/clip/cpu_kernel.hh"
+#include "../../../src/kernels/clip/cuda_kernel.hh"
+#include "hardware/device_manager.h"
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace refactor;
+using namespace kernel;
+using namespace hardware;
+
+TEST(kernel, ClipCuda) {
+    // build routine
+    auto data = Tensor::share(DataType::F32, Shape{2, 3, 4, 5});
+    auto kernel = ClipCuda::build(*data, true),
+         kCpu = ClipCpu::build(*data, true);
+    ASSERT_TRUE(kernel && kCpu);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine,
+         rCpu = kCpu->lower(res).routine;
+    // malloc
+    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
+    auto gpuMem = dev.malloc(data->bytesSize()),
+         gpuMin = dev.malloc(sizeof(float)),
+         gpuMax = dev.malloc(sizeof(float));
+    // put input data
+    std::vector<float> value(data->elementsSize());
+    float min = 30, max = 80;
+    std::iota(value.begin(), value.end(), 0);
+    gpuMem->copyFromHost(value.data(), data->bytesSize());
+    gpuMin->copyFromHost(&min, sizeof(float));
+    gpuMax->copyFromHost(&max, sizeof(float));
+    // inference
+    {
+        void const *inputs[]{*gpuMem, *gpuMin, *gpuMax};
+        void *outputs[]{*gpuMem};
+        routine(res, nullptr, inputs, outputs);
+    }
+    {
+        void const *inputs[]{value.data(), &min, &max};
+        void *outputs[]{value.data()};
+        rCpu(res, nullptr, inputs, outputs);
+    }
+    // check
+    std::vector<float> result(data->elementsSize());
+    gpuMem->copyToHost(result.data(), data->bytesSize());
+    EXPECT_EQ(result, value);
+}
+
+#endif