feat(kernel): add Clip CUDA kernel and unit tests
Signed-off-by: YdrMaster <ydrml@hotmail.com>
Showing 6 changed files with 193 additions and 1 deletion.
@@ -0,0 +1,29 @@
#include "cuda_kernel.hh"

namespace refactor::kernel {
    using K = ClipCuda;

    K::ClipCuda(decltype(dataType) dt,
                decltype(size) size_,
                decltype(hasMax) hasMax_) noexcept
        : dataType(dt), size(size_), hasMax(hasMax_) {
    }

    auto K::build(Tensor const &data, bool hasMax) noexcept -> KernelBox {
        return data.dataType.isCpuNumberic()
                   ? std::make_unique<K>(data.dataType, data.elementsSize(), hasMax)
                   : nullptr;
    }
    auto K::typeId() noexcept -> size_t {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }

    auto K::kernelTypeId() const noexcept -> size_t {
        return typeId();
    }
    auto K::description() const noexcept -> std::string_view {
        return "Performing clip operation on Nvidia GPU";
    }

}// namespace refactor::kernel
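
A side note on the `typeId` idiom above: taking the address of a function-local `static` yields a value that is unique per function for the lifetime of the process, so every kernel class gets a distinct ID without a central registry. A minimal standalone sketch of the idiom (the `idA`/`idB` names are illustrative, not from this repository):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Each function owns its own static byte, so the two addresses are
// guaranteed to differ and stay stable for the whole process.
static size_t idA() { static uint8_t ID = 1; return reinterpret_cast<size_t>(&ID); }
static size_t idB() { static uint8_t ID = 1; return reinterpret_cast<size_t>(&ID); }

int main() {
    std::printf("idA=%zu idB=%zu distinct=%d\n", idA(), idB(), int(idA() != idB()));
    return 0;
}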
@@ -0,0 +1,80 @@
#include "cuda_kernel.hh"
#include <cfloat>
#include <climits>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>

namespace refactor::kernel {
    using K = ClipCuda;

// Generates a specialization of Max<T>, mapping each element type to the
// saturation value used when no max tensor is provided.
#define MAX(T, V)                     \
    template<> struct Max<T> {        \
        constexpr static T value = V; \
    }

    template<class T> struct Max {};
    MAX(float, FLT_MAX);
    MAX(uint8_t, UCHAR_MAX);
    MAX(int8_t, CHAR_MAX);
    MAX(uint16_t, USHRT_MAX);
    MAX(int16_t, SHRT_MAX);
    MAX(int32_t, INT_MAX);
    MAX(int64_t, LLONG_MAX);
    // see <https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0192r0.pdf>
    // how to define a constexpr half?
    // MAX(half, static_cast<half>(65504));
    MAX(double, DBL_MAX);
    MAX(uint32_t, UINT_MAX);
    MAX(uint64_t, ULLONG_MAX);
#undef MAX

    template<class T>
    struct ClipFunctor {
        T const *min, *max;

        __device__ T operator()(T x) const noexcept {
            T min_ = *min, max_ = max ? *max : Max<T>::value;
            return x < min_ ? min_ : x > max_ ? max_ : x;
        }
    };

    template<class T>
    static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
        using namespace runtime;
        return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
            auto data = reinterpret_cast<T const *>(inputs[0]);
            auto min = reinterpret_cast<T const *>(inputs[1]),
                 max = hasMax
                           ? reinterpret_cast<T const *>(inputs[2])
                           : nullptr;
            auto output = reinterpret_cast<T *>(outputs[0]);

            thrust::transform(thrust::device,
                              data, data + size,
                              output,
                              ClipFunctor<T>{min, max});
        };
    }

    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
#define CASE(DT)       \
    case DataType::DT: \
        return lowerTyped<primitive<DataType::DT>::type>(size, hasMax)

        switch (dataType) {
            CASE(F32);
            CASE(U8);
            CASE(I8);
            CASE(U16);
            CASE(I16);
            CASE(I32);
            CASE(I64);
            CASE(F64);
            CASE(U32);
            CASE(U64);
            default:
                UNREACHABLE();
        }
    }

}// namespace refactor::kernel
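
The lowered routine is a single `thrust::transform` over the flattened tensor: each element passes through `ClipFunctor`, saturating to `*min` from below and to `*max` (or the element type's maximum via `Max<T>` when no max tensor is bound) from above. A host-side sketch of the same element-wise semantics using `std::transform`, for illustration only, not part of the commit:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
    // Clip each element into [min, max], mirroring ClipFunctor on the host.
    std::vector<float> in{0.f, 50.f, 100.f}, out(in.size());
    float min = 30.f, max = 80.f;
    std::transform(in.begin(), in.end(), out.begin(),
                   [=](float x) { return x < min ? min : x > max ? max : x; });
    assert(out[0] == 30.f && out[1] == 50.f && out[2] == 80.f);
    return 0;
}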
@@ -0,0 +1,28 @@
#ifndef KERNEL_CLIP_CUDA_KERNEL_HH
#define KERNEL_CLIP_CUDA_KERNEL_HH

#include "kernel/kernel.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

    struct ClipCuda final : public Kernel {
        DataType dataType;
        size_t size;
        bool hasMax;

        ClipCuda(decltype(dataType), decltype(size), decltype(hasMax)) noexcept;

        static KernelBox build(Tensor const &, bool hasMax) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
#ifdef USE_CUDA
        RoutineWorkspace lower(Resources &) const noexcept final;
#endif
    };

}// namespace refactor::kernel

#endif// KERNEL_CLIP_CUDA_KERNEL_HH
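
The constructor's `decltype(member)` parameter style ties the parameter types to the field declarations, so changing a field's type can never leave the signature stale. A small sketch of the idiom (the `Example` type is hypothetical, not part of the commit):

#include <cstddef>

struct Example {
    size_t size;
    bool hasMax;
    // decltype(size) / decltype(hasMax) resolve to the fields' declared types.
    Example(decltype(size) size_, decltype(hasMax) hasMax_) noexcept
        : size(size_), hasMax(hasMax_) {}
};

int main() {
    Example e(120, true);
    return (e.size == 120 && e.hasMax) ? 0 : 1;
}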
@@ -0,0 +1,51 @@
#ifdef USE_CUDA

#include "../../../src/kernels/clip/cpu_kernel.hh"
#include "../../../src/kernels/clip/cuda_kernel.hh"
#include "hardware/device_manager.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, ClipCuda) {
    // build routine
    auto data = Tensor::share(DataType::F32, Shape{2, 3, 4, 5});
    auto kernel = ClipCuda::build(*data, true),
         kCpu = ClipCpu::build(*data, true);
    ASSERT_TRUE(kernel && kCpu);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine,
         rCpu = kCpu->lower(res).routine;
    // malloc
    auto &dev = *device::init(Device::Type::Nvidia, 0, "");
    auto gpuMem = dev.malloc(data->bytesSize()),
         gpuMin = dev.malloc(sizeof(float)),
         gpuMax = dev.malloc(sizeof(float));
    // put input data
    std::vector<float> value(data->elementsSize());
    float min = 30, max = 80;
    std::iota(value.begin(), value.end(), 0);
    gpuMem->copyFromHost(value.data(), data->bytesSize());
    gpuMin->copyFromHost(&min, sizeof(float));
    gpuMax->copyFromHost(&max, sizeof(float));
    // inference
    {
        void const *inputs[]{*gpuMem, *gpuMin, *gpuMax};
        void *outputs[]{*gpuMem};
        routine(res, nullptr, inputs, outputs);
    }
    {
        void const *inputs[]{value.data(), &min, &max};
        void *outputs[]{value.data()};
        rCpu(res, nullptr, inputs, outputs);
    }
    // check
    std::vector<float> result(data->elementsSize());
    gpuMem->copyToHost(result.data(), data->bytesSize());
    EXPECT_EQ(result, value);
}

#endif
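
The test runs the same clip on both backends and compares results element-for-element: with the input filled by `std::iota` (0 through 119) and `min = 30`, `max = 80`, the first 30 outputs should saturate to 30, values 30 through 80 should pass through unchanged, and the remainder should saturate to 80. Both routines run in place on their respective buffers, so `EXPECT_EQ` checks the GPU result copied back to host against the CPU-clipped `value` vector.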