Skip to content

Commit

Permalink
feat(kernel): implement the CastCpu kernel
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <ydrml@hotmail.com>
  • Loading branch information
YdrMaster committed Dec 8, 2023
1 parent 5294b14 commit 14cca60
Show file tree
Hide file tree
Showing 7 changed files with 209 additions and 1 deletion.
11 changes: 11 additions & 0 deletions src/04kernel/src/collectors/cast.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include "kernel/collectors/cast.h"
#include "../kernels/cast/cpu_kernel.hh"
#include "../kernels/cast/cuda_kernel.hh"

namespace refactor::kernel {

Expand All @@ -7,11 +9,20 @@ namespace refactor::kernel {

std::vector<KernelBox>
CastCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
auto const &from = inputs[0];
auto const &to = outputs[0];

std::vector<KernelBox> ans;
switch (_target) {
case decltype(_target)::Cpu:
if (auto ptr = CastCpu::build(from, to); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
case decltype(_target)::Nvidia:
if (auto ptr = CastCuda::build(from, to); ptr) {
ans.emplace_back(std::move(ptr));
}
break;
default:
UNREACHABLEX(void, "Unknown target");
Expand Down
79 changes: 79 additions & 0 deletions src/04kernel/src/kernels/cast/cpu_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#include "cpu_kernel.hh"
#include <execution>

namespace refactor::kernel {
using K = CastCpu;

K::CastCpu(decltype(from) from_,
decltype(to) to_,
decltype(size) size_) noexcept
: from(from_), to(to_), size(size_) {}

auto K::build(Tensor const &from, Tensor const &to) noexcept -> KernelBox {
return std::make_unique<K>(from.dataType, to.dataType, from.elementsSize());
}
    auto K::typeId() noexcept -> size_t {
        // The address of a function-local static is unique for the whole
        // program, so it doubles as a cheap process-wide kernel type id.
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }

    auto K::kernelTypeId() const noexcept -> size_t {
        // Virtual accessor forwarding to the static per-class id.
        return typeId();
    }
    auto K::description() const noexcept -> std::string_view {
        // Human-readable kernel label.
        return "Performing cast operation on generic cpu";
    }

template<class T, class U>
static auto lowerTyped(size_t size) noexcept -> RoutineWorkspace {
using namespace runtime;
return [=](Resources &, void *, void const *const *inputs, void *const *outputs) {
auto x = reinterpret_cast<T const *>(inputs[0]);
auto y = reinterpret_cast<U *>(outputs[0]);
std::transform(std::execution::par_unseq,
x, x + size,
y,
[=](auto a) { return static_cast<U>(a); });
};
}

auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
#define CASE_U(T, U) \
case DataType::U: \
return lowerTyped<T, primitive<DataType::U>::type>(size)

#define CASE_T(T) \
case DataType::T: \
switch (to) { \
CASE_U(primitive<DataType::T>::type, U8); \
CASE_U(primitive<DataType::T>::type, U16); \
CASE_U(primitive<DataType::T>::type, U32); \
CASE_U(primitive<DataType::T>::type, U64); \
CASE_U(primitive<DataType::T>::type, I8); \
CASE_U(primitive<DataType::T>::type, I16); \
CASE_U(primitive<DataType::T>::type, I32); \
CASE_U(primitive<DataType::T>::type, I64); \
CASE_U(primitive<DataType::T>::type, F32); \
CASE_U(primitive<DataType::T>::type, F64); \
CASE_U(primitive<DataType::T>::type, Bool); \
default: \
UNREACHABLE(); \
}
switch (from) {
CASE_T(U8);
CASE_T(U16);
CASE_T(U32);
CASE_T(U64);
CASE_T(I8);
CASE_T(I16);
CASE_T(I32);
CASE_T(I64);
CASE_T(F32);
CASE_T(F64);
CASE_T(Bool);
default:
UNREACHABLE();
}
}// namespace refactor::kernel

}// namespace refactor::kernel
25 changes: 25 additions & 0 deletions src/04kernel/src/kernels/cast/cpu_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef KERNEL_CAST_CPU_KERNEL_HH
#define KERNEL_CAST_CPU_KERNEL_HH

#include "kernel/kernel.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

    /// CPU kernel for the Cast operator: element-wise conversion of
    /// `size` elements from dtype `from` to dtype `to`.
    struct CastCpu final : public Kernel {
        DataType from, to;// source / destination element types
        size_t size;      // number of elements to convert

        CastCpu(decltype(from), decltype(to), decltype(size)) noexcept;

        static KernelBox build(Tensor const &, Tensor const &) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
        RoutineWorkspace lower(Resources &) const noexcept final;
    };

}// namespace refactor::kernel

#endif// KERNEL_CAST_CPU_KERNEL_HH
36 changes: 36 additions & 0 deletions src/04kernel/src/kernels/cast/cuda_kernel.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "cuda_kernel.hh"

#ifdef USE_CUDA
#include "../../generator/nvrtc_repo.h"
#include "kernel/cuda/threads_distributer.cuh"
#include <cuda_runtime.h>
#endif

namespace refactor::kernel {
using K = CastCuda;

    // Record the source dtype, destination dtype and element count; the
    // lowering routine is only compiled when USE_CUDA is defined (see header).
    K::CastCuda(decltype(from) from_,
                decltype(to) to_,
                decltype(size) size_) noexcept
        : from(from_), to(to_), size(size_) {}

auto K::build(Tensor const &from, Tensor const &to) noexcept -> KernelBox {
#ifndef USE_CUDA
return nullptr;
#endif

return std::make_unique<K>(from.dataType, to.dataType, from.elementsSize());
}
    auto K::typeId() noexcept -> size_t {
        // The address of a function-local static is unique for the whole
        // program, so it doubles as a cheap process-wide kernel type id.
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }

    auto K::kernelTypeId() const noexcept -> size_t {
        // Virtual accessor forwarding to the static per-class id.
        return typeId();
    }
    auto K::description() const noexcept -> std::string_view {
        // Human-readable kernel label.
        return "Performing cast operation on Nvidia gpu";
    }

}// namespace refactor::kernel
27 changes: 27 additions & 0 deletions src/04kernel/src/kernels/cast/cuda_kernel.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#ifndef KERNEL_CAST_CUDA_KERNEL_HH
#define KERNEL_CAST_CUDA_KERNEL_HH

#include "kernel/kernel.h"
#include "kernel/tensor.h"

namespace refactor::kernel {

    /// Nvidia GPU kernel for the Cast operator: element-wise conversion
    /// of `size` elements from dtype `from` to dtype `to`.
    struct CastCuda final : public Kernel {
        DataType from, to;// source / destination element types
        size_t size;      // number of elements to convert

        CastCuda(decltype(from), decltype(to), decltype(size)) noexcept;

        static KernelBox build(Tensor const &, Tensor const &) noexcept;
        static size_t typeId() noexcept;

        size_t kernelTypeId() const noexcept final;
        std::string_view description() const noexcept final;
#ifdef USE_CUDA
        // Lowering requires the CUDA toolchain, hence compiled conditionally.
        RoutineWorkspace lower(Resources &) const final;
#endif
    };

}// namespace refactor::kernel

#endif// KERNEL_CAST_CUDA_KERNEL_HH
2 changes: 1 addition & 1 deletion src/04kernel/src/kernels/clip/cpu_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace refactor::kernel {
template<class T>
static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
using namespace runtime;
return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
return [=](Resources &, void *, void const *const *inputs, void *const *outputs) {
auto data = reinterpret_cast<T const *>(inputs[0]);
auto min = *reinterpret_cast<T const *>(inputs[1]),
max = hasMax
Expand Down
30 changes: 30 additions & 0 deletions src/04kernel/test/kernels/cast/test_cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include "../../../src/kernels/cast/cpu_kernel.hh"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;

TEST(kernel, CastCpu) {
    // build routine
    auto x = Tensor::share(DataType::F64, Shape{2, 3, 4, 5});
    auto y = Tensor::share(DataType::I8, Shape{2, 3, 4, 5});
    auto kernel = CastCpu::build(*x, *y);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;
    // put input data — the buffer's element type must match x's dtype
    // (F64 -> double). The original used std::vector<float>, so the
    // routine read 120 doubles out of a 120-float buffer: an
    // out-of-bounds read producing garbage values.
    std::vector<double> x_(x->elementsSize());
    std::vector<int8_t> y_(y->elementsSize());
    std::iota(x_.begin(), x_.end(), 0);
    // inference
    {
        void const *inputs[]{x_.data()};
        void *outputs[]{y_.data()};
        routine(res, nullptr, inputs, outputs);
    }
    // check: each element was 0..119, all representable in int8_t.
    // (Use int for the loop bound; casting the count to int8_t would
    // overflow as soon as the shape grows past 127 elements.)
    for (auto i : range0_(static_cast<int>(y->elementsSize()))) {
        EXPECT_EQ(y_[i], i);
    }
}

0 comments on commit 14cca60

Please sign in to comment.