diff --git a/src/04kernel/src/collectors/cast.cc b/src/04kernel/src/collectors/cast.cc
index 9ccfcd13c..bca9d2629 100644
--- a/src/04kernel/src/collectors/cast.cc
+++ b/src/04kernel/src/collectors/cast.cc
@@ -1,4 +1,6 @@
 #include "kernel/collectors/cast.h"
+#include "../kernels/cast/cpu_kernel.hh"
+#include "../kernels/cast/cuda_kernel.hh"
 
 namespace refactor::kernel {
 
@@ -7,11 +9,20 @@ namespace refactor::kernel {
 
     std::vector<KernelBox>
     CastCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
+        auto const &from = inputs[0];
+        auto const &to = outputs[0];
+
         std::vector<KernelBox> ans;
         switch (_target) {
             case decltype(_target)::Cpu:
+                if (auto ptr = CastCpu::build(from, to); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
                 break;
             case decltype(_target)::Nvidia:
+                if (auto ptr = CastCuda::build(from, to); ptr) {
+                    ans.emplace_back(std::move(ptr));
+                }
                 break;
             default:
                 UNREACHABLEX(void, "Unknown target");
diff --git a/src/04kernel/src/kernels/cast/cpu_kernel.cc b/src/04kernel/src/kernels/cast/cpu_kernel.cc
new file mode 100644
index 000000000..7e125c861
--- /dev/null
+++ b/src/04kernel/src/kernels/cast/cpu_kernel.cc
@@ -0,0 +1,79 @@
+#include "cpu_kernel.hh"
+#include <execution>
+
+namespace refactor::kernel {
+    using K = CastCpu;
+
+    K::CastCpu(decltype(from) from_,
+               decltype(to) to_,
+               decltype(size) size_) noexcept
+        : from(from_), to(to_), size(size_) {}
+
+    auto K::build(Tensor const &from, Tensor const &to) noexcept -> KernelBox {
+        return std::make_unique<K>(from.dataType, to.dataType, from.elementsSize());
+    }
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing cast operation on generic cpu";
+    }
+
+    template<class T, class U>
+    static auto lowerTyped(size_t size) noexcept -> RoutineWorkspace {
+        using namespace runtime;
+        return [=](Resources &, void *, void const *const *inputs, void *const *outputs) {
+            auto x = reinterpret_cast<T const *>(inputs[0]);
+            auto y = reinterpret_cast<U *>(outputs[0]);
+            std::transform(std::execution::par_unseq,
+                           x, x + size,
+                           y,
+                           [=](auto a) { return static_cast<U>(a); });
+        };
+    }
+
+    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
+#define CASE_U(T, U)  \
+    case DataType::U: \
+        return lowerTyped<T, primitive<DataType::U>::type>(size)
+
+#define CASE_T(T)                                       \
+    case DataType::T:                                   \
+        switch (to) {                                   \
+            CASE_U(primitive<DataType::T>::type, U8);   \
+            CASE_U(primitive<DataType::T>::type, U16);  \
+            CASE_U(primitive<DataType::T>::type, U32);  \
+            CASE_U(primitive<DataType::T>::type, U64);  \
+            CASE_U(primitive<DataType::T>::type, I8);   \
+            CASE_U(primitive<DataType::T>::type, I16);  \
+            CASE_U(primitive<DataType::T>::type, I32);  \
+            CASE_U(primitive<DataType::T>::type, I64);  \
+            CASE_U(primitive<DataType::T>::type, F32);  \
+            CASE_U(primitive<DataType::T>::type, F64);  \
+            CASE_U(primitive<DataType::T>::type, Bool); \
+            default:                                    \
+                UNREACHABLE();                          \
+        }
+
+        switch (from) {
+            CASE_T(U8);
+            CASE_T(U16);
+            CASE_T(U32);
+            CASE_T(U64);
+            CASE_T(I8);
+            CASE_T(I16);
+            CASE_T(I32);
+            CASE_T(I64);
+            CASE_T(F32);
+            CASE_T(F64);
+            CASE_T(Bool);
+            default:
+                UNREACHABLE();
+        }
+    }
+
+}// namespace refactor::kernel
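Editorial note: the CASE_T/CASE_U macros above expand into a nested switch over the runtime (from, to) type pair, so every combination dispatches to the statically typed lowerTyped<T, U>. A minimal sketch of the same switch-to-template dispatch pattern, reduced to two types with invented names (not part of this diff), for illustration only:

// Editorial sketch only: the same dispatch idea with a reduced type set and
// hypothetical names; the real kernel covers all 11 DataTypes via the macros.
#include <cstddef>
#include <cstdint>
#include <functional>

enum class DT { I32, F32 };
using Fn = std::function<void(void const *, void *, size_t)>;

template<class T, class U>
Fn lowerTyped() {
    return [](void const *in, void *out, size_t n) {
        auto x = static_cast<T const *>(in);
        auto y = static_cast<U *>(out);
        for (size_t i = 0; i < n; ++i) { y[i] = static_cast<U>(x[i]); }
    };
}

Fn lower(DT from, DT to) {
    switch (from) {
        case DT::I32:
            switch (to) {
                case DT::I32: return lowerTyped<int32_t, int32_t>();
                case DT::F32: return lowerTyped<int32_t, float>();
            }
            break;
        case DT::F32:
            switch (to) {
                case DT::I32: return lowerTyped<float, int32_t>();
                case DT::F32: return lowerTyped<float, float>();
            }
            break;
    }
    return {};
}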
diff --git a/src/04kernel/src/kernels/cast/cpu_kernel.hh b/src/04kernel/src/kernels/cast/cpu_kernel.hh
new file mode 100644
index 000000000..3f138d572
--- /dev/null
+++ b/src/04kernel/src/kernels/cast/cpu_kernel.hh
@@ -0,0 +1,25 @@
+#ifndef KERNEL_CAST_CPU_KERNEL_HH
+#define KERNEL_CAST_CPU_KERNEL_HH
+
+#include "kernel/kernel.h"
+#include "kernel/tensor.h"
+
+namespace refactor::kernel {
+
+    struct CastCpu final : public Kernel {
+        DataType from, to;
+        size_t size;
+
+        CastCpu(decltype(from), decltype(to), decltype(size)) noexcept;
+
+        static KernelBox build(Tensor const &, Tensor const &) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+        RoutineWorkspace lower(Resources &) const noexcept final;
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_CAST_CPU_KERNEL_HH
diff --git a/src/04kernel/src/kernels/cast/cuda_kernel.cc b/src/04kernel/src/kernels/cast/cuda_kernel.cc
new file mode 100644
index 000000000..3de3eb632
--- /dev/null
+++ b/src/04kernel/src/kernels/cast/cuda_kernel.cc
@@ -0,0 +1,36 @@
+#include "cuda_kernel.hh"
+
+#ifdef USE_CUDA
+#include "../../generator/nvrtc_repo.h"
+#include "kernel/cuda/threads_distributer.cuh"
+#include <cuda_runtime.h>
+#endif
+
+namespace refactor::kernel {
+    using K = CastCuda;
+
+    K::CastCuda(decltype(from) from_,
+                decltype(to) to_,
+                decltype(size) size_) noexcept
+        : from(from_), to(to_), size(size_) {}
+
+    auto K::build(Tensor const &from, Tensor const &to) noexcept -> KernelBox {
+#ifndef USE_CUDA
+        return nullptr;
+#endif
+
+        return std::make_unique<K>(from.dataType, to.dataType, from.elementsSize());
+    }
+    auto K::typeId() noexcept -> size_t {
+        static uint8_t ID = 1;
+        return reinterpret_cast<size_t>(&ID);
+    }
+
+    auto K::kernelTypeId() const noexcept -> size_t {
+        return typeId();
+    }
+    auto K::description() const noexcept -> std::string_view {
+        return "Performing cast operation on Nvidia gpu";
+    }
+
+}// namespace refactor::kernel
diff --git a/src/04kernel/src/kernels/cast/cuda_kernel.hh b/src/04kernel/src/kernels/cast/cuda_kernel.hh
new file mode 100644
index 000000000..775893e6e
--- /dev/null
+++ b/src/04kernel/src/kernels/cast/cuda_kernel.hh
@@ -0,0 +1,27 @@
+#ifndef KERNEL_CAST_CUDA_KERNEL_HH
+#define KERNEL_CAST_CUDA_KERNEL_HH
+
+#include "kernel/kernel.h"
+#include "kernel/tensor.h"
+
+namespace refactor::kernel {
+
+    struct CastCuda final : public Kernel {
+        DataType from, to;
+        size_t size;
+
+        CastCuda(decltype(from), decltype(to), decltype(size)) noexcept;
+
+        static KernelBox build(Tensor const &, Tensor const &) noexcept;
+        static size_t typeId() noexcept;
+
+        size_t kernelTypeId() const noexcept final;
+        std::string_view description() const noexcept final;
+#ifdef USE_CUDA
+        RoutineWorkspace lower(Resources &) const final;
+#endif
+    };
+
+}// namespace refactor::kernel
+
+#endif// KERNEL_CAST_CUDA_KERNEL_HH
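Editorial note: the CUDA `lower` is only declared above (under USE_CUDA) and its body is not part of this diff; the nvrtc_repo.h and threads_distributer.cuh includes suggest it is generated and launched elsewhere in the repository. Purely as an illustration of what the operation computes, and not the project's implementation, an equivalent elementwise cast over device memory could be sketched with Thrust (hypothetical names, compiled as CUDA C++ with nvcc):

// Editorial illustration only (hypothetical, not from this diff): an
// elementwise cast over device memory using Thrust.
#include <cstddef>
#include <thrust/execution_policy.h>
#include <thrust/transform.h>

template<class T, class U>
struct CastOp {
    __device__ U operator()(T x) const { return static_cast<U>(x); }
};

template<class T, class U>
void castOnDevice(T const *x, U *y, size_t size) {
    // x and y are raw device pointers; thrust::device treats them as device iterators.
    thrust::transform(thrust::device, x, x + size, y, CastOp<T, U>{});
}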
diff --git a/src/04kernel/src/kernels/clip/cpu_kernel.cc b/src/04kernel/src/kernels/clip/cpu_kernel.cc
index a4e7d36c4..b48735e21 100644
--- a/src/04kernel/src/kernels/clip/cpu_kernel.cc
+++ b/src/04kernel/src/kernels/clip/cpu_kernel.cc
@@ -29,7 +29,7 @@ namespace refactor::kernel {
     template<class T>
     static auto lowerTyped(size_t size, bool hasMax) noexcept -> RoutineWorkspace {
         using namespace runtime;
-        return [=](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
+        return [=](Resources &, void *, void const *const *inputs, void *const *outputs) {
             auto data = reinterpret_cast<T const *>(inputs[0]);
             auto min = *reinterpret_cast<T const *>(inputs[1]),
                  max = hasMax
diff --git a/src/04kernel/test/kernels/cast/test_cpu.cpp b/src/04kernel/test/kernels/cast/test_cpu.cpp
new file mode 100644
index 000000000..6241ac8d7
--- /dev/null
+++ b/src/04kernel/test/kernels/cast/test_cpu.cpp
@@ -0,0 +1,30 @@
+#include "../../../src/kernels/cast/cpu_kernel.hh"
+#include <gtest/gtest.h>
+#include <numeric>
+
+using namespace refactor;
+using namespace kernel;
+
+TEST(kernel, CastCpu) {
+    // build routine
+    auto x = Tensor::share(DataType::F64, Shape{2, 3, 4, 5});
+    auto y = Tensor::share(DataType::I8, Shape{2, 3, 4, 5});
+    auto kernel = CastCpu::build(*x, *y);
+    ASSERT_TRUE(kernel);
+    auto res = runtime::Resources();
+    auto routine = kernel->lower(res).routine;
+    // put input data
+    std::vector<double> x_(x->elementsSize());
+    std::vector<int8_t> y_(y->elementsSize());
+    std::iota(x_.begin(), x_.end(), 0);
+    // inference
+    {
+        void const *inputs[]{x_.data()};
+        void *outputs[]{y_.data()};
+        routine(res, nullptr, inputs, outputs);
+    }
+    // check
+    for (auto i : range0_(static_cast<int8_t>(y->elementsSize()))) {
+        EXPECT_EQ(y_[i], i);
+    }
+}
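Editorial note: with the same includes as test_cpu.cpp above, another type pair could be exercised through the identical build/lower/routine flow. A hypothetical extra case, shown for illustration and not part of this diff:

// Hypothetical extra case (editorial sketch, not in this diff): I32 -> F32
// through the same build/lower/routine flow as the test above.
TEST(kernel, CastCpuI32toF32) {
    auto x = Tensor::share(DataType::I32, Shape{2, 3, 4, 5});
    auto y = Tensor::share(DataType::F32, Shape{2, 3, 4, 5});
    auto kernel = CastCpu::build(*x, *y);
    ASSERT_TRUE(kernel);
    auto res = runtime::Resources();
    auto routine = kernel->lower(res).routine;
    std::vector<int32_t> x_(x->elementsSize());
    std::vector<float> y_(y->elementsSize());
    std::iota(x_.begin(), x_.end(), 0);
    void const *inputs[]{x_.data()};
    void *outputs[]{y_.data()};
    // no workspace is required, so nullptr is passed
    routine(res, nullptr, inputs, outputs);
    for (auto i : range0_(x_.size())) {
        EXPECT_EQ(y_[i], static_cast<float>(x_[i]));
    }
}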