refactor(kernel): replace ForeignBlob with Device::Blob in kernel/test
Signed-off-by: YdrMaster <ydrml@hotmail.com>
YdrMaster committed Nov 27, 2023
1 parent 612cfd6 commit f652e81
Showing 25 changed files with 292 additions and 258 deletions.
27 changes: 22 additions & 5 deletions src/02hardware/include/hardware/device.h
@@ -18,17 +18,34 @@ namespace refactor::hardware {
decltype(_mem));

public:
class Blob {
friend class Device;

Device &_device;
void *_ptr;

Blob(decltype(_device) device, size_t);

public:
~Blob();
void copyFromHost(void const *, size_t) const;
void copyToHost(void *, size_t) const;
void copyFrom(Blob const &, size_t) const;
void copyTo(Blob const &, size_t) const;

constexpr void *get() const noexcept { return _ptr; }
constexpr operator void *() const noexcept { return get(); }
constexpr operator bool() const noexcept { return get(); }
};
friend class Blob;

virtual ~Device() = default;

constexpr int32_t typeId() const noexcept { return _typeId; }
constexpr int32_t cardId() const noexcept { return _cardId; }
constexpr std::string_view deviceTypeName() const noexcept { return _deviceTypeName; }

void *malloc(size_t);
void free(void *);
void *copyHD(void *dst, void const *src, size_t bytes) const;
void *copyDH(void *dst, void const *src, size_t bytes) const;
void *copyDD(void *dst, void const *src, size_t bytes) const;
Arc<Blob> malloc(size_t);

using Builder = Arc<Device> (*)(decltype(_deviceTypeName),
decltype(_typeId),
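The header now exposes allocation as an RAII handle: a Blob frees its device memory when the last Arc is dropped and carries its own host/device transfer methods. A minimal usage sketch, assuming the declarations above and the "nvidia" registration used in the updated tests later in this commit (buffer size and contents are illustrative):

```cpp
#include "hardware/device.h"
#include "hardware/devices/nvidia.h"
#include <vector>

using namespace refactor::hardware;

int main() {
    // Register and initialize the device, as the updated tests below do.
    Device::register_<Nvidia>("nvidia");
    auto device = Device::init("nvidia", 0, "");

    std::vector<float> host(1024, 1.0f);
    auto bytes = host.size() * sizeof(float);

    // malloc now returns Arc<Device::Blob>; the memory is freed when the last Arc drops.
    auto blob = device->malloc(bytes);
    blob->copyFromHost(host.data(), bytes);

    std::vector<float> back(host.size());
    blob->copyToHost(back.data(), bytes);

    // The implicit conversion yields the raw device pointer for kernel argument arrays.
    void *raw = *blob;
    (void) raw;
    return 0;
}
```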
41 changes: 27 additions & 14 deletions src/02hardware/src/device.cc
@@ -3,6 +3,31 @@

namespace refactor::hardware {

Device::Blob::Blob(decltype(_device) device, size_t size)
: _device(device), _ptr(device._mem->malloc(size)) {}

Device::Blob::~Blob() {
_device._mem->free(std::exchange(_ptr, nullptr));
}
void Device::Blob::copyFromHost(void const *ptr, size_t size) const {
_device._mem->copyHD(_ptr, ptr, size);
}
void Device::Blob::copyToHost(void *ptr, size_t size) const {
_device._mem->copyDH(ptr, _ptr, size);
}
void Device::Blob::copyFrom(Blob const &rhs, size_t size) const {
if (_device._mem == rhs._device._mem) {
_device._mem->copyDD(_ptr, rhs._ptr, size);
} else {
std::vector<uint8_t> tmp(size);
rhs.copyToHost(tmp.data(), size);
copyFromHost(tmp.data(), size);
}
}
void Device::Blob::copyTo(Blob const &rhs, size_t size) const {
rhs.copyFrom(*this, size);
}

Device::Device(decltype(_deviceTypeName) deviceTypeName,
decltype(_typeId) typeId,
decltype(_cardId) cardId,
@@ -12,20 +37,8 @@ namespace refactor::hardware {
_cardId(cardId),
_mem(std::move(mem)) {}

void *Device::malloc(size_t size) {
return _mem->malloc(size);
}
void Device::free(void *ptr) {
_mem->free(ptr);
}
void *Device::copyHD(void *dst, void const *src, size_t bytes) const {
return _mem->copyHD(dst, src, bytes);
}
void *Device::copyDH(void *dst, void const *src, size_t bytes) const {
return _mem->copyDH(dst, src, bytes);
}
void *Device::copyDD(void *dst, void const *src, size_t bytes) const {
return _mem->copyDD(dst, src, bytes);
auto Device::malloc(size_t size) -> Arc<Blob> {
return Arc<Blob>(new Blob(*this, size));
}

struct DeviceType {
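Device::Blob::copyFrom dispatches on the two blobs' memory managers: the same manager gives a direct device-to-device copy, while different managers stage the data through a temporary host buffer. A sketch of both paths under that implementation; the second card and its separate memory manager are assumptions for illustration only:

```cpp
#include "hardware/device.h"
#include "hardware/devices/nvidia.h"

using namespace refactor::hardware;

int main() {
    Device::register_<Nvidia>("nvidia");
    auto dev = Device::init("nvidia", 0, "");
    constexpr size_t bytes = 256;

    auto src = dev->malloc(bytes);
    auto dst = dev->malloc(bytes);
    dst->copyFrom(*src, bytes);// same memory manager: direct copyDD on the device

    // Assumption for illustration: a second card with its own memory manager exists.
    auto other = Device::init("nvidia", 1, "");
    auto remote = other->malloc(bytes);
    remote->copyFrom(*src, bytes);// different managers: staged through a host buffer
    return 0;
}
```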
2 changes: 1 addition & 1 deletion src/02hardware/src/foreign_blob.cc
@@ -24,7 +24,7 @@ namespace refactor::hardware {
}
void ForeignBlob::copyFrom(ForeignBlob const &src, size_t bytes) {
if (_device == src._device) {
_device->copyDD(_ptr, src._ptr, bytes);
_memManager->copyDD(_ptr, src._ptr, bytes);
} else {
std::vector<uint8_t> tmp(bytes);
src.copyOut(tmp.data(), bytes);
1 change: 0 additions & 1 deletion src/04kernel/src/kernels/conv/cudnn_kernel.cu
@@ -68,7 +68,6 @@ namespace refactor::kernel {
// nvcc at c++11 doesn't support real move capture
auto routine = [d_ = std::move(d),
workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
using hardware::ForeignBlob;
auto const &d = *d_;
// fetch cudnn handle from resources
auto handle = res.fetchOrStore<CudnnContext>()->handle;
27 changes: 12 additions & 15 deletions src/04kernel/test/kernels/concat/test_cuda.cpp
@@ -2,9 +2,7 @@

#include "../../../src/kernels/concat/cpu_kernel.hh"
#include "../../../src/kernels/concat/cuda_kernel.hh"
#include "hardware/device.h"
#include "hardware/devices/nvidia.h"
#include "kernel/target.h"
#include <gtest/gtest.h>
#include <numeric>

@@ -35,16 +33,15 @@ TEST(kernel, ConcatCuda) {
auto routine = kernel->lower(res).routine;
// malloc
Device::register_<Nvidia>("nvidia");
auto device = hardware::Device::init("nvidia", 0, "");
auto memManager = Target(Target::NvidiaGpu).memManager();
Arc<hardware::ForeignBlob>
auto device = Device::init("nvidia", 0, "");
Arc<Device::Blob>
gpuIns[]{
hardware::ForeignBlob::share(memManager, inputTensors[0]->bytesSize()),
hardware::ForeignBlob::share(memManager, inputTensors[1]->bytesSize()),
hardware::ForeignBlob::share(memManager, inputTensors[2]->bytesSize()),
hardware::ForeignBlob::share(memManager, inputTensors[3]->bytesSize()),
device->malloc(inputTensors[0]->bytesSize()),
device->malloc(inputTensors[1]->bytesSize()),
device->malloc(inputTensors[2]->bytesSize()),
device->malloc(inputTensors[3]->bytesSize()),
},
gpuOut = hardware::ForeignBlob::share(memManager, result->bytesSize());
gpuOut = device->malloc(result->bytesSize());
// put input data
std::vector<float>
cpuIns[]{
@@ -59,10 +56,10 @@
std::iota(cpuIns[1].begin(), cpuIns[1].end(), 0);
std::iota(cpuIns[2].begin(), cpuIns[2].end(), 0);
std::iota(cpuIns[3].begin(), cpuIns[3].end(), 0);
gpuIns[0]->copyIn(cpuIns[0].data(), inputTensors[0]->bytesSize());
gpuIns[1]->copyIn(cpuIns[1].data(), inputTensors[1]->bytesSize());
gpuIns[2]->copyIn(cpuIns[2].data(), inputTensors[2]->bytesSize());
gpuIns[3]->copyIn(cpuIns[3].data(), inputTensors[3]->bytesSize());
gpuIns[0]->copyFromHost(cpuIns[0].data(), inputTensors[0]->bytesSize());
gpuIns[1]->copyFromHost(cpuIns[1].data(), inputTensors[1]->bytesSize());
gpuIns[2]->copyFromHost(cpuIns[2].data(), inputTensors[2]->bytesSize());
gpuIns[3]->copyFromHost(cpuIns[3].data(), inputTensors[3]->bytesSize());
// inference
{
void const *inputs[]{*gpuIns[0], *gpuIns[1], *gpuIns[2], *gpuIns[3]};
@@ -75,7 +72,7 @@
rCpu(res, nullptr, inputs, outputs);
}
// check
gpuOut->copyOut(out.data(), result->bytesSize());
gpuOut->copyToHost(out.data(), result->bytesSize());
EXPECT_EQ(out, cpuOut);
}

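These tests pass blobs straight into routine argument arrays: dereferencing the Arc yields a Device::Blob, whose implicit conversion supplies the raw device pointer. A self-contained sketch of that pattern with a stand-in routine (fakeRoutine is hypothetical; the real routines are the lowered kernels used in these tests and also take Resources and a workspace pointer):

```cpp
#include "hardware/device.h"
#include "hardware/devices/nvidia.h"

using namespace refactor::hardware;

// Stand-in for a lowered routine; only the pointer arrays matter for this sketch.
static void fakeRoutine(void const *const *inputs, void *const *outputs) {
    (void) inputs;
    (void) outputs;
}

int main() {
    Device::register_<Nvidia>("nvidia");
    auto device = Device::init("nvidia", 0, "");
    auto in0 = device->malloc(64), in1 = device->malloc(64), out = device->malloc(128);

    // *in0 is a Device::Blob; its constexpr operator void * supplies the device pointer.
    void const *inputs[]{*in0, *in1};
    void *outputs[]{*out};
    fakeRoutine(inputs, outputs);
    return 0;
}
```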
16 changes: 8 additions & 8 deletions src/04kernel/test/kernels/expand/test_cuda.cpp
@@ -2,13 +2,13 @@

#include "../../../src/kernels/expand/cpu_kernel.hh"
#include "../../../src/kernels/expand/cuda_kernel.hh"
#include "kernel/target.h"
#include "runtime/mem_manager.hh"
#include "hardware/devices/nvidia.h"
#include <gtest/gtest.h>
#include <numeric>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, ExpandCuda) {
// build routine
@@ -22,17 +22,17 @@
auto routine = kernel->lower(res).routine;
auto rCpu = kCpu->lower(res).routine;
// malloc
auto memManager = Target(Target::NvidiaGpu).memManager();
Arc<hardware::ForeignBlob>
gpuIn = hardware::ForeignBlob::share(memManager, input->bytesSize()),
gpuOut = hardware::ForeignBlob::share(memManager, output->bytesSize());
Device::register_<Nvidia>("nvidia");
auto device = Device::init("nvidia", 0, "");
auto gpuIn = device->malloc(input->bytesSize()),
gpuOut = device->malloc(output->bytesSize());
// put input data
std::vector<float>
data(input->elementsSize()),
ans(output->elementsSize()),
result(ans.size());
std::iota(data.begin(), data.end(), 0);
gpuIn->copyIn(data.data(), input->bytesSize());
gpuIn->copyFromHost(data.data(), input->bytesSize());
// inference
{
void const *inputs[]{*gpuIn};
@@ -45,7 +45,7 @@
rCpu(res, nullptr, inputs, outputs);
}
// check
gpuOut->copyOut(result.data(), output->bytesSize());
gpuOut->copyToHost(result.data(), output->bytesSize());
EXPECT_EQ(result, ans);
}

29 changes: 16 additions & 13 deletions src/04kernel/test/kernels/gather/test_gather_cuda.cpp
@@ -2,13 +2,16 @@

#include "../src/kernels/gather/cpu_kernel.hh"
#include "../src/kernels/gather/cuda_kernel.hh"
#include "kernel/target.h"
#include "hardware/devices/nvidia.h"
#include <gtest/gtest.h>

using namespace refactor;
using namespace kernel;
using namespace hardware;

TEST(kernel, GatherCuda) {
Device::register_<Nvidia>("nvidia");
auto device = Device::init("nvidia", 0, "");
// Case axis = 0, indexType= int64
{
// Create Tensor and build kernels
@@ -26,11 +29,11 @@
std::vector<float> a{1.0, 1.2, 2.3, 3.4, 4.5, 5.7};
std::vector<int64_t> b{0, 1, 1, 2};
std::vector<float> c(output->elementsSize());
auto aGPU = hardware::ForeignBlob::share(Target(Target::NvidiaGpu).memManager(), data->bytesSize());
auto bGPU = hardware::ForeignBlob::share(Target(Target::NvidiaGpu).memManager(), indices->bytesSize());
auto cGPU = hardware::ForeignBlob::share(Target(Target::NvidiaGpu).memManager(), output->bytesSize());
aGPU->copyIn(a.data(), data->bytesSize());
bGPU->copyIn(b.data(), indices->bytesSize());
auto aGPU = device->malloc(data->bytesSize()),
bGPU = device->malloc(indices->bytesSize()),
cGPU = device->malloc(output->bytesSize());
aGPU->copyFromHost(a.data(), data->bytesSize());
bGPU->copyFromHost(b.data(), indices->bytesSize());
// Compute
{
void const *inputs[]{*aGPU, *bGPU};
@@ -44,7 +47,7 @@
}
// Compare
std::vector<float> result(output->elementsSize());
cGPU->copyOut(result.data(), output->bytesSize());
cGPU->copyToHost(result.data(), output->bytesSize());
for (auto i : range0_(c.size())) {
EXPECT_FLOAT_EQ(c[i], result[i]);
}
@@ -67,11 +70,11 @@
std::vector<float> a{1.0, 1.2, 1.9, 2.3, 3.4, 3.9, 4.5, 5.7, 5.9};
std::vector<int> b{0, 2};
std::vector<float> c(output->elementsSize());
auto aGPU = hardware::ForeignBlob::share(Target(Target::NvidiaGpu).memManager(), data->bytesSize());
auto bGPU = hardware::ForeignBlob::share(Target(Target::NvidiaGpu).memManager(), indices->bytesSize());
auto cGPU = hardware::ForeignBlob::share(Target(Target::NvidiaGpu).memManager(), output->bytesSize());
aGPU->copyIn(a.data(), data->bytesSize());
bGPU->copyIn(b.data(), indices->bytesSize());
auto aGPU = device->malloc(data->bytesSize()),
bGPU = device->malloc(indices->bytesSize()),
cGPU = device->malloc(output->bytesSize());
aGPU->copyFromHost(a.data(), data->bytesSize());
bGPU->copyFromHost(b.data(), indices->bytesSize());
// Compute
{
void const *inputs[]{*aGPU, *bGPU};
@@ -85,7 +88,7 @@
}
// Compare
std::vector<float> result(output->elementsSize());
cGPU->copyOut(result.data(), output->bytesSize());
cGPU->copyToHost(result.data(), output->bytesSize());
for (auto i : range0_(c.size())) {
EXPECT_FLOAT_EQ(c[i], result[i]);
}