Skip to content

Commit

Permalink
refactor(kernel): 现在所有 kernel 使用静态分配的工作空间
Browse files Browse the repository at this point in the history
Signed-off-by: YdrMaster <ydrml@hotmail.com>
  • Loading branch information
YdrMaster committed Nov 21, 2023
1 parent b3072d3 commit de46294
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 59 deletions.
17 changes: 9 additions & 8 deletions src/04kernel/src/kernels/concat/cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,23 @@ namespace refactor::kernel {

auto ConcatCuda::lower(Resources &) const noexcept -> RoutineWorkspace {
    // Shrink the per-element copy width by the largest common submultiple
    // (capped at 16 bytes) so each thread moves wider chunks.
    auto sub = std::min(info.submultiple(), 16u);
    // The workspace holds one device-side copy of the host `inputs` pointer
    // array; it is sized here and allocated by the caller, replacing the
    // per-invocation ForeignBlob allocation.
    auto workspaceSize = info.segments.size() * sizeof(void *);
    auto routine = [params = cuda::ThreadsDistributer()(info.blockCount * info.sum / sub),
                    segments = thrust::device_vector<dim_t>(info.segments.begin(), info.segments.end()),
                    workspaceSize,
                    sum = info.sum / sub,
                    sub](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
        // Stage the input pointer table into the pre-allocated workspace.
        // NOTE(review): return value of cudaMemcpy is ignored here, matching
        // the surrounding kernels' style — consider a CUDA_ASSERT wrapper.
        cudaMemcpy(workspace, inputs, workspaceSize, cudaMemcpyHostToDevice);
        cuda::launchConcat(
            params,
            reinterpret_cast<void const **>(workspace),
            segments.data().get(),
            outputs[0],
            segments.size(),
            sum,
            sub);
    };
    return RoutineWorkspace(std::move(routine), workspaceSize);
}

}// namespace refactor::kernel
16 changes: 8 additions & 8 deletions src/04kernel/src/kernels/conv/cudnn_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,9 @@ namespace refactor::kernel {
cudnnFilterDescriptor_t w;
cudnnConvolutionDescriptor_t conv;
cudnnConvolutionFwdAlgo_t algo;
size_t workspaceSize;
bool f64;

Descriptors() : workspaceSize(0) {
Descriptors(bool f64_) : f64(f64_) {
CUDNN_ASSERT(cudnnCreateTensorDescriptor(&x));
CUDNN_ASSERT(cudnnCreateTensorDescriptor(&y));
CUDNN_ASSERT(cudnnCreateFilterDescriptor(&w));
Expand All @@ -33,8 +32,7 @@ namespace refactor::kernel {
Descriptors(const Descriptors &) = delete;
Descriptors(Descriptors &&) = delete;
};
auto d = std::make_shared<Descriptors>();
d->f64 = info.dt == DataType::F64;
auto d = std::make_shared<Descriptors>(info.dt == DataType::F64);

auto cudnnDataType = cudnnDataTypeConvert(info.dt);
auto xs = info.xShape, ys = info.yShape, ws = info.wShape;
Expand All @@ -59,20 +57,21 @@ namespace refactor::kernel {
// for high accuracy, use this algo only
// d->algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
size_t workspaceSize;
{
CUDNN_ASSERT(cudnnGetConvolutionForwardWorkspaceSize(
handle,
d->x, d->w, d->conv, d->y,
d->algo,
&d->workspaceSize));
&workspaceSize));
}
// nvcc at c++11 doesn't support real move capture
return [d_ = std::move(d)](Resources &res, void *workspace_, void const *const *inputs, void *const *outputs) {
auto routine = [d_ = std::move(d),
workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
using mem_manager::ForeignBlob;
auto const &d = *d_;
// fetch cudnn handle from resources
auto handle = res.fetchOrStore<CudnnContext>()->handle;
auto workspace = ForeignBlob::share(res.fetch<MemManager>()->manager, d.workspaceSize);
// build alpha/beta for double
union {
float f32[2];
Expand All @@ -96,10 +95,11 @@ namespace refactor::kernel {
d.x, inputs[0],
d.w, inputs[1],
d.conv, d.algo,
*workspace, d.workspaceSize,
workspace, workspaceSize,
beta,
d.y, outputs[0]));
};
return {std::move(routine), workspaceSize};
}

}// namespace refactor::kernel
42 changes: 21 additions & 21 deletions src/04kernel/src/kernels/reduce/cudnn_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include "../../utilities/cuda/cudnn_functions.h"
#include "common.h"
#include "cudnn_kernel.hh"
#include "runtime/mem_manager.hh"
#include "mem_manager/functions.h"

namespace refactor::kernel {
using namespace cudnn;
Expand All @@ -13,12 +13,9 @@ namespace refactor::kernel {
struct Descriptors {
cudnnTensorDescriptor_t inDesc;
cudnnTensorDescriptor_t outDesc;

cudnnReduceTensorDescriptor_t reduceDesc;
size_t workspaceSize;
size_t idxWorkspaceSize;

Descriptors() : workspaceSize(0), idxWorkspaceSize(0) {
Descriptors() {
CUDNN_ASSERT(cudnnCreateTensorDescriptor(&inDesc));
CUDNN_ASSERT(cudnnCreateTensorDescriptor(&outDesc));
CUDNN_ASSERT(cudnnCreateReduceTensorDescriptor(&reduceDesc));
Expand Down Expand Up @@ -122,36 +119,39 @@ namespace refactor::kernel {
CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES,
CUDNN_32BIT_INDICES));

// get workspace
CUDNN_ASSERT(
cudnnGetReductionWorkspaceSize(handler, d->reduceDesc,
d->inDesc, d->outDesc, &d->workspaceSize));

size_t idxWorkspaceSize, workspaceSize;
// get index workspace
CUDNN_ASSERT(
cudnnGetReductionIndicesSize(handler, d->reduceDesc,
d->inDesc, d->outDesc, &d->idxWorkspaceSize));

d->inDesc, d->outDesc, &idxWorkspaceSize));
// get workspace
CUDNN_ASSERT(
cudnnGetReductionWorkspaceSize(handler, d->reduceDesc,
d->inDesc, d->outDesc, &workspaceSize));
idxWorkspaceSize = mem_manager::alignBytes(idxWorkspaceSize, 256);

// nvcc at c++11 doesn't support real move capture
return [d_ = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
using mem_manager::ForeignBlob;
auto routine = [d_ = std::move(d),
idxWorkspaceSize,
workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
// fetch cudnn handle from resources
auto handle = res.fetchOrStore<CudnnContext>()->handle;
auto const &d = *d_;
auto wsData = ForeignBlob::share(res.fetch<runtime::MemManager>()->manager, d.workspaceSize);
auto idxWsData = ForeignBlob::share(res.fetch<runtime::MemManager>()->manager, d.idxWorkspaceSize);

// name inputs and outputs
auto inData = inputs[0];
auto outData = outputs[0];
// reduce
float alpha = 1.f, beta = 0.f;
CUDNN_ASSERT(cudnnReduceTensor(handle, d.reduceDesc,
*idxWsData, d.idxWorkspaceSize, *wsData,
d.workspaceSize, &alpha, d.inDesc, inData,
&beta, d.outDesc, outData));
void *idxWorkspace = workspace,
*dataWorkspace = reinterpret_cast<uint8_t *>(workspace) + idxWorkspaceSize;
CUDNN_ASSERT(cudnnReduceTensor(
handle, d.reduceDesc,
idxWorkspace, idxWorkspaceSize,
dataWorkspace, workspaceSize,
&alpha, d.inDesc, inData,
&beta, d.outDesc, outData));
};
return RoutineWorkspace(std::move(routine), idxWorkspaceSize + workspaceSize);
}

}// namespace refactor::kernel
17 changes: 9 additions & 8 deletions src/04kernel/src/kernels/split/cuda_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,23 @@ namespace refactor::kernel {

auto SplitCuda::lower(Resources &) const noexcept -> RoutineWorkspace {
    // Shrink the per-element copy width by the largest common submultiple
    // (capped at 16 bytes) so each thread moves wider chunks.
    auto sub = std::min(info.submultiple(), 16u);
    // The workspace holds one device-side copy of the host `outputs` pointer
    // array; it is sized here and allocated by the caller, replacing the
    // per-invocation ForeignBlob allocation.
    auto workspaceSize = info.segments.size() * sizeof(void *);
    auto routine = [params = cuda::ThreadsDistributer()(info.blockCount * info.sum / sub),
                    segments = thrust::device_vector<dim_t>(info.segments.begin(), info.segments.end()),
                    workspaceSize,
                    sum = info.sum / sub,
                    sub](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) {
        // Stage the output pointer table into the pre-allocated workspace.
        // NOTE(review): return value of cudaMemcpy is ignored here, matching
        // the surrounding kernels' style — consider a CUDA_ASSERT wrapper.
        cudaMemcpy(workspace, outputs, workspaceSize, cudaMemcpyHostToDevice);
        cuda::launchSplit(
            params,
            inputs[0],
            segments.data().get(),
            reinterpret_cast<void **>(workspace),
            segments.size(),
            sum,
            sub);
    };
    return RoutineWorkspace(std::move(routine), workspaceSize);
}

}// namespace refactor::kernel
5 changes: 3 additions & 2 deletions src/04kernel/test/kernels/concat/test_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ TEST(kernel, ConcatCuda) {
ASSERT_TRUE(kCpu && kernel);
auto res = runtime::Resources();
auto rCpu = kCpu->lower(res).routine;
auto routine = kernel->lower(res).routine;
auto [routine, workspaceSize] = kernel->lower(res);
// malloc
res.fetchOrStore<runtime::MemManager>(Target(Target::NvidiaGpu).memManager());
auto memManager = res.fetch<runtime::MemManager>()->manager;
Arc<mem_manager::ForeignBlob>
workspace = mem_manager::ForeignBlob::share(memManager, workspaceSize),
gpuIns[]{
mem_manager::ForeignBlob::share(memManager, inputTensors[0]->bytesSize()),
mem_manager::ForeignBlob::share(memManager, inputTensors[1]->bytesSize()),
Expand Down Expand Up @@ -64,7 +65,7 @@ TEST(kernel, ConcatCuda) {
{
void const *inputs[]{*gpuIns[0], *gpuIns[1], *gpuIns[2], *gpuIns[3]};
void *outputs[]{*gpuOut};
routine(res, nullptr, inputs, outputs);
routine(res, *workspace, inputs, outputs);
}
{
void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()};
Expand Down
20 changes: 10 additions & 10 deletions src/04kernel/test/kernels/reduce/test_cudnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,20 @@ static void testReducemean(const Shape &shape, const std::vector<float> &data,
ASSERT_TRUE(kernel);
auto res = runtime::Resources();
res.fetchOrStore<runtime::MemManager>(Target(Target::NvidiaGpu).memManager());
auto routine = kernel->lower(res).routine;
auto [routine, workspaceSize] = kernel->lower(res);
// cuda malloc
auto gpuMemIn = mem_manager::ForeignBlob::share(
Target(Target::NvidiaGpu).memManager(),
dataTensor->bytesSize());
auto gpuMemOut = mem_manager::ForeignBlob::share(
Target(Target::NvidiaGpu).memManager(),
dataTensor->bytesSize());
auto manager = Target(Target::NvidiaGpu).memManager();
auto workspace = mem_manager::ForeignBlob::share(manager, workspaceSize);
auto gpuMemIn = mem_manager::ForeignBlob::share(manager, dataTensor->bytesSize());
auto gpuMemOut = mem_manager::ForeignBlob::share(manager, dataTensor->bytesSize());
// put input output data
gpuMemIn->copyIn(data.data(), dataTensor->bytesSize());
void const *inputs[]{*gpuMemIn};
void *outputs[]{*gpuMemOut};
// inference
routine(res, nullptr, inputs, outputs);
{
void const *inputs[]{*gpuMemIn};
void *outputs[]{*gpuMemOut};
routine(res, *workspace, inputs, outputs);
}
// take output data
Shape outDimArray;
std::unordered_set axesSet(axes.begin(), axes.end());
Expand Down
5 changes: 3 additions & 2 deletions src/04kernel/test/kernels/split/test_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ TEST(kernel, SplitCuda) {
ASSERT_TRUE(kCpu && kernel);
auto res = runtime::Resources();
auto rCpu = kCpu->lower(res).routine;
auto routine = kernel->lower(res).routine;
auto [routine, workspaceSize] = kernel->lower(res);
// malloc
res.fetchOrStore<runtime::MemManager>(Target(Target::NvidiaGpu).memManager());
auto memManager = res.fetch<runtime::MemManager>()->manager;
Arc<mem_manager::ForeignBlob>
workspace = mem_manager::ForeignBlob::share(memManager, workspaceSize),
gpuIn = mem_manager::ForeignBlob::share(memManager, dataTensor->bytesSize()),
gpuOuts[]{
mem_manager::ForeignBlob::share(memManager, outputTensors[0]->bytesSize()),
Expand Down Expand Up @@ -63,7 +64,7 @@ TEST(kernel, SplitCuda) {
{
void const *inputs[]{*gpuIn};
void *outputs[]{*gpuOuts[0], *gpuOuts[1], *gpuOuts[2], *gpuOuts[3]};
routine(res, nullptr, inputs, outputs);
routine(res, *workspace, inputs, outputs);
}
{
void const *inputs[]{data.data()};
Expand Down

0 comments on commit de46294

Please sign in to comment.