From de46294f8e2b2055b55ca225c83e0fba98a2f5ed Mon Sep 17 00:00:00 2001 From: YdrMaster Date: Tue, 21 Nov 2023 15:11:41 +0800 Subject: [PATCH] =?UTF-8?q?refactor(kernel):=20=E7=8E=B0=E5=9C=A8=E6=89=80?= =?UTF-8?q?=E6=9C=89=20kernel=20=E4=BD=BF=E7=94=A8=E9=9D=99=E6=80=81?= =?UTF-8?q?=E5=88=86=E9=85=8D=E7=9A=84=E5=B7=A5=E4=BD=9C=E7=A9=BA=E9=97=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: YdrMaster --- .../src/kernels/concat/cuda_kernel.cu | 17 ++++---- src/04kernel/src/kernels/conv/cudnn_kernel.cu | 16 +++---- .../src/kernels/reduce/cudnn_kernel.cu | 42 +++++++++---------- src/04kernel/src/kernels/split/cuda_kernel.cu | 17 ++++---- .../test/kernels/concat/test_cuda.cpp | 5 ++- .../test/kernels/reduce/test_cudnn.cpp | 20 ++++----- src/04kernel/test/kernels/split/test_cuda.cpp | 5 ++- 7 files changed, 63 insertions(+), 59 deletions(-) diff --git a/src/04kernel/src/kernels/concat/cuda_kernel.cu b/src/04kernel/src/kernels/concat/cuda_kernel.cu index ba472ab9..00c52a5e 100644 --- a/src/04kernel/src/kernels/concat/cuda_kernel.cu +++ b/src/04kernel/src/kernels/concat/cuda_kernel.cu @@ -9,22 +9,23 @@ namespace refactor::kernel { auto ConcatCuda::lower(Resources &) const noexcept -> RoutineWorkspace { auto sub = std::min(info.submultiple(), 16u); - return [segments = thrust::device_vector(info.segments.begin(), info.segments.end()), - params = cuda::ThreadsDistributer()(info.blockCount * info.sum / sub), - sum = info.sum / sub, - sub](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - auto size = segments.size() * sizeof(void *); - auto inputs_ = mem_manager::ForeignBlob::share(res.fetch()->manager, size); - inputs_->copyIn(inputs, size); + auto workspaceSize = info.segments.size() * sizeof(void *); + auto routine = [params = cuda::ThreadsDistributer()(info.blockCount * info.sum / sub), + segments = thrust::device_vector(info.segments.begin(), info.segments.end()), + workspaceSize, + sum = info.sum / sub, + sub](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + cudaMemcpy(workspace, inputs, workspaceSize, cudaMemcpyHostToDevice); cuda::launchConcat( params, - reinterpret_cast((void *) *inputs_), + reinterpret_cast(workspace), segments.data().get(), outputs[0], segments.size(), sum, sub); }; + return RoutineWorkspace(std::move(routine), workspaceSize); } }// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/conv/cudnn_kernel.cu b/src/04kernel/src/kernels/conv/cudnn_kernel.cu index 67e934ea..ae7b8edc 100644 --- a/src/04kernel/src/kernels/conv/cudnn_kernel.cu +++ b/src/04kernel/src/kernels/conv/cudnn_kernel.cu @@ -14,10 +14,9 @@ namespace refactor::kernel { cudnnFilterDescriptor_t w; cudnnConvolutionDescriptor_t conv; cudnnConvolutionFwdAlgo_t algo; - size_t workspaceSize; bool f64; - Descriptors() : workspaceSize(0) { + Descriptors(bool f64_) : f64(f64_) { CUDNN_ASSERT(cudnnCreateTensorDescriptor(&x)); CUDNN_ASSERT(cudnnCreateTensorDescriptor(&y)); CUDNN_ASSERT(cudnnCreateFilterDescriptor(&w)); @@ -33,8 +32,7 @@ namespace refactor::kernel { Descriptors(const Descriptors &) = delete; Descriptors(Descriptors &&) = delete; }; - auto d = std::make_shared(); - d->f64 = info.dt == DataType::F64; + auto d = std::make_shared(info.dt == DataType::F64); auto cudnnDataType = cudnnDataTypeConvert(info.dt); auto xs = info.xShape, ys = info.yShape, ws = info.wShape; @@ -59,20 +57,21 @@ namespace refactor::kernel { // for high accuracy, use this algo only // d->algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } + size_t workspaceSize; { CUDNN_ASSERT(cudnnGetConvolutionForwardWorkspaceSize( handle, d->x, d->w, d->conv, d->y, d->algo, - &d->workspaceSize)); + &workspaceSize)); } // nvcc at c++11 doesn't support real move capture - return [d_ = std::move(d)](Resources &res, void *workspace_, void const *const *inputs, void *const *outputs) { + auto routine = [d_ = std::move(d), + workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { using mem_manager::ForeignBlob; auto const &d = *d_; // fetch cudnn handle from resources auto handle = res.fetchOrStore()->handle; - auto workspace = ForeignBlob::share(res.fetch()->manager, d.workspaceSize); // build alpha/beta for double union { float f32[2]; @@ -96,10 +95,11 @@ namespace refactor::kernel { d.x, inputs[0], d.w, inputs[1], d.conv, d.algo, - *workspace, d.workspaceSize, + workspace, workspaceSize, beta, d.y, outputs[0])); }; + return {std::move(routine), workspaceSize}; } }// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/reduce/cudnn_kernel.cu b/src/04kernel/src/kernels/reduce/cudnn_kernel.cu index 7a616eef..9d1e3fcd 100644 --- a/src/04kernel/src/kernels/reduce/cudnn_kernel.cu +++ b/src/04kernel/src/kernels/reduce/cudnn_kernel.cu @@ -2,7 +2,7 @@ #include "../../utilities/cuda/cudnn_functions.h" #include "common.h" #include "cudnn_kernel.hh" -#include "runtime/mem_manager.hh" +#include "mem_manager/functions.h" namespace refactor::kernel { using namespace cudnn; @@ -13,12 +13,9 @@ namespace refactor::kernel { struct Descriptors { cudnnTensorDescriptor_t inDesc; cudnnTensorDescriptor_t outDesc; - cudnnReduceTensorDescriptor_t reduceDesc; - size_t workspaceSize; - size_t idxWorkspaceSize; - Descriptors() : workspaceSize(0), idxWorkspaceSize(0) { + Descriptors() { CUDNN_ASSERT(cudnnCreateTensorDescriptor(&inDesc)); CUDNN_ASSERT(cudnnCreateTensorDescriptor(&outDesc)); CUDNN_ASSERT(cudnnCreateReduceTensorDescriptor(&reduceDesc)); @@ -122,36 +119,39 @@ namespace refactor::kernel { CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); - // get workspace - CUDNN_ASSERT( - cudnnGetReductionWorkspaceSize(handler, d->reduceDesc, - d->inDesc, d->outDesc, &d->workspaceSize)); - + size_t idxWorkspaceSize, workspaceSize; // get index workspace CUDNN_ASSERT( cudnnGetReductionIndicesSize(handler, d->reduceDesc, - d->inDesc, d->outDesc, &d->idxWorkspaceSize)); - + d->inDesc, d->outDesc, &idxWorkspaceSize)); + // get workspace + CUDNN_ASSERT( + cudnnGetReductionWorkspaceSize(handler, d->reduceDesc, + d->inDesc, d->outDesc, &workspaceSize)); + idxWorkspaceSize = mem_manager::alignBytes(idxWorkspaceSize, 256); // nvcc at c++11 doesn't support real move capture - return [d_ = std::move(d)](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - using mem_manager::ForeignBlob; + auto routine = [d_ = std::move(d), + idxWorkspaceSize, + workspaceSize](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { // fetch cudnn handle from resources auto handle = res.fetchOrStore()->handle; auto const &d = *d_; - auto wsData = ForeignBlob::share(res.fetch()->manager, d.workspaceSize); - auto idxWsData = ForeignBlob::share(res.fetch()->manager, d.idxWorkspaceSize); - // name inputs and outputs auto inData = inputs[0]; auto outData = outputs[0]; // reduce float alpha = 1.f, beta = 0.f; - CUDNN_ASSERT(cudnnReduceTensor(handle, d.reduceDesc, - *idxWsData, d.idxWorkspaceSize, *wsData, - d.workspaceSize, &alpha, d.inDesc, inData, - &beta, d.outDesc, outData)); + void *idxWorkspace = workspace, + *dataWorkspace = reinterpret_cast(workspace) + idxWorkspaceSize; + CUDNN_ASSERT(cudnnReduceTensor( + handle, d.reduceDesc, + idxWorkspace, idxWorkspaceSize, + dataWorkspace, workspaceSize, + &alpha, d.inDesc, inData, + &beta, d.outDesc, outData)); }; + return RoutineWorkspace(std::move(routine), idxWorkspaceSize + workspaceSize); } }// namespace refactor::kernel diff --git a/src/04kernel/src/kernels/split/cuda_kernel.cu b/src/04kernel/src/kernels/split/cuda_kernel.cu index 6723b503..9ba5e92a 100644 --- a/src/04kernel/src/kernels/split/cuda_kernel.cu +++ b/src/04kernel/src/kernels/split/cuda_kernel.cu @@ -9,22 +9,23 @@ namespace refactor::kernel { auto SplitCuda::lower(Resources &) const noexcept -> RoutineWorkspace { auto sub = std::min(info.submultiple(), 16u); - return [segments = thrust::device_vector(info.segments.begin(), info.segments.end()), - params = cuda::ThreadsDistributer()(info.blockCount * info.sum / sub), - sum = info.sum / sub, - sub](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { - auto size = segments.size() * sizeof(void *); - auto outputs_ = mem_manager::ForeignBlob::share(res.fetch()->manager, size); - outputs_->copyIn(outputs, size); + auto workspaceSize = info.segments.size() * sizeof(void *); + auto routine = [params = cuda::ThreadsDistributer()(info.blockCount * info.sum / sub), + segments = thrust::device_vector(info.segments.begin(), info.segments.end()), + workspaceSize, + sum = info.sum / sub, + sub](Resources &res, void *workspace, void const *const *inputs, void *const *outputs) { + cudaMemcpy(workspace, outputs, workspaceSize, cudaMemcpyHostToDevice); cuda::launchSplit( params, inputs[0], segments.data().get(), - reinterpret_cast((void *) *outputs_), + reinterpret_cast(workspace), segments.size(), sum, sub); }; + return RoutineWorkspace(std::move(routine), workspaceSize); } }// namespace refactor::kernel diff --git a/src/04kernel/test/kernels/concat/test_cuda.cpp b/src/04kernel/test/kernels/concat/test_cuda.cpp index 96a05477..f0723324 100644 --- a/src/04kernel/test/kernels/concat/test_cuda.cpp +++ b/src/04kernel/test/kernels/concat/test_cuda.cpp @@ -30,11 +30,12 @@ TEST(kernel, ConcatCuda) { ASSERT_TRUE(kCpu && kernel); auto res = runtime::Resources(); auto rCpu = kCpu->lower(res).routine; - auto routine = kernel->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); // malloc res.fetchOrStore(Target(Target::NvidiaGpu).memManager()); auto memManager = res.fetch()->manager; Arc + workspace = mem_manager::ForeignBlob::share(memManager, workspaceSize), gpuIns[]{ mem_manager::ForeignBlob::share(memManager, inputTensors[0]->bytesSize()), mem_manager::ForeignBlob::share(memManager, inputTensors[1]->bytesSize()), @@ -64,7 +65,7 @@ TEST(kernel, ConcatCuda) { { void const *inputs[]{*gpuIns[0], *gpuIns[1], *gpuIns[2], *gpuIns[3]}; void *outputs[]{*gpuOut}; - routine(res, nullptr, inputs, outputs); + routine(res, *workspace, inputs, outputs); } { void const *inputs[]{cpuIns[0].data(), cpuIns[1].data(), cpuIns[2].data(), cpuIns[3].data()}; diff --git a/src/04kernel/test/kernels/reduce/test_cudnn.cpp b/src/04kernel/test/kernels/reduce/test_cudnn.cpp index d8e3b1cb..d31fd06d 100644 --- a/src/04kernel/test/kernels/reduce/test_cudnn.cpp +++ b/src/04kernel/test/kernels/reduce/test_cudnn.cpp @@ -15,20 +15,20 @@ static void testReducemean(const Shape &shape, const std::vector &data, ASSERT_TRUE(kernel); auto res = runtime::Resources(); res.fetchOrStore(Target(Target::NvidiaGpu).memManager()); - auto routine = kernel->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); // cuda malloc - auto gpuMemIn = mem_manager::ForeignBlob::share( - Target(Target::NvidiaGpu).memManager(), - dataTensor->bytesSize()); - auto gpuMemOut = mem_manager::ForeignBlob::share( - Target(Target::NvidiaGpu).memManager(), - dataTensor->bytesSize()); + auto manager = Target(Target::NvidiaGpu).memManager(); + auto workspace = mem_manager::ForeignBlob::share(manager, workspaceSize); + auto gpuMemIn = mem_manager::ForeignBlob::share(manager, dataTensor->bytesSize()); + auto gpuMemOut = mem_manager::ForeignBlob::share(manager, dataTensor->bytesSize()); // put input output data gpuMemIn->copyIn(data.data(), dataTensor->bytesSize()); - void const *inputs[]{*gpuMemIn}; - void *outputs[]{*gpuMemOut}; // inference - routine(res, nullptr, inputs, outputs); + { + void const *inputs[]{*gpuMemIn}; + void *outputs[]{*gpuMemOut}; + routine(res, *workspace, inputs, outputs); + } // take output data Shape outDimArray; std::unordered_set axesSet(axes.begin(), axes.end()); diff --git a/src/04kernel/test/kernels/split/test_cuda.cpp b/src/04kernel/test/kernels/split/test_cuda.cpp index 544e341b..5e417bce 100644 --- a/src/04kernel/test/kernels/split/test_cuda.cpp +++ b/src/04kernel/test/kernels/split/test_cuda.cpp @@ -30,11 +30,12 @@ TEST(kernel, SplitCuda) { ASSERT_TRUE(kCpu && kernel); auto res = runtime::Resources(); auto rCpu = kCpu->lower(res).routine; - auto routine = kernel->lower(res).routine; + auto [routine, workspaceSize] = kernel->lower(res); // malloc res.fetchOrStore(Target(Target::NvidiaGpu).memManager()); auto memManager = res.fetch()->manager; Arc + workspace = mem_manager::ForeignBlob::share(memManager, workspaceSize), gpuIn = mem_manager::ForeignBlob::share(memManager, dataTensor->bytesSize()), gpuOuts[]{ mem_manager::ForeignBlob::share(memManager, outputTensors[0]->bytesSize()), @@ -63,7 +64,7 @@ TEST(kernel, SplitCuda) { { void const *inputs[]{*gpuIn}; void *outputs[]{*gpuOuts[0], *gpuOuts[1], *gpuOuts[2], *gpuOuts[3]}; - routine(res, nullptr, inputs, outputs); + routine(res, *workspace, inputs, outputs); } { void const *inputs[]{data.data()};