Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/bugfix #3007

Merged
merged 3 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion express/module/StaticModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,6 @@ ErrorCode StaticModule::_resize(const std::vector<Express::VARP>& inputs) {
if (needResize) {
mSession->setNeedResize();
}
code = mSession->resize();
if (!needResize) {
// Check if output is used by other vars. If used, must realloc output to avoid the content dirty for output vars
// If resized, the output's memory will be all released in Session::resize, don't need clear here
Expand All @@ -489,6 +488,7 @@ ErrorCode StaticModule::_resize(const std::vector<Express::VARP>& inputs) {
}
}
}
code = mSession->resize();
} else {
// Resize
for (int i = 0; i < inputs.size(); ++i) {
Expand Down
100 changes: 53 additions & 47 deletions source/backend/cpu/CPURaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
// Classifies the three dimensions of a raster region into "reduce" dims
// (size > 1 but dst stride == 0, i.e. many source elements fold into one
// destination element) and "normal" dims. Used by CPURaster to decide
// whether a region describes a reduction and which axes it runs over.
struct ReduceInfo {
    int reduceMask[3] = {0, 0, 0}; // 1 if dim i is a reduce dim, else 0
    int reduceNum = 0;             // number of reduce dims found (0..3)
    int reduceIndex[3];            // indices of the reduce dims, in order
    int normalIndex[3];            // indices of the non-reduce dims, in order
    int normalNum = 0;             // number of non-reduce dims (0..3)
    // Recompute the classification for `slice`. Returns true when at least
    // one reduce dim exists. Safe to call repeatedly on the same instance:
    // all outputs, including reduceMask, are reset on every call (the
    // original left stale reduceMask bits behind when an instance was
    // reused across regions, as CPURaster::onResize does in a loop).
    bool compute(const Tensor::InsideDescribe::Region& slice) {
        normalNum = 0;
        reduceNum = 0;
        for (int i = 0; i < 3; ++i) {
            reduceMask[i] = 0; // reset before classifying this call's slice
            if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
                reduceMask[i] = 1;
                reduceIndex[reduceNum] = i;
                reduceNum++;
            } else {
                MNN_ASSERT(normalNum < 3);
                normalIndex[normalNum] = i;
                normalNum++;
            }
        }
        return reduceNum > 0;
    }
};

ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(outputs.size() == 1);
Expand Down Expand Up @@ -138,7 +164,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
}
}
auto cache = static_cast<CPUBackend*>(backend())->getCache();
#if 1
auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) {
Expand All @@ -160,22 +185,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
#else
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto tempTensor = newTensor.get();
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
cache->pushCacheTensor(newTensor, origin, midFormat);
#endif
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
for (auto t : forRelease) {
Expand All @@ -185,7 +194,15 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mTempInputCopy.size() == 1 && threadNumber > 1) {
mHasReduce = false;
ReduceInfo reduceInfo;
for (auto& iter : mTempInputCopy) {
if (reduceInfo.compute(*iter.second)) {
mHasReduce = true;
break;
}
}
if (mTempInputCopy.size() == 1 && threadNumber > 1 && (!mHasReduce)) {
// Split to multi region
auto region = mTempInputCopy[0].second;
const int thredHold = 100;//TODO: Find better way to determine it
Expand Down Expand Up @@ -396,25 +413,11 @@ static void _zero(const Tensor::InsideDescribe::Region& slice, int bytes, uint8_
}
}
static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
int reduceMask[3] = {0, 0, 0};
int reduceNum = 0;
int reduceIndex[3];
int normalIndex[3];
int normalNum = 0;
for (int i=0; i<3; ++i) {
if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
reduceMask[i] = 1;
reduceIndex[reduceNum] = i;
reduceNum ++;
} else {
normalIndex[normalNum] = i;
normalNum++;
}
}
if (0 == reduceNum) {
return false;
}
switch (reduceNum) {
ReduceInfo reduceInfo;
reduceInfo.compute(slice);
auto normalIndex = reduceInfo.normalIndex;
auto reduceIndex = reduceInfo.reduceIndex;
switch (reduceInfo.reduceNum) {
case 3:
{
float summer = 0.0f;
Expand Down Expand Up @@ -490,14 +493,13 @@ static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes,
return false;
}

static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr, bool hasReduce) {
auto proc = _selectUnitProc(bytes, slice.src.stride[2], slice.dst.stride[2]);
#define MNN_BLIT_SUPPORT_REDUCE
#ifdef MNN_BLIT_SUPPORT_REDUCE
if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
return;
if (hasReduce) {
if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
return;
}
}
#endif
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
for (int z=0; z<slice.size[0]; ++z) {
auto srcZ = srcPtr + z * slice.src.stride[0] * bytes;
Expand Down Expand Up @@ -624,13 +626,17 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
tensorConvert(iter.first, iter.second, bytes);
}
threadNum = ALIMIN(threadNum, (int)mTempInputCopy.size());
if (mHasReduce) {
// Don't support reduce with multi thread now
threadNum = 1;
}
MNN_CONCURRENCY_BEGIN(tId, threadNum) {
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
_blit(slice, bytes, srcPtr, dstPtr);
_blit(slice, bytes, srcPtr, dstPtr, mHasReduce);
}
}
MNN_CONCURRENCY_END();
Expand Down Expand Up @@ -807,7 +813,7 @@ class CPULoop : public Execution {
if (halide_type_float == input->getType().code) {
bytes = cpubackend->functions()->bytes;
}
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>());
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>(), false);
}

}
Expand Down Expand Up @@ -855,7 +861,7 @@ class CPULoop : public Execution {
auto dstOffset = dstIter * step0 + dstView->offset();
if (dstOffset >= 0) {
if (srcOffset >= 0 && srcOffset < inputSize) {
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset);
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset, false);
} else {
_zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset);
}
Expand Down Expand Up @@ -921,7 +927,7 @@ class CPULoop : public Execution {
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst);
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst, false);
break;
}
auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode());
Expand Down
1 change: 1 addition & 0 deletions source/backend/cpu/CPURaster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class CPURaster : public Execution {
OpCommonUtils::TensorConvertParameter mSingleConvert;
std::vector<std::shared_ptr<Tensor::InsideDescribe::Region>> mCacheRegions;
int32_t mZeroPoint = 0;
bool mHasReduce = false;
};
}
#endif
3 changes: 3 additions & 0 deletions source/backend/opencl/core/OpenCLBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ void CLRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vec
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
// Runtime reset hook invoked when the user re-configures an existing runtime.
// For OpenCL, `numberThread` is reinterpreted as the GPU mode bitmask and
// forwarded to the runtime; `config` and `full` are currently ignored.
void CLRuntime::onReset(int numberThread, const BackendConfig* config, bool full) {
mOpenCLRuntime->setGpuMode(numberThread);
}

bool CLRuntime::onSetCache(const void* buffer, size_t size) {
if (nullptr == buffer) {
Expand Down
1 change: 1 addition & 0 deletions source/backend/opencl/core/OpenCLBackend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class CLRuntime : public Runtime {
virtual ~CLRuntime();

virtual Backend* onCreate(const BackendConfig* config) const override;
virtual void onReset(int numberThread, const BackendConfig* config, bool full) override;
virtual void onGabageCollect(int level) override;
virtual float onGetMemoryInMB() override;
virtual std::pair<const void*, size_t> onGetCache() override;
Expand Down
2 changes: 1 addition & 1 deletion source/backend/opencl/core/runtime/OpenCLRuntime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,11 @@ class OpenCLRuntime {

std::pair<const void*, size_t> makeCache(void* tuneInfo);
bool setCache(std::pair<const void*, size_t> cache);
void setGpuMode(const int cl_mode_num);
private:
bool loadProgram(const std::string &programName, cl::Program *program);
bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);
void setGpuMode(const int cl_mode_num);

private:
std::vector<size_t> mMaxImageSize;
Expand Down
8 changes: 8 additions & 0 deletions source/core/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -933,6 +933,14 @@ ErrorCode Pipeline::fixResizeCache() {
break;
}
}
if (mOutputStatic) {
for (auto t : cmd.workOutputs) {
if (TensorUtils::getDescribe(t)->usage != Tensor::InsideDescribe::NORMAL) {
cmd.group = 0;
break;
}
}
}
}
if (1 == cmd.group) {
fixNumber++;
Expand Down
43 changes: 18 additions & 25 deletions source/core/TensorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,15 +544,30 @@ static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSi
dstMin[i] = ALIMAX(0, -o[i]);
dstMax[i] = ALIMIN(srcSize[i]-o[i], dstSize[i]);
}
int srcMin = -1;
for (int i=0; i<sizeNum; ++i) {
if (dstMax[i] < srcSize[i]) {
if (srcMin == -1) {
srcMin = stride[i];
} else {
srcMin = ALIMIN(stride[i], srcMin);
}
}
}
if (srcMin < 0) {
// Src is fully used
return true;
}

// Check If dstMax is inside src, it means one region can't describe dst - src
// TODO: Support slice region to support fuse
for (int i=0; i<sizeNum; ++i) {
if (dstMax[i] == dstSize[i]) {
continue;
}
int bias = offsetBias + dstMax[i] * stride[i];
if (bias < srcMax) {
// for [dstMax, dstSize], exist value match formula
if (bias < srcMax && bias >= srcMin) {
// for [dstMax, dstSize], may exist value match formula
return false;
}
}
Expand All @@ -578,7 +593,6 @@ class TensorUtils::FuseRegionStatus {
public:
enum Status {
FUSE_SRC_COPY,
FUSE_DST_COPY,
FUSE_REGION_COMPUTE
};
void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
Expand All @@ -587,16 +601,6 @@ class TensorUtils::FuseRegionStatus {
dstReg.origin = srcReg.origin;
dstReg.src.offset += srcReg.src.offset - srcReg.dst.offset;
break;
case FUSE_DST_COPY:
dstReg.origin = srcReg.origin;
dstReg.dst = srcReg.dst;
dstReg.src = srcReg.src;
dstReg.src.offset = mSrcOff;
dstReg.dst.offset = mDstOff;
dstReg.size[0] = srcReg.size[0];
dstReg.size[1] = srcReg.size[1];
dstReg.size[2] = srcReg.size[2];
break;
case FUSE_REGION_COMPUTE:
{
if (dstSize[0] == 0) {
Expand All @@ -612,7 +616,7 @@ class TensorUtils::FuseRegionStatus {
int valid[3] = {0, 0, 0};
int offset = 3 - dstNum;
if (dstNum > sizeNum) {
for (int i = 2; i >= 0; i--) {
for (int i = dstNum - 1; i >= 0; i--) {
if (i < dstNum) {
if (dstSize[i] == 1) {
expandIdx = i;
Expand Down Expand Up @@ -691,17 +695,6 @@ class TensorUtils::FuseRegionStatus {
mStatus = FUSE_SRC_COPY;
return true;
}
// dst copy fuse
if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize && copyValid) {
mSrcOff = dstReg.src.offset - srcReg.dst.offset;
mDstOff = dstReg.dst.offset;
mSrcOff = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset;
if (!(srcReg.src.stride[2] > 0 && mSrcOff % srcReg.src.stride[2] != 0)) {
// when transpose + slice, offset is not align can't fuse
mStatus = FUSE_DST_COPY;
return true;
}
}
#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; }
MNN_3_INT_INIT(dstStride, -1)
MNN_3_INT_INIT(srcStride, -1)
Expand Down
6 changes: 5 additions & 1 deletion test/core/RegionFuse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class RegionFuseTest : public MNNTestCase {
using Region = Tensor::InsideDescribe::Region;
virtual ~RegionFuseTest() = default;
virtual bool run(int precision) {
constexpr int N = 11;
constexpr int N = 12;
// [src_offset, src_stride_0_1_2, dst_offset, dst_stride_0_1_2, size_0_1_2]
int data[N*3][11] = {
// 2D-transpose + 2D-transpose = memcpy: [1, 4, 16] => [1, 16, 4] => [1, 4, 16]
Expand Down Expand Up @@ -64,6 +64,10 @@ class RegionFuseTest : public MNNTestCase {
{0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111},
{113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
// concat + stack
{0, 32, 1, 1, 32, 64, 1, 1, 20, 32, 1},
{0, 0, 1280, 1, 0, 1, 640, 1, 1, 1, 640},
{0, 0, 32, 1, 32, 0, 64, 1, 1, 10, 32},
};
TensorUtils::FuseWrap fuseUtils;
for (int i = 0; i < N; i++) {
Expand Down
Loading
Loading