Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/bugfix #3007

Merged
merged 3 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion express/module/StaticModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,6 @@ ErrorCode StaticModule::_resize(const std::vector<Express::VARP>& inputs) {
if (needResize) {
mSession->setNeedResize();
}
code = mSession->resize();
if (!needResize) {
// Check if output is used by other vars. If used, must realloc output to avoid the content dirty for output vars
// If resized, the output's memory will be all released in Session::resize, don't need clear here
Expand All @@ -489,6 +488,7 @@ ErrorCode StaticModule::_resize(const std::vector<Express::VARP>& inputs) {
}
}
}
code = mSession->resize();
} else {
// Resize
for (int i = 0; i < inputs.size(); ++i) {
Expand Down
100 changes: 53 additions & 47 deletions source/backend/cpu/CPURaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@

using Vec4 = MNN::Math::Vec<float, 4>;
namespace MNN {
// Classifies the three dimensions of a raster region into "reduce" dims
// (size > 1 but dst stride == 0, i.e. many source elements fold into one
// destination element) and "normal" dims. Used by CPURaster to decide
// whether a region describes a reduction and which axes it runs over.
struct ReduceInfo {
    int reduceMask[3] = {0, 0, 0}; // 1 if dim i is a reduce dim, else 0
    int reduceNum = 0;             // number of reduce dims found (0..3)
    int reduceIndex[3];            // indices of the reduce dims, in order
    int normalIndex[3];            // indices of the non-reduce dims, in order
    int normalNum = 0;             // number of non-reduce dims (0..3)
    // Recompute the classification for `slice`. Returns true when at least
    // one reduce dim exists. Safe to call repeatedly on the same instance:
    // all outputs, including reduceMask, are reset on every call (the
    // original left stale reduceMask bits behind when an instance was
    // reused across regions, as CPURaster::onResize does in a loop).
    bool compute(const Tensor::InsideDescribe::Region& slice) {
        normalNum = 0;
        reduceNum = 0;
        for (int i = 0; i < 3; ++i) {
            reduceMask[i] = 0; // reset before classifying this call's slice
            if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
                reduceMask[i] = 1;
                reduceIndex[reduceNum] = i;
                reduceNum++;
            } else {
                MNN_ASSERT(normalNum < 3);
                normalIndex[normalNum] = i;
                normalNum++;
            }
        }
        return reduceNum > 0;
    }
};

ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std::vector<Tensor *> &outputs) {
MNN_ASSERT(outputs.size() == 1);
Expand Down Expand Up @@ -138,7 +164,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
}
}
auto cache = static_cast<CPUBackend*>(backend())->getCache();
#if 1
auto tempTensor = cache->findCacheTensor(origin, midFormat);
//MNN_ASSERT(CPUBackend::getBytes(backend(), origin) == 4);
if (nullptr == tempTensor) {
Expand All @@ -160,22 +185,6 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
if (--TensorUtils::getDescribe(tempTensor)->useCount == 0) {
forRelease.emplace_back(tempTensor);
}
#else
std::shared_ptr<Tensor> newTensor(new Tensor);
TensorUtils::copyShape(origin, newTensor.get());
TensorUtils::getDescribe(newTensor.get())->dimensionFormat = midFormat;
TensorUtils::getDescribe(newTensor.get())->quantAttr = TensorUtils::getDescribe(origin)->quantAttr;
newTensor->buffer().type = origin->getType();
TensorUtils::setLinearLayout(newTensor.get());
mTempInput.insert(std::make_pair(origin, newTensor.get()));
auto res = backend()->onAcquireBuffer(newTensor.get(), Backend::DYNAMIC);
if (!res) {
return OUT_OF_MEMORY;
}
auto tempTensor = newTensor.get();
backend()->onReleaseBuffer(tempTensor, Backend::DYNAMIC);
cache->pushCacheTensor(newTensor, origin, midFormat);
#endif
mTempInputCopy.emplace_back(std::make_pair(tempTensor, &slice));
}
for (auto t : forRelease) {
Expand All @@ -185,7 +194,15 @@ ErrorCode CPURaster::onResize(const std::vector<Tensor *> &____inputs, const std
backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC);
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mTempInputCopy.size() == 1 && threadNumber > 1) {
mHasReduce = false;
ReduceInfo reduceInfo;
for (auto& iter : mTempInputCopy) {
if (reduceInfo.compute(*iter.second)) {
mHasReduce = true;
break;
}
}
if (mTempInputCopy.size() == 1 && threadNumber > 1 && (!mHasReduce)) {
// Split to multi region
auto region = mTempInputCopy[0].second;
const int thredHold = 100;//TODO: Find better way to determine it
Expand Down Expand Up @@ -396,25 +413,11 @@ static void _zero(const Tensor::InsideDescribe::Region& slice, int bytes, uint8_
}
}
static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
int reduceMask[3] = {0, 0, 0};
int reduceNum = 0;
int reduceIndex[3];
int normalIndex[3];
int normalNum = 0;
for (int i=0; i<3; ++i) {
if (slice.size[i] > 1 && slice.dst.stride[i] == 0) {
reduceMask[i] = 1;
reduceIndex[reduceNum] = i;
reduceNum ++;
} else {
normalIndex[normalNum] = i;
normalNum++;
}
}
if (0 == reduceNum) {
return false;
}
switch (reduceNum) {
ReduceInfo reduceInfo;
reduceInfo.compute(slice);
auto normalIndex = reduceInfo.normalIndex;
auto reduceIndex = reduceInfo.reduceIndex;
switch (reduceInfo.reduceNum) {
case 3:
{
float summer = 0.0f;
Expand Down Expand Up @@ -490,14 +493,13 @@ static bool _reduceblit(const Tensor::InsideDescribe::Region& slice, int bytes,
return false;
}

static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr) {
static void _blit(const Tensor::InsideDescribe::Region& slice, int bytes, const uint8_t* srcPtr, uint8_t* dstPtr, bool hasReduce) {
auto proc = _selectUnitProc(bytes, slice.src.stride[2], slice.dst.stride[2]);
#define MNN_BLIT_SUPPORT_REDUCE
#ifdef MNN_BLIT_SUPPORT_REDUCE
if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
return;
if (hasReduce) {
if (_reduceblit(slice, bytes, srcPtr, dstPtr)) {
return;
}
}
#endif
if (slice.src.stride[1] == slice.size[2] && slice.dst.stride[1] == slice.size[2] && slice.src.stride[2] == 1) {
for (int z=0; z<slice.size[0]; ++z) {
auto srcZ = srcPtr + z * slice.src.stride[0] * bytes;
Expand Down Expand Up @@ -624,13 +626,17 @@ ErrorCode CPURaster::onExecute(const std::vector<Tensor *> &____inputs, const st
tensorConvert(iter.first, iter.second, bytes);
}
threadNum = ALIMIN(threadNum, (int)mTempInputCopy.size());
if (mHasReduce) {
// Don't support reduce with multi thread now
threadNum = 1;
}
MNN_CONCURRENCY_BEGIN(tId, threadNum) {
for (int u=tId; u<mTempInputCopy.size(); u+=threadNum) {
auto& iter = mTempInputCopy[u];
auto& slice = *(iter.second);
auto srcPtr = iter.first->host<uint8_t>() + slice.src.offset * bytes;
auto dstPtr = (uint8_t*)mOutputPtr + slice.dst.offset * bytes;
_blit(slice, bytes, srcPtr, dstPtr);
_blit(slice, bytes, srcPtr, dstPtr, mHasReduce);
}
}
MNN_CONCURRENCY_END();
Expand Down Expand Up @@ -807,7 +813,7 @@ class CPULoop : public Execution {
if (halide_type_float == input->getType().code) {
bytes = cpubackend->functions()->bytes;
}
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>());
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>(), false);
}

}
Expand Down Expand Up @@ -855,7 +861,7 @@ class CPULoop : public Execution {
auto dstOffset = dstIter * step0 + dstView->offset();
if (dstOffset >= 0) {
if (srcOffset >= 0 && srcOffset < inputSize) {
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset);
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset, false);
} else {
_zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset);
}
Expand Down Expand Up @@ -921,7 +927,7 @@ class CPULoop : public Execution {
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst);
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst, false);
break;
}
auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode());
Expand Down
1 change: 1 addition & 0 deletions source/backend/cpu/CPURaster.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class CPURaster : public Execution {
OpCommonUtils::TensorConvertParameter mSingleConvert;
std::vector<std::shared_ptr<Tensor::InsideDescribe::Region>> mCacheRegions;
int32_t mZeroPoint = 0;
bool mHasReduce = false;
};
}
#endif
3 changes: 3 additions & 0 deletions source/backend/opencl/core/OpenCLBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ void CLRuntime::onMaskOpReady(const std::vector<Tensor*>& inputs, const std::vec
dstInfo->mInfos.emplace_back(std::move(opInfo));
}
}
// Runtime reset hook invoked when the user re-configures an existing runtime.
// For OpenCL, `numberThread` is reinterpreted as the GPU mode bitmask and
// forwarded to the runtime; `config` and `full` are currently ignored.
void CLRuntime::onReset(int numberThread, const BackendConfig* config, bool full) {
mOpenCLRuntime->setGpuMode(numberThread);
}

bool CLRuntime::onSetCache(const void* buffer, size_t size) {
if (nullptr == buffer) {
Expand Down
1 change: 1 addition & 0 deletions source/backend/opencl/core/OpenCLBackend.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class CLRuntime : public Runtime {
virtual ~CLRuntime();

virtual Backend* onCreate(const BackendConfig* config) const override;
virtual void onReset(int numberThread, const BackendConfig* config, bool full) override;
virtual void onGabageCollect(int level) override;
virtual float onGetMemoryInMB() override;
virtual std::pair<const void*, size_t> onGetCache() override;
Expand Down
2 changes: 1 addition & 1 deletion source/backend/opencl/core/runtime/OpenCLRuntime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,11 @@ class OpenCLRuntime {

std::pair<const void*, size_t> makeCache(void* tuneInfo);
bool setCache(std::pair<const void*, size_t> cache);
void setGpuMode(const int cl_mode_num);
private:
bool loadProgram(const std::string &programName, cl::Program *program);
bool buildProgram(const std::string &buildOptionsStr, cl::Program *program);
bool getDeviceSupportsExtension(const cl::Device &device, const char *extensionName);
void setGpuMode(const int cl_mode_num);

private:
std::vector<size_t> mMaxImageSize;
Expand Down
8 changes: 8 additions & 0 deletions source/core/Pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -933,6 +933,14 @@ ErrorCode Pipeline::fixResizeCache() {
break;
}
}
if (mOutputStatic) {
for (auto t : cmd.workOutputs) {
if (TensorUtils::getDescribe(t)->usage != Tensor::InsideDescribe::NORMAL) {
cmd.group = 0;
break;
}
}
}
}
if (1 == cmd.group) {
fixNumber++;
Expand Down
43 changes: 18 additions & 25 deletions source/core/TensorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,15 +544,30 @@ static bool _ClipDst(int* stride, int srcOffset, int dstOffset, const int* srcSi
dstMin[i] = ALIMAX(0, -o[i]);
dstMax[i] = ALIMIN(srcSize[i]-o[i], dstSize[i]);
}
int srcMin = -1;
for (int i=0; i<sizeNum; ++i) {
if (dstMax[i] < srcSize[i]) {
if (srcMin == -1) {
srcMin = stride[i];
} else {
srcMin = ALIMIN(stride[i], srcMin);
}
}
}
if (srcMin < 0) {
// Src is fully used
return true;
}

// Check If dstMax is inside src, it means one region can't describe dst - src
// TODO: Support slice region to support fuse
for (int i=0; i<sizeNum; ++i) {
if (dstMax[i] == dstSize[i]) {
continue;
}
int bias = offsetBias + dstMax[i] * stride[i];
if (bias < srcMax) {
// for [dstMax, dstSize], exist value match formula
if (bias < srcMax && bias >= srcMin) {
// for [dstMax, dstSize], may exist value match formula
return false;
}
}
Expand All @@ -578,7 +593,6 @@ class TensorUtils::FuseRegionStatus {
public:
enum Status {
FUSE_SRC_COPY,
FUSE_DST_COPY,
FUSE_REGION_COMPUTE
};
void apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) {
Expand All @@ -587,16 +601,6 @@ class TensorUtils::FuseRegionStatus {
dstReg.origin = srcReg.origin;
dstReg.src.offset += srcReg.src.offset - srcReg.dst.offset;
break;
case FUSE_DST_COPY:
dstReg.origin = srcReg.origin;
dstReg.dst = srcReg.dst;
dstReg.src = srcReg.src;
dstReg.src.offset = mSrcOff;
dstReg.dst.offset = mDstOff;
dstReg.size[0] = srcReg.size[0];
dstReg.size[1] = srcReg.size[1];
dstReg.size[2] = srcReg.size[2];
break;
case FUSE_REGION_COMPUTE:
{
if (dstSize[0] == 0) {
Expand All @@ -612,7 +616,7 @@ class TensorUtils::FuseRegionStatus {
int valid[3] = {0, 0, 0};
int offset = 3 - dstNum;
if (dstNum > sizeNum) {
for (int i = 2; i >= 0; i--) {
for (int i = dstNum - 1; i >= 0; i--) {
if (i < dstNum) {
if (dstSize[i] == 1) {
expandIdx = i;
Expand Down Expand Up @@ -691,17 +695,6 @@ class TensorUtils::FuseRegionStatus {
mStatus = FUSE_SRC_COPY;
return true;
}
// dst copy fuse
if (isCopyRegion(dstReg) && dstTotalSize == srcTotalSize && copyValid) {
mSrcOff = dstReg.src.offset - srcReg.dst.offset;
mDstOff = dstReg.dst.offset;
mSrcOff = offsetCompute(srcReg, dstReg.src.offset, srcReg.dst.offset, true) + srcReg.src.offset;
if (!(srcReg.src.stride[2] > 0 && mSrcOff % srcReg.src.stride[2] != 0)) {
// when transpose + slice, offset is not align can't fuse
mStatus = FUSE_DST_COPY;
return true;
}
}
#define MNN_3_INT_INIT(x, y) { x[0] = y; x[1] = y; x[2] = y; }
MNN_3_INT_INIT(dstStride, -1)
MNN_3_INT_INIT(srcStride, -1)
Expand Down
6 changes: 5 additions & 1 deletion test/core/RegionFuse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class RegionFuseTest : public MNNTestCase {
using Region = Tensor::InsideDescribe::Region;
virtual ~RegionFuseTest() = default;
virtual bool run(int precision) {
constexpr int N = 11;
constexpr int N = 12;
// [src_offset, src_stride_0_1_2, dst_offset, dst_stride_0_1_2, size_0_1_2]
int data[N*3][11] = {
// 2D-transpose + 2D-transpose = memcpy: [1, 4, 16] => [1, 16, 4] => [1, 4, 16]
Expand Down Expand Up @@ -64,6 +64,10 @@ class RegionFuseTest : public MNNTestCase {
{0, 12321, 111, 1, 0, 12544, 112, 1, 32, 111, 111},
{113, 12544, 112, 1, 0, 12321, 111, 1, 32, 111, 111},
{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
// concat + stack
{0, 32, 1, 1, 32, 64, 1, 1, 20, 32, 1},
{0, 0, 1280, 1, 0, 1, 640, 1, 1, 1, 640},
{0, 0, 32, 1, 32, 0, 64, 1, 1, 10, 32},
};
TensorUtils::FuseWrap fuseUtils;
for (int i = 0; i < N; i++) {
Expand Down
Loading
Loading