From 15633730bf7e636bda98a38a87a7daf6ed74093e Mon Sep 17 00:00:00 2001
From: i-evi
Date: Sun, 4 Oct 2020 17:11:38 +0800
Subject: [PATCH] fix bug

---
 demo/lenet.lua  |  2 +-
 src/cc_basic.c  |  5 +--
 src/cc_dsc2d.c  | 26 ++++---------
 src/cc_normfn.c |  4 +-
 util/lua2cc.lua | 97 ++++++++++++++++++++++---------------------
 5 files changed, 55 insertions(+), 79 deletions(-)

diff --git a/demo/lenet.lua b/demo/lenet.lua
index 38a44b9..079ed19 100644
--- a/demo/lenet.lua
+++ b/demo/lenet.lua
@@ -12,7 +12,7 @@
 	l4 = conv2d ({input = "l3", stride = 1, padding = 2}),
 	l5 = relu ({input = "l4"}),
 	l6 = maxPool2d ({input = "l5", stride = 2}),
-	l7 = reshape ({input = "l6", shape = {-1, 1}}),
+	l7 = reshape ({input = "l6", shape = {-1, 1, 1}}),
 	l8 = fullyConnected ({input = "l7"}),
 	l9 = relu ({input = "l8"}),
 	l10 = fullyConnected ({input = "l9"}),
diff --git a/src/cc_basic.c b/src/cc_basic.c
index 88188f3..df319bf 100644
--- a/src/cc_basic.c
+++ b/src/cc_basic.c
@@ -72,10 +72,7 @@ void cc_tensor_shape_fix(cc_int32 *shape, cc_int32 elems)
 		s *= v;
 		sptr++;
 	}
-	if (s != elems) {
-#ifdef ENABLE_CC_ASSERT
-		cc_assert(f);
-#endif
+	if (s != elems || f) {
 		shape[i] = elems / s;
 	}
 }
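NOTE: the lenet.lua and cc_basic.c hunks above are linked: l7 now reshapes to
the rank-3 {-1, 1, 1} shape that the CNN2D tensors carry, and
cc_tensor_shape_fix() fills the wildcard slot whenever a -1 was seen (f), not
only when the partial product disagrees with the element count. A standalone
sketch of that inference (shape_fix and the sizes below are hypothetical, not
the library's API):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Infer a single -1 ("wildcard") entry from the element count:
	 * divide by the product of the known dimensions. */
	static void shape_fix(int32_t *shape, int32_t ndim, int32_t elems)
	{
		int32_t i, wild = -1, s = 1;
		for (i = 0; i < ndim; ++i) {
			if (shape[i] == -1)
				wild = i;        /* remember the wildcard slot */
			else
				s *= shape[i];   /* product of the known dims */
		}
		if (wild >= 0)
			shape[wild] = elems / s;
		assert(s * (wild >= 0 ? shape[wild] : 1) == elems);
	}

	int main(void)
	{
		int32_t shape[3] = {-1, 1, 1}; /* shape lenet.lua now requests */
		shape_fix(shape, 3, 400);      /* e.g. a 400-element activation */
		printf("%d %d %d\n", shape[0], shape[1], shape[2]); /* 400 1 1 */
		return 0;
	}

Dividing the element count by the product of the known dimensions is what
makes a {-1, 1, 1} request valid for any flattened layer size.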
diff --git a/src/cc_dsc2d.c b/src/cc_dsc2d.c
index 946fb71..101d329 100644
--- a/src/cc_dsc2d.c
+++ b/src/cc_dsc2d.c
@@ -19,16 +19,14 @@
 #include "global_fn_cfg.h"
 extern fn_conv2d _conv2d;
 extern fn_array_add_ew _array_add_ew;
-extern fn_array_mul_by _array_add_by;
+extern fn_array_mul_by _array_mul_by;
 
 cc_tensor_t *cc_dw_conv2d(cc_tensor_t *inp,
 	const cc_tensor_t *kernel, const cc_tensor_t *bias,
 	cc_int32 s, cc_int32 p, cc_int32 off, const char *name)
 {
-	cc_uint8 *omp_out_buf = NULL;
 	cc_tensor_t *inp_pad, *oup = NULL;
-	cc_int32 o_ch_size, p_ch_mem_size, o_ch_mem_size,
-		k_ch_mem_size, k_mem_size, num_omp_threads, i;
+	cc_int32 o_ch_size, p_ch_mem_size, o_ch_mem_size, k_ch_mem_size, i;
 	cc_int32 shape[CC_CNN2D_SHAPE] = {0};
 	char pad_name[CC_CONV2D_PAD_NAME_LEN];
 #ifdef ENABLE_CC_ASSERT
@@ -36,7 +34,7 @@ cc_tensor_t *cc_dw_conv2d(cc_tensor_t *inp,
 	cc_assert_zero(cc_tensor_dimension(kernel) - CC_CONV2D_KERNEL_DIM);
 	cc_assert_zero(*inp->dtype - *kernel->dtype);
 	cc_assert_zero(inp->shape[CC_CNN2D_SHAPE_C]
-		- kernel->shape[CC_CONV2D_KERNEL_I]);
+		- kernel->shape[CC_CONV2D_KERNEL_O]);
 #endif
 	if (p) {
 		sprintf(pad_name, "%s%s",
@@ -49,7 +47,7 @@ cc_tensor_t *cc_dw_conv2d(cc_tensor_t *inp,
 	oup = cc_tsrmgr_get(name);
 #endif
 	if (!oup) {
-		shape[CC_CNN2D_SHAPE_C] = kernel->shape[CC_CONV2D_KERNEL_I];
+		shape[CC_CNN2D_SHAPE_C] = kernel->shape[CC_CONV2D_KERNEL_O];
 		shape[CC_CNN2D_SHAPE_H] = cc_conv2d_shape_calc(
 			inp->shape[CC_CNN2D_SHAPE_H],
 			kernel->shape[CC_CONV2D_KERNEL_H], s, p);
@@ -67,13 +65,6 @@ cc_tensor_t *cc_dw_conv2d(cc_tensor_t *inp,
 	k_ch_mem_size = kernel->shape[CC_CONV2D_KERNEL_W] *
 		kernel->shape[CC_CONV2D_KERNEL_H] *
 		cc_dtype_size(*kernel->dtype);
-	k_mem_size = k_ch_mem_size * kernel->shape[CC_CONV2D_KERNEL_I];
-	num_omp_threads = 1;
-#ifdef ENABLE_OPENMP
-	num_omp_threads = omp_get_max_threads();
-#endif
-	cc_assert_alloc(omp_out_buf =
-		(cc_uint8*)malloc(o_ch_mem_size * num_omp_threads));
 #ifdef AUTO_TSRMGR
 	memset(oup->data, 0,
 		list_getlen(oup->container, CC_TENSOR_DATA));
@@ -81,18 +72,17 @@ cc_tensor_t *cc_dw_conv2d(cc_tensor_t *inp,
 #ifdef ENABLE_OPENMP
 #pragma omp parallel for private(i)
 #endif
-	for (i = 0; i < kernel->shape[CC_CONV2D_KERNEL_I]; ++i) {
+	for (i = 0; i < kernel->shape[CC_CONV2D_KERNEL_O]; ++i) {
 		_conv2d((inp_pad->data + i * p_ch_mem_size),
 			oup->data + i * o_ch_mem_size,
 			inp_pad->shape[CC_CNN2D_SHAPE_W],
 			inp_pad->shape[CC_CNN2D_SHAPE_H],
 			oup->shape[CC_CNN2D_SHAPE_W],
 			oup->shape[CC_CNN2D_SHAPE_H], s, s,
-			kernel->data + (k_mem_size * i),
+			kernel->data + (k_ch_mem_size * i),
 			kernel->shape[CC_CONV2D_KERNEL_W],
 			*kernel->dtype);
 	}
-	free(omp_out_buf);
 	if (!bias){
 #ifndef AUTO_TSRMGR
 		if (p)
@@ -157,7 +147,7 @@ cc_tensor_t *cc_pw_conv2d(cc_tensor_t *inp, const cc_tensor_t *kernel,
 		for (j = 0; j < kernel->shape[CC_CONV2D_KERNEL_I]; ++j) {
 #ifdef ENABLE_OPENMP
-			_array_add_by(
+			_array_mul_by(
 				omp_out_buf + omp_get_thread_num() * o_ch_mem_size,
 				o_ch_size, inp->data + o_ch_mem_size * j,
 				kernel->data + k_mem_size * i + k_ch_mem_size * j,
@@ -167,7 +157,7 @@ cc_tensor_t *cc_pw_conv2d(cc_tensor_t *inp, const cc_tensor_t *kernel,
 				omp_out_buf + omp_get_thread_num() * o_ch_mem_size,
 				*oup->dtype);
 #else
-			_array_add_by(omp_out_buf, o_ch_size,
+			_array_mul_by(omp_out_buf, o_ch_size,
 				inp->data + o_ch_mem_size * j,
 				kernel->data + k_mem_size * i + k_ch_mem_size * j,
 				*oup->dtype);
diff --git a/src/cc_normfn.c b/src/cc_normfn.c
index fede905..337e219 100644
--- a/src/cc_normfn.c
+++ b/src/cc_normfn.c
@@ -64,8 +64,8 @@ cc_tensor_t *cc_batch_norm2d(cc_tensor_t *inp,
 	ch_size = inp->shape[CC_CNN2D_SHAPE_H] *
 		inp->shape[CC_CNN2D_SHAPE_W];
 	ch_mem_size = ch_size * dt_size;
-	for (i = 0; i < inp->shape[CC_CNN2D_SHAPE_C]; ++i) {
-		_batch_norm(inp->data + ch_mem_size * i, ch_size,
+	for (i = 0; i < oup->shape[CC_CNN2D_SHAPE_C]; ++i) {
+		_batch_norm(oup->data + ch_mem_size * i, ch_size,
 			para->data + CC_NORM_PARAMETERS * dt_size * i,
 			*para->dtype);
 	}
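NOTE: on the cc_dsc2d.c hunks above: a depthwise convolution gives every
input channel its own single-channel kh*kw filter, so channel i's weights
begin at k_ch_mem_size * i. The old stride, k_mem_size * i (the size of the
whole kernel buffer), read out of bounds for every i > 0, and the per-thread
scratch buffer it sized was never used in this path, so it could be dropped
along with it. The arithmetic, with made-up sizes:

	#include <stdint.h>
	#include <stdio.h>

	/* Kernel addressing in a depthwise conv2d (hypothetical sizes).
	 * Each of the O channels owns one kh*kw filter, so filter i
	 * starts at i * k_ch_mem_size; stepping by k_mem_size instead
	 * jumps a whole kernel buffer per channel. */
	int main(void)
	{
		int32_t kw = 3, kh = 3, o_channels = 8, dtype_size = 4;
		int32_t k_ch_mem_size = kw * kh * dtype_size;    /* one filter   */
		int32_t k_mem_size = k_ch_mem_size * o_channels; /* whole buffer */
		int32_t i;
		for (i = 0; i < o_channels; ++i)
			printf("ch %d: ok offset %5d, buggy offset %5d (buffer: %d bytes)\n",
				i, i * k_ch_mem_size, i * k_mem_size, k_mem_size);
		return 0;
	}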
diff --git a/util/lua2cc.lua b/util/lua2cc.lua
index 81b6328..37a22ac 100644
--- a/util/lua2cc.lua
+++ b/util/lua2cc.lua
@@ -1,4 +1,4 @@
-local parameterLv = 0 -- PKG: Begin at 0
+local parameterLv = 0 -- PKG: Begin at 0
 local shapeCounter = 0
 local layerCounter = 1 -- MUST Begin at 1
 local parameterCnt = 1 -- MUST Begin at 1
@@ -70,8 +70,8 @@ dwConv2d = function(args)
 		if type(scope) == "string" then
 			name = string.format("%s/%s", scope, name)
 		end
-		parals[info.paraId + 0] = string.format("%03d_w", info.paraLv)
-		parals[info.paraId + 1] = string.format("%03d_b", info.paraLv)
+		parals[info.paraId + 0] = string.format("%03d.w", info.paraLv)
+		parals[info.paraId + 1] = string.format("%03d.b", info.paraLv)
 		layerOutputs[ret.layerId] = output
 		if info.input == nil then
 			if info.layerId - 1 < 1 then
@@ -80,7 +80,7 @@ dwConv2d = function(args)
 			info.input = string.format("@%d", info.layerId - 1)
 		end
 		return string.format(
-			"%s = cc_conv2d(%s, __pls[%d], __pls[%d], %d, %d, %d, \"%s\");",
+			"%s = cc_dw_conv2d(%s, __pls[%d], __pls[%d], %d, %d, %d, \"%s\");",
 			output, info.input, info.paraId - 1, info.paraId,
 			info.stride, info.padding, info.offset, name)
 	end
@@ -100,8 +100,8 @@ pwConv2d = function(args)
 		if type(scope) == "string" then
 			name = string.format("%s/%s", scope, name)
 		end
-		parals[info.paraId + 0] = string.format("%03d_w", info.paraLv)
-		parals[info.paraId + 1] = string.format("%03d_b", info.paraLv)
+		parals[info.paraId + 0] = string.format("%03d.w", info.paraLv)
+		parals[info.paraId + 1] = string.format("%03d.b", info.paraLv)
 		layerOutputs[ret.layerId] = output
 		if info.input == nil then
 			if info.layerId - 1 < 1 then
@@ -129,8 +129,8 @@ fullyConnected = function(args)
 		if type(scope) == "string" then
 			name = string.format("%s/%s", scope, name)
 		end
-		parals[info.paraId + 0] = string.format("%03d_w", info.paraLv)
-		parals[info.paraId + 1] = string.format("%03d_b", info.paraLv)
+		parals[info.paraId + 0] = string.format("%03d.w", info.paraLv)
+		parals[info.paraId + 1] = string.format("%03d.b", info.paraLv)
 		layerOutputs[ret.layerId] = output
 		if info.input == nil then
 			if info.layerId - 1 < 1 then
@@ -243,7 +243,7 @@ batchNorm2d = function(args)
 		if type(scope) == "string" then
 			name = string.format("%s/%s", scope, name)
 		end
-		parals[info.paraId] = string.format("%03d_n", info.paraLv)
+		parals[info.paraId] = string.format("%03d.n", info.paraLv)
 		layerOutputs[ret.layerId] = output
 		if info.input == nil then
 			if info.layerId - 1 < 1 then
@@ -252,7 +252,7 @@ batchNorm2d = function(args)
 			info.input = string.format("@%d", info.layerId - 1)
 		end
 		return string.format(
-			"%s = cc_batch_norm(%s, __pls[%d], \"%s\");",
+			"%s = cc_batch_norm2d(%s, __pls[%d], \"%s\");",
 			output, info.input, info.paraId - 1, name)
 	end
 	return ret
@@ -288,63 +288,50 @@ reshape = function(args)
 	return ret
 end
 
-local fprint = function(fp, ...)
+local fputs = function(fp, ...)
 	local args = { ... }
-	local flag = false
 	for k, v in pairs(args) do
-		if not flag then
-			flag = true
-		else
-			fp:write('\t')
-		end
 		fp:write(v)
 	end
-	fp:write('\n')
 end
 
 local printLine = function(line, indent)
	if indent == nil then
 		indent = 0
 	end
+	local lineLimit = 80
 	local indentOff = indent * 8
-	local llen = #line + indentOff
 	local indentStr = string.rep("\t", indent)
-	line = indentStr..line
-	if llen <= 80 then
-		fprint(_ctrlfp, line)
-		return
-	end
-	local prev = 0
-	local curr = 0
+	local csr = 1
+	local brk = 0
+	local pos = 0
+	local nextword = ""
 	repeat
-		curr, _ = string.find(line, ',', prev + 1)
-		if curr == nil then
-			break
-		end
-		if (curr + indentOff) >= 80 then
-			local buf = string.sub(line, 1, prev)
-			fprint(_ctrlfp, buf)
-			line = string.sub(line, prev + 1)
-			if string.byte(line, 1) == 32 then
-				line = string.sub(line, 2)
-			end
+		if pos == 0 then
+			pos = indentOff
+			fputs(_ctrlfp, indentStr)
 		end
+		brk, _ = string.find(line, ',', csr)
+		if brk ~= nil then
+			nextword = string.sub(line, csr, brk)
+		else
+			nextword = string.sub(line, csr)
+		end
+		csr = csr + #nextword
+		if pos + #nextword >= lineLimit then
+			fputs(_ctrlfp, '\n')
+			pos = indentOff
+			fputs(_ctrlfp, indentStr)
 		end
-		prev = curr
-	until (#line + indentOff) < 80
-	if #line > 80 then
-		local looking = string.byte(',', 1)
-		for i = #line, 60, -1 do
-			if string.byte(line, i) == looking then
-				fprint(_ctrlfp, string.sub(line, 1, i))
-				line = string.sub(line, i + 1)
-				break
+		if pos == indentOff then
+			local off, _ = string.find(nextword, ' ')
+			if off == 1 then
+				nextword = string.sub(nextword, 2)
 			end
 		end
-	end
-	curr, _ = string.find(line, ' ')
-	if curr == 1 then
-		line = string.sub(line, 2)
-	end
-	line = indentStr..line
-	fprint(_ctrlfp, line)
+		fputs(_ctrlfp, string.format("%s", nextword))
+		pos = pos + #nextword
+	until csr > #line
+	fputs(_ctrlfp, '\n')
 end
 
 local runningFlag = true
@@ -418,7 +405,8 @@ ccCodeTranslator = function(net, cfg)
 		"static cc_tensor_t *__pls[%d];", #paraxList),
 		indentOff + 0)
 	local layerDef = "cc_tensor_t "
-	for k, v in pairs(createTsr) do
+	for k = 1, #createTsr do
+		local v = createTsr[k]
 		layerDef = layerDef..string.format("*%s, ", v)
 	end
 	layerDef = string.sub(layerDef, 1, #layerDef - 2)..";"
@@ -431,7 +419,8 @@ ccCodeTranslator = function(net, cfg)
 		"__pls[i] = cc_tsrmgr_get(p_namels[i]);"),
 		indentOff + 1)
 	printLine("}", indentOff + 0)
-	for k, v in pairs(codeLines) do
+	for k = 1, #codeLines do
+		local v = codeLines[k]
 		v = string.gsub(v, "@%d*,", function(s)
 			return string.format("%s,",
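NOTE: on the printLine() rewrite above: it streams comma-terminated words and
opens a fresh indented line before any word that would cross column 80; the
loop has to run until the cursor is strictly past the end of the string
(until csr > #line), otherwise a short trailing word after the last comma is
silently dropped. A rough C counterpart (print_wrapped is a hypothetical
helper, not part of the translator):

	#include <stdio.h>
	#include <string.h>

	/* Greedy wrap at commas: a "word" runs up to and including the
	 * next ','; break the line before a word that would pass the
	 * limit, and strip one leading space at each line start. */
	static void print_wrapped(FILE *fp, const char *line, int indent)
	{
		const int limit = 80;
		int i, off = indent * 8, pos = 0;
		const char *csr = line;

		while (*csr) {
			const char *brk = strchr(csr, ',');
			size_t len = brk ? (size_t)(brk - csr) + 1 : strlen(csr);
			if (pos == 0) {          /* first line: indent once */
				for (i = 0; i < indent; ++i)
					fputc('\t', fp);
				pos = off;
			}
			if (pos + (int)len >= limit && pos > off) {
				fputc('\n', fp); /* word would cross the limit */
				for (i = 0; i < indent; ++i)
					fputc('\t', fp);
				pos = off;
			}
			if (pos == off && *csr == ' ') {
				csr++;           /* strip one leading space */
				len--;
			}
			fwrite(csr, 1, len, fp);
			pos += (int)len;
			csr += len;
		}
		fputc('\n', fp);
	}

	int main(void)
	{
		print_wrapped(stdout, "cc_tensor_t *l1, *l2, *l3, *l4, *l5, "
			"*l6, *l7, *l8, *l9, *l10, *l11, *l12, *l13, *l14;", 1);
		return 0;
	}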