From 292c0fbe1e8c7e471480688faecc8b9c0eefe5df Mon Sep 17 00:00:00 2001
From: i-evi <evidence_john@outlook.com>
Date: Mon, 12 Oct 2020 01:59:01 +0800
Subject: [PATCH] ...

---
 demo/lua2vgg16.lua  |  50 ++++++
 makefile            |   4 +-
 src/cc_array.c      | 380 ++++++-----------------------------------
 src/cc_array.h      |   2 +-
 src/cc_basic.c      |  68 +++-----
 src/cc_conv2d.c     |  15 +-
 src/cc_cpufn.c      | 406 ++++++++++++++++++++++++++++++++++++++++++--
 src/cc_cpufn.h      |  45 +++++
 src/cc_fmap2d.c     |   4 +-
 src/cc_pad2d.c      |   3 +
 src/global_fn_cfg.c |  25 +--
 src/global_fn_cfg.h |  85 +++++-----
 util/lua2cc.lua     |   4 +-
 13 files changed, 639 insertions(+), 452 deletions(-)
 create mode 100644 demo/lua2vgg16.lua

diff --git a/demo/lua2vgg16.lua b/demo/lua2vgg16.lua
new file mode 100644
index 0000000..4ed7e53
--- /dev/null
+++ b/demo/lua2vgg16.lua
@@ -0,0 +1,50 @@
+loadfile("./util/lua2cc.lua")() 
+
+network = {
+    networkName  = "vgg16",
+    createScope  = "vgg16",
+    parameterLv  = 0,
+    inputLayers  = {"in"},
+    outputLayers = {"out"},
+    l1   = conv2d          ({input = "in",
+                              stride = 1, padding = 1}),
+    l2   = relu             ({}),
+    l3   = conv2d           ({stride = 1, padding = 1}),
+    l4   = relu             ({}),
+    l5   = maxPool2d        ({stride = 2}),
+    l6   = conv2d           ({stride = 1, padding = 1}),
+    l7   = relu             ({}),
+    l8   = conv2d           ({stride = 1, padding = 1}),
+    l9   = relu             ({}),
+    l10  = maxPool2d        ({stride = 2}),
+    l11  = conv2d           ({stride = 1, padding = 1}),
+    l12  = relu             ({}),
+    l13  = conv2d           ({stride = 1, padding = 1}),
+    l14  = relu             ({}),
+    l15  = conv2d           ({stride = 1, padding = 1}),
+    l16  = relu             ({}),
+    l17  = maxPool2d        ({stride = 2}),
+    l18  = conv2d           ({stride = 1, padding = 1}),
+    l19  = relu             ({}),
+    l20  = conv2d           ({stride = 1, padding = 1}),
+    l21  = relu             ({}),
+    l22  = conv2d           ({stride = 1, padding = 1}),
+    l23  = relu             ({}),
+    l24  = maxPool2d        ({stride = 2}),
+    l25  = conv2d           ({stride = 1, padding = 1}),
+    l26  = relu             ({}),
+    l27  = conv2d           ({stride = 1, padding = 1}),
+    l28  = relu             ({}),
+    l29  = conv2d           ({stride = 1, padding = 1}),
+    l30  = relu             ({}),
+    l31  = maxPool2d        ({stride = 2}),
+    l32  = reshape          ({shape = {-1, 1, 1}}),
+    l33  = fullyConnected   ({}),
+    l34  = relu             ({}),
+    l35  = fullyConnected   ({}),
+    l36  = relu             ({}),
+    l37  = fullyConnected   ({}),
+    out  = softmax          ({input = "l37"})
+}
+
+ccCodeTranslator(network, {file = "vgg16.c"})
diff --git a/makefile b/makefile
index 7176bfe..25486e4 100644
--- a/makefile
+++ b/makefile
@@ -15,7 +15,7 @@ DFLAG += # -g -fsanitize=address -fno-omit-frame-pointer
 CFLAG += # -std=c89
 CFLAG += -Wall # -Wpedantic
 
-OFLAG += -O3
+OFLAG += -O3 -march=native
 
 # Enable OpenMP
 OFLAG += -DENABLE_OPENMP -fopenmp
@@ -127,7 +127,7 @@ $(OBJS_PATH)/build:
 all: $(APPS)
 
 $(CATCOON_A): $(ALL_O)
-	cd $(OBJS_PATH) && $(AR) $@ $^ && $(MV) $@ ..
+	cd $(OBJS_PATH) && $(AR) $@ $(ALL_O) && $(MV) $@ ..
 
 %.o: ./src/%.c
 	$(CC) -c -o $(OBJS_PATH)/$@ $< $(CFLAG) $(INC)
diff --git a/src/cc_array.c b/src/cc_array.c
index 1b11222..e159e6a 100644
--- a/src/cc_array.c
+++ b/src/cc_array.c
@@ -6,120 +6,42 @@
 #include "cc_dtype.h"
 #include "cc_array.h"
 
-#define ARRAY_SC_OPS(op, oup, arr, elem, arrlen, dtype) \
-	for (i = 0; i < arrlen; ++i) {                                    \
-		*((dtype*)oup + i) = *((dtype*)arr + i) op *(dtype*)elem; \
-	}
-
-#define ARRAY_ELEM_SET(arr, elem, arrlen, dtype) \
-	for (i = 0; i < arrlen; ++i) {              \
-		*((dtype*)arr + i) = *(dtype*)elem; \
-	}
-
-#define ARRAY_ELEM_CLIP(arr, min, max, arrlen, dtype) \
-	for (i = 0; i < arrlen; ++i) {                              \
-		if (min) {                                          \
-			*((dtype*)arr + i) =                        \
-				*((dtype*)arr + i) < *(dtype*)min ? \
-			*(dtype*)min : *((dtype*)arr + i);          \
-		}                                                   \
-		if (max) {                                          \
-			*((dtype*)arr + i) =                        \
-				*((dtype*)arr + i) > *(dtype*)max ? \
-			*(dtype*)max : *((dtype*)arr + i);          \
-		}                                                   \
-	}
-
-#define ARRAY_EW_OPS(op, oup, a, b, arrlen, dtype) \
-	for (i = 0; i < arrlen; ++i) {                   \
-		*((dtype*)oup + i) = *((dtype*)a + i) op \
-		*((dtype*)b + i);                        \
-	}
-
-#define ARRAY_SUM(arr, arrlen, dtype, sum) \
-	*(dtype*)sum = 0;                           \
-	for (i = 0; i < arrlen; ++i) {              \
-		*(dtype*)sum += *((dtype*)arr + i); \
-	}
-
-#define ARRAY_CAST_CASE(_DT, _srcdt, _dstdt) \
-case _DT:                                                          \
-	for (i = 0; i < arrlen; ++i)                               \
-		*((_dstdt*)dst + i) = (_dstdt)*((_srcdt*)src + i); \
-	break;
+#include "global_fn_cfg.h"
+#define EXT_ARRAY_CAST_DEFINITION(dtype) \
+extern fn_array_cast_ ## dtype _array_cast_ ## dtype;
+
+EXT_ARRAY_CAST_DEFINITION  (uint8)
+EXT_ARRAY_CAST_DEFINITION  (uint16)
+EXT_ARRAY_CAST_DEFINITION  (uint32)
+EXT_ARRAY_CAST_DEFINITION  (uint64)
+EXT_ARRAY_CAST_DEFINITION  (int8)
+EXT_ARRAY_CAST_DEFINITION  (int16)
+EXT_ARRAY_CAST_DEFINITION  (int32)
+EXT_ARRAY_CAST_DEFINITION  (int64)
+EXT_ARRAY_CAST_DEFINITION  (float32)
+EXT_ARRAY_CAST_DEFINITION  (float64)
+
+extern fn_array_set    _array_set;
+extern fn_array_clip_by_value _array_clip_by_value;
+
+extern fn_array_add_by _array_add_by;
+extern fn_array_sub_by _array_sub_by;
+extern fn_array_mul_by _array_mul_by;
+extern fn_array_div_by _array_div_by;
+
+extern fn_array_add_ew _array_add_ew;
+extern fn_array_sub_ew _array_sub_ew;
+extern fn_array_mul_ew _array_mul_ew;
+extern fn_array_div_ew _array_div_ew;
+
+extern fn_array_sum  _array_sum;
+extern fn_array_mean _array_mean;
 
 #define CC_ARRAY_CAST_IMPLEMENTATION(dtype) \
-void cc_array_cast_ ## dtype(                                    \
-	void *dst, const void *src, int arrlen, int dt)          \
-{                                                                \
-	cc_int32 i;                                              \
-	switch (dt) {                                            \
-	ARRAY_CAST_CASE(CC_UINT8, cc_uint8, cc_ ## dtype);       \
-	ARRAY_CAST_CASE(CC_UINT16, cc_uint16, cc_ ## dtype);     \
-	ARRAY_CAST_CASE(CC_UINT32, cc_uint32, cc_ ## dtype);     \
-	ARRAY_CAST_CASE(CC_UINT64, cc_uint64, cc_ ## dtype);     \
-	ARRAY_CAST_CASE(CC_INT8, cc_int8, cc_ ## dtype);         \
-	ARRAY_CAST_CASE(CC_INT16, cc_int16, cc_ ## dtype);       \
-	ARRAY_CAST_CASE(CC_INT32, cc_int32, cc_ ## dtype);       \
-	ARRAY_CAST_CASE(CC_INT64, cc_int64, cc_ ## dtype);       \
-	ARRAY_CAST_CASE(CC_FLOAT32, cc_float32, cc_ ## dtype);   \
-	ARRAY_CAST_CASE(CC_FLOAT64, cc_float64, cc_ ## dtype);   \
-	default:                                                 \
-		utlog_format(UTLOG_ERR,                          \
-			"cc_array: unsupported dtype %x\n", dt); \
-		break;                                           \
-	}                                                        \
-}
-
-#define ARRAY_SET_CASE(_DT, _dt) \
-case _DT:                                   \
-	ARRAY_ELEM_SET(arr, x, arrlen, _dt) \
-	break;
-void cc_array_set(void *arr, int arrlen, const void *x, int dt)
-{
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_SET_CASE(CC_UINT8, cc_uint8);
-	ARRAY_SET_CASE(CC_UINT16, cc_uint16);
-	ARRAY_SET_CASE(CC_UINT32, cc_uint32);
-	ARRAY_SET_CASE(CC_UINT64, cc_uint64);
-	ARRAY_SET_CASE(CC_INT8, cc_int8);
-	ARRAY_SET_CASE(CC_INT16, cc_int16);
-	ARRAY_SET_CASE(CC_INT32, cc_int32);
-	ARRAY_SET_CASE(CC_INT64, cc_int64);
-	ARRAY_SET_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_SET_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
-}
-
-#define ARRAY_CLIP_CASE(_DT, _dt) \
-case _DT:                                     \
-	ARRAY_ELEM_CLIP(arr, min, max, arrlen, _dt); \
-	break;
-void cc_array_clip_by_value(
-	void *arr, int arrlen, const void *min, const void *max, int dt)
-{
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_CLIP_CASE(CC_UINT8, cc_uint8);
-	ARRAY_CLIP_CASE(CC_UINT16, cc_uint16);
-	ARRAY_CLIP_CASE(CC_UINT32, cc_uint32);
-	ARRAY_CLIP_CASE(CC_UINT64, cc_uint64);
-	ARRAY_CLIP_CASE(CC_INT8, cc_int8);
-	ARRAY_CLIP_CASE(CC_INT16, cc_int16);
-	ARRAY_CLIP_CASE(CC_INT32, cc_int32);
-	ARRAY_CLIP_CASE(CC_INT64, cc_int64);
-	ARRAY_CLIP_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_CLIP_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+void cc_array_cast_ ## dtype(                           \
+	void *dst, const void *src, int arrlen, int dt) \
+{                                                       \
+	_array_cast_ ## dtype(dst, src, arrlen, dt);    \
 }
 
 CC_ARRAY_CAST_IMPLEMENTATION  (uint8)
@@ -133,263 +55,73 @@ CC_ARRAY_CAST_IMPLEMENTATION  (int64)
 CC_ARRAY_CAST_IMPLEMENTATION  (float32)
 CC_ARRAY_CAST_IMPLEMENTATION  (float64)
 
-#define ARRAY_ADD_BY_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_SC_OPS(+, oup, a, x, arrlen, _dt); \
-	break;
+void cc_array_set(void *arr, int arrlen, const void *x, int dt)
+{
+	_array_set(arr, arrlen, x, dt);
+}
+
+void cc_array_clip_by_value(void *arr,
+	int arrlen, const void *min, const void *max, int dt)
+{
+	_array_clip_by_value(arr, arrlen, min, max, dt);
+}
+
 void cc_array_add_by(void *oup,
 	int arrlen, const void *a, const void *x, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_ADD_BY_CASE(CC_UINT8, cc_uint8);
-	ARRAY_ADD_BY_CASE(CC_UINT16, cc_uint16);
-	ARRAY_ADD_BY_CASE(CC_UINT32, cc_uint32);
-	ARRAY_ADD_BY_CASE(CC_UINT64, cc_uint64);
-	ARRAY_ADD_BY_CASE(CC_INT8, cc_int8);
-	ARRAY_ADD_BY_CASE(CC_INT16, cc_int16);
-	ARRAY_ADD_BY_CASE(CC_INT32, cc_int32);
-	ARRAY_ADD_BY_CASE(CC_INT64, cc_int64);
-	ARRAY_ADD_BY_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_ADD_BY_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_add_by(oup, arrlen, a, x, dt);
 }
 
-#define ARRAY_SUB_BY_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_SC_OPS(-, oup, a, x, arrlen, _dt); \
-	break;
 void cc_array_sub_by(void *oup,
 	int arrlen, const void *a, const void *x, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_SUB_BY_CASE(CC_UINT8, cc_uint8);
-	ARRAY_SUB_BY_CASE(CC_UINT16, cc_uint16);
-	ARRAY_SUB_BY_CASE(CC_UINT32, cc_uint32);
-	ARRAY_SUB_BY_CASE(CC_UINT64, cc_uint64);
-	ARRAY_SUB_BY_CASE(CC_INT8, cc_int8);
-	ARRAY_SUB_BY_CASE(CC_INT16, cc_int16);
-	ARRAY_SUB_BY_CASE(CC_INT32, cc_int32);
-	ARRAY_SUB_BY_CASE(CC_INT64, cc_int64);
-	ARRAY_SUB_BY_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_SUB_BY_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_sub_by(oup, arrlen, a, x, dt);
 }
 
-#define ARRAY_MUL_BY_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_SC_OPS(*, oup, a, x, arrlen, _dt); \
-	break;
 void cc_array_mul_by(void *oup,
 	int arrlen, const void *a, const void *x, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_MUL_BY_CASE(CC_UINT8, cc_uint8);
-	ARRAY_MUL_BY_CASE(CC_UINT16, cc_uint16);
-	ARRAY_MUL_BY_CASE(CC_UINT32, cc_uint32);
-	ARRAY_MUL_BY_CASE(CC_UINT64, cc_uint64);
-	ARRAY_MUL_BY_CASE(CC_INT8, cc_int8);
-	ARRAY_MUL_BY_CASE(CC_INT16, cc_int16);
-	ARRAY_MUL_BY_CASE(CC_INT32, cc_int32);
-	ARRAY_MUL_BY_CASE(CC_INT64, cc_int64);
-	ARRAY_MUL_BY_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_MUL_BY_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_mul_by(oup, arrlen, a, x, dt);
 }
 
-#define ARRAY_DIV_BY_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_SC_OPS(/, oup, a, x, arrlen, _dt); \
-	break;
 void cc_array_div_by(void *oup,
 	int arrlen, const void *a, const void *x, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_DIV_BY_CASE(CC_UINT8, cc_uint8);
-	ARRAY_DIV_BY_CASE(CC_UINT16, cc_uint16);
-	ARRAY_DIV_BY_CASE(CC_UINT32, cc_uint32);
-	ARRAY_DIV_BY_CASE(CC_UINT64, cc_uint64);
-	ARRAY_DIV_BY_CASE(CC_INT8, cc_int8);
-	ARRAY_DIV_BY_CASE(CC_INT16, cc_int16);
-	ARRAY_DIV_BY_CASE(CC_INT32, cc_int32);
-	ARRAY_DIV_BY_CASE(CC_INT64, cc_int64);
-	ARRAY_DIV_BY_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_DIV_BY_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_div_by(oup, arrlen, a, x, dt);
 }
 
-#define ARRAY_ADD_EW_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_EW_OPS(+, oup, a, b, arrlen, _dt); \
-	break;
 void cc_array_add_ew(void *oup,
 	int arrlen, const void *a, const void *b, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_ADD_EW_CASE(CC_UINT8, cc_uint8);
-	ARRAY_ADD_EW_CASE(CC_UINT16, cc_uint16);
-	ARRAY_ADD_EW_CASE(CC_UINT32, cc_uint32);
-	ARRAY_ADD_EW_CASE(CC_UINT64, cc_uint64);
-	ARRAY_ADD_EW_CASE(CC_INT8, cc_int8);
-	ARRAY_ADD_EW_CASE(CC_INT16, cc_int16);
-	ARRAY_ADD_EW_CASE(CC_INT32, cc_int32);
-	ARRAY_ADD_EW_CASE(CC_INT64, cc_int64);
-	ARRAY_ADD_EW_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_ADD_EW_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_add_ew(oup, arrlen, a, b, dt);
 }
 
-#define ARRAY_SUB_EW_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_EW_OPS(-, oup, a, b, arrlen, _dt); \
-	break;
 void cc_array_sub_ew(void *oup,
 	int arrlen, const void *a, const void *b, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_SUB_EW_CASE(CC_UINT8, cc_uint8);
-	ARRAY_SUB_EW_CASE(CC_UINT16, cc_uint16);
-	ARRAY_SUB_EW_CASE(CC_UINT32, cc_uint32);
-	ARRAY_SUB_EW_CASE(CC_UINT64, cc_uint64);
-	ARRAY_SUB_EW_CASE(CC_INT8, cc_int8);
-	ARRAY_SUB_EW_CASE(CC_INT16, cc_int16);
-	ARRAY_SUB_EW_CASE(CC_INT32, cc_int32);
-	ARRAY_SUB_EW_CASE(CC_INT64, cc_int64);
-	ARRAY_SUB_EW_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_SUB_EW_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_sub_ew(oup, arrlen, a, b, dt);
 }
 
-#define ARRAY_MUL_EW_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_EW_OPS(*, oup, a, b, arrlen, _dt); \
-	break;
 void cc_array_mul_ew(void *oup,
 	int arrlen, const void *a, const void *b, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_MUL_EW_CASE(CC_UINT8, cc_uint8);
-	ARRAY_MUL_EW_CASE(CC_UINT16, cc_uint16);
-	ARRAY_MUL_EW_CASE(CC_UINT32, cc_uint32);
-	ARRAY_MUL_EW_CASE(CC_UINT64, cc_uint64);
-	ARRAY_MUL_EW_CASE(CC_INT8, cc_int8);
-	ARRAY_MUL_EW_CASE(CC_INT16, cc_int16);
-	ARRAY_MUL_EW_CASE(CC_INT32, cc_int32);
-	ARRAY_MUL_EW_CASE(CC_INT64, cc_int64);
-	ARRAY_MUL_EW_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_MUL_EW_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_mul_ew(oup, arrlen, a, b, dt);
 }
 
-#define ARRAY_DIV_EW_CASE(_DT, _dt) \
-case _DT:                                        \
-	ARRAY_EW_OPS(/, oup, a, b, arrlen, _dt); \
-	break;
 void cc_array_div_ew(void *oup,
 	int arrlen, const void *a, const void *b, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_DIV_EW_CASE(CC_UINT8, cc_uint8);
-	ARRAY_DIV_EW_CASE(CC_UINT16, cc_uint16);
-	ARRAY_DIV_EW_CASE(CC_UINT32, cc_uint32);
-	ARRAY_DIV_EW_CASE(CC_UINT64, cc_uint64);
-	ARRAY_DIV_EW_CASE(CC_INT8, cc_int8);
-	ARRAY_DIV_EW_CASE(CC_INT16, cc_int16);
-	ARRAY_DIV_EW_CASE(CC_INT32, cc_int32);
-	ARRAY_DIV_EW_CASE(CC_INT64, cc_int64);
-	ARRAY_DIV_EW_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_DIV_EW_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_div_ew(oup, arrlen, a, b, dt);
 }
 
-#define ARRAY_SUM_CASE(_DT, _dt) \
-case _DT:                               \
-	ARRAY_SUM(arr, arrlen, _dt, x); \
-	break;
-void cc_array_sum(const void *arr, int arrlen, void *x, int dt)
+void cc_array_sum (const void *arr, int arrlen, void *x, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_SUM_CASE(CC_UINT8, cc_uint8);
-	ARRAY_SUM_CASE(CC_UINT16, cc_uint16);
-	ARRAY_SUM_CASE(CC_UINT32, cc_uint32);
-	ARRAY_SUM_CASE(CC_UINT64, cc_uint64);
-	ARRAY_SUM_CASE(CC_INT8, cc_int8);
-	ARRAY_SUM_CASE(CC_INT16, cc_int16);
-	ARRAY_SUM_CASE(CC_INT32, cc_int32);
-	ARRAY_SUM_CASE(CC_INT64, cc_int64);
-	ARRAY_SUM_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_SUM_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_sum(arr, arrlen, x, dt);
 }
 
-#define ARRAY_MEAN_CASE(_DT, _dt) \
-case _DT:                               \
-	ARRAY_SUM(arr, arrlen, _dt, x); \
-	*(_dt*)x /= arrlen;             \
-	break;
 void cc_array_mean(const void *arr, int arrlen, void *x, int dt)
 {
-	cc_int32 i;
-	switch (dt) {
-	ARRAY_MEAN_CASE(CC_UINT8, cc_uint8);
-	ARRAY_MEAN_CASE(CC_UINT16, cc_uint16);
-	ARRAY_MEAN_CASE(CC_UINT32, cc_uint32);
-	ARRAY_MEAN_CASE(CC_UINT64, cc_uint64);
-	ARRAY_MEAN_CASE(CC_INT8, cc_int8);
-	ARRAY_MEAN_CASE(CC_INT16, cc_int16);
-	ARRAY_MEAN_CASE(CC_INT32, cc_int32);
-	ARRAY_MEAN_CASE(CC_INT64, cc_int64);
-	ARRAY_MEAN_CASE(CC_FLOAT32, cc_float32);
-	ARRAY_MEAN_CASE(CC_FLOAT64, cc_float64);
-	default:
-		utlog_format(UTLOG_ERR,
-			"cc_array: unsupported dtype %x\n", dt);
-		break;
-	}
+	_array_mean(arr, arrlen, x, dt);
 }
 
 #define PRINT_ARRAY_CASE(_DT, _dt) \
diff --git a/src/cc_array.h b/src/cc_array.h
index 11e4119..288e134 100644
--- a/src/cc_array.h
+++ b/src/cc_array.h
@@ -10,7 +10,7 @@
  */
 
 #define CC_ARRAY_CAST_DEFINITION(dtype) \
-void cc_array_cast_ ## dtype(                            \
+void cc_array_cast_ ## dtype(                           \
 	void *dst, const void *src, int arrlen, int dt);
 
 CC_ARRAY_CAST_DEFINITION  (uint8)
diff --git a/src/cc_basic.c b/src/cc_basic.c
index ff687ec..dc6bf07 100644
--- a/src/cc_basic.c
+++ b/src/cc_basic.c
@@ -11,34 +11,6 @@
 #include "util_log.h"
 #include "cc_basic.h"
 
-#include "global_fn_cfg.h"
-#define EXT_ARRAY_CAST_DEFINITION(dtype) \
-extern fn_array_cast_ ## dtype _array_cast_ ## dtype;
-
-EXT_ARRAY_CAST_DEFINITION  (uint8)
-EXT_ARRAY_CAST_DEFINITION  (uint16)
-EXT_ARRAY_CAST_DEFINITION  (uint32)
-EXT_ARRAY_CAST_DEFINITION  (uint64)
-EXT_ARRAY_CAST_DEFINITION  (int8)
-EXT_ARRAY_CAST_DEFINITION  (int16)
-EXT_ARRAY_CAST_DEFINITION  (int32)
-EXT_ARRAY_CAST_DEFINITION  (int64)
-EXT_ARRAY_CAST_DEFINITION  (float32)
-EXT_ARRAY_CAST_DEFINITION  (float64)
-
-extern fn_array_set _array_set;
-extern fn_array_clip_by_value _array_clip_by_value;
-
-extern fn_array_add_by _array_add_by;
-extern fn_array_sub_by _array_sub_by;
-extern fn_array_mul_by _array_mul_by;
-extern fn_array_div_by _array_div_by;
-
-extern fn_array_add_ew _array_add_ew;
-extern fn_array_sub_ew _array_sub_ew;
-extern fn_array_mul_ew _array_mul_ew;
-extern fn_array_div_ew _array_div_ew;
-
 static cc_int32 _calc_elems(const cc_int32 *shape)
 {
 	cc_int32 elems;
@@ -300,7 +272,7 @@ void cc_print(const cc_tensor_t *tensor)
 
 void cc_set_value(cc_tensor_t *tensor, void *v)
 {
-	_array_set(tensor->data, 
+	cc_array_set(tensor->data, 
 		cc_elements(tensor), v, *tensor->dtype);
 }
 
@@ -316,7 +288,7 @@ cc_tensor_t *cc_clip_by_value(cc_tensor_t *tensor,
 		yield = tensor;
 	else
 		yield = cc_copy(tensor, name);
-	_array_clip_by_value(tensor->data,
+	cc_array_clip_by_value(tensor->data,
 		cc_elements(tensor), min, max, *tensor->dtype);
 	return yield;
 }
@@ -335,43 +307,43 @@ cc_tensor_t *cc_cast(cc_tensor_t *tensor,
 	cc_assert_ptr(cast = cc_create(tensor->shape, dtype, NULL));
 	switch (dtype) {
 	case CC_INT8:
-		_array_cast_int8(cast->data,
+		cc_array_cast_int8(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_UINT8:
-		_array_cast_uint8(cast->data,
+		cc_array_cast_uint8(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_INT16:
-		_array_cast_int16(cast->data,
+		cc_array_cast_int16(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_UINT16:
-		_array_cast_uint16(cast->data,
+		cc_array_cast_uint16(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_INT32:
-		_array_cast_int32(cast->data,
+		cc_array_cast_int32(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_UINT32:
-		_array_cast_uint32(cast->data,
+		cc_array_cast_uint32(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_INT64:
-		_array_cast_int64(cast->data,
+		cc_array_cast_int64(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_UINT64:
-		_array_cast_uint64(cast->data,
+		cc_array_cast_uint64(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_FLOAT32:
-		_array_cast_float32(cast->data,
+		cc_array_cast_float32(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	case CC_FLOAT64:
-		_array_cast_float64(cast->data,
+		cc_array_cast_float64(cast->data,
 			tensor->data, elems, *tensor->dtype);
 		break;
 	default:
@@ -411,19 +383,19 @@ cc_tensor_t *cc_scalar(cc_tensor_t *tensor,
 		yield = cc_copy(tensor, name);
 	switch (op) {
 	case '+':
-		_array_add_by(yield->data, elems,
+		cc_array_add_by(yield->data, elems,
 			yield->data, data, *tensor->dtype);
 		break;
 	case '-':
-		_array_sub_by(yield->data, elems,
+		cc_array_sub_by(yield->data, elems,
 			yield->data, data, *tensor->dtype);
 		break;
 	case '*':
-		_array_mul_by(yield->data, elems,
+		cc_array_mul_by(yield->data, elems,
 			yield->data, data, *tensor->dtype);
 		break;
 	case '/':
-		_array_div_by(yield->data, elems,
+		cc_array_div_by(yield->data, elems,
 			yield->data, data, *tensor->dtype);
 		break;
 	default:
@@ -458,19 +430,19 @@ cc_tensor_t *cc_elemwise(cc_tensor_t *a,
 		yield = cc_copy(a, name);
 	switch (op) {
 	case '+':
-		_array_add_ew(yield->data, elems,
+		cc_array_add_ew(yield->data, elems,
 			yield->data, b->data, *yield->dtype);
 		break;
 	case '-':
-		_array_sub_ew(yield->data, elems,
+		cc_array_sub_ew(yield->data, elems,
 			yield->data, b->data, *yield->dtype);
 		break;
 	case '*':
-		_array_mul_ew(yield->data, elems,
+		cc_array_mul_ew(yield->data, elems,
 			yield->data, b->data, *yield->dtype);
 		break;
 	case '/':
-		_array_div_ew(yield->data, elems,
+		cc_array_div_ew(yield->data, elems,
 			yield->data, b->data, *yield->dtype);
 		break;
 	default:
diff --git a/src/cc_conv2d.c b/src/cc_conv2d.c
index ba1ba11..ce027c6 100644
--- a/src/cc_conv2d.c
+++ b/src/cc_conv2d.c
@@ -7,6 +7,7 @@
 #endif
 
 #include "cc_assert.h"
+#include "cc_array.h"
 #include "cc_basic.h"
 #include "cc_fmap2d.h"
 #include "cc_pad2d.h"
@@ -17,8 +18,6 @@
 
 #include "global_fn_cfg.h"
 extern fn_conv2d       _conv2d;
-extern fn_array_add_ew _array_add_ew;
-extern fn_array_mul_by _array_mul_by;
 
 cc_int32 cc_conv2d_shape_calc(
 	cc_int32 i, cc_int32 k, cc_int32 s, cc_int32 p)
@@ -101,7 +100,7 @@ cc_tensor_t *cc_conv2d(const cc_tensor_t *inp,
 				k_ch_mem_size * j),
 				kernel->shape[CC_CONV2D_KERNEL_W],
 			*kernel->dtype);
-		_array_add_ew(oup->data + o_ch_mem_size * i,
+		cc_array_add_ew(oup->data + o_ch_mem_size * i,
 			o_ch_size, oup->data + o_ch_mem_size * i,
 			omp_out_buf + omp_get_thread_num() * o_ch_mem_size,
 		 *oup->dtype);
@@ -115,7 +114,7 @@ cc_tensor_t *cc_conv2d(const cc_tensor_t *inp,
 				k_ch_mem_size * j),
 				kernel->shape[CC_CONV2D_KERNEL_W],
 			*kernel->dtype);
-		_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size,
+		cc_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size,
 				oup->data + o_ch_mem_size * i, omp_out_buf,
 			*oup->dtype);
 #endif
@@ -264,21 +263,21 @@ cc_tensor_t *cc_pw_conv2d(cc_tensor_t *inp, const cc_tensor_t *kernel,
 		for (j = 0; j < kernel->shape[CC_CONV2D_KERNEL_I]; ++j)
 		{
 #ifdef ENABLE_OPENMP
-		_array_mul_by(
+		cc_array_mul_by(
 			omp_out_buf + omp_get_thread_num() * o_ch_mem_size,
 			o_ch_size, inp->data + o_ch_mem_size * j,
 			kernel->data + k_mem_size * i + k_ch_mem_size * j,
 			*oup->dtype);
-		_array_add_ew(oup->data + o_ch_mem_size * i,
+		cc_array_add_ew(oup->data + o_ch_mem_size * i,
 			o_ch_size, oup->data + o_ch_mem_size * i,
 			omp_out_buf + omp_get_thread_num() * o_ch_mem_size,
 		 *oup->dtype);
 #else
-		_array_mul_by(omp_out_buf, o_ch_size,
+		cc_array_mul_by(omp_out_buf, o_ch_size,
 			inp->data + o_ch_mem_size * j,
 			kernel->data + k_mem_size * i + k_ch_mem_size * j,
 			*oup->dtype);
-		_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size,
+		cc_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size,
 				oup->data + o_ch_mem_size * i, omp_out_buf,
 			*oup->dtype);
 #endif
diff --git a/src/cc_cpufn.c b/src/cc_cpufn.c
index 55ab0da..b9d5ac4 100644
--- a/src/cc_cpufn.c
+++ b/src/cc_cpufn.c
@@ -321,21 +321,19 @@ static void cc_cpu_conv2d_ ## dt (cc_ ## dt *inp, cc_ ## dt *oup, \
 	cc_int32 x, cc_int32 y, cc_int32 oup_x, cc_int32 oup_y,   \
 	cc_int32 sx, cc_int32 sy, cc_ ## dt *filter, cc_int32 fw) \
 {                                                                 \
-  cc_int32 i, j, k, l, oup_i, oup_j;                              \
+  cc_int32 i, j, k, l;                                            \
   cc_int32 half_fl = fw >> 1;                                     \
   cc_ ## dt sum;                                                  \
-    for (i = half_fl; i < y - half_fl; i += sy) {                 \
-      for (j = half_fl; j < x - half_fl; j += sx) {               \
-        sum = 0;                                                  \
-        for (k = -half_fl; k <= half_fl; ++k) {                   \
-          for (l = -half_fl; l <= half_fl; ++l) {                 \
-            sum += *(inp + (i + k) * x + (j + l)) *               \
-                *(filter + (k + half_fl) * fw + (l + half_fl));   \
+  for (i = half_fl; i < y - half_fl; i += sy) {                   \
+    for (j = half_fl; j < x - half_fl; j += sx) {                 \
+      sum = 0;                                                    \
+      for (k = -half_fl; k <= half_fl; ++k) {                     \
+        for (l = -half_fl; l <= half_fl; ++l) {                   \
+          sum += *(inp + (i + k) * x + (j + l)) *                 \
+              *(filter + (k + half_fl) * fw + (l + half_fl));     \
         }                                                         \
       }                                                           \
-      oup_i = ((i - half_fl) / sy);                               \
-      oup_j = ((j - half_fl) / sx);                               \
-      *(oup + oup_i * oup_x + oup_j) = sum;                       \
+      *oup++ = sum;                                               \
     }                                                             \
   }                                                               \
 }
@@ -555,3 +553,389 @@ void cc_cpu_batch_norm(void *inp,
 			"cc_cpufn: unsupported dtype %x\n", dt);
 	}
 }
+
+#define ARRAY_SC_OPS(op, oup, arr, elem, arrlen, dtype) \
+	for (i = 0; i < arrlen; ++i) {                                    \
+		*((dtype*)oup + i) = *((dtype*)arr + i) op *(dtype*)elem; \
+	}
+
+#define ARRAY_ELEM_SET(arr, elem, arrlen, dtype) \
+	for (i = 0; i < arrlen; ++i) {              \
+		*((dtype*)arr + i) = *(dtype*)elem; \
+	}
+
+#define ARRAY_ELEM_CLIP(arr, min, max, arrlen, dtype) \
+	for (i = 0; i < arrlen; ++i) {                              \
+		if (min) {                                          \
+			*((dtype*)arr + i) =                        \
+				*((dtype*)arr + i) < *(dtype*)min ? \
+			*(dtype*)min : *((dtype*)arr + i);          \
+		}                                                   \
+		if (max) {                                          \
+			*((dtype*)arr + i) =                        \
+				*((dtype*)arr + i) > *(dtype*)max ? \
+			*(dtype*)max : *((dtype*)arr + i);          \
+		}                                                   \
+	}
+
+#define ARRAY_EW_OPS(op, oup, a, b, arrlen, dtype) \
+	for (i = 0; i < arrlen; ++i) {                   \
+		*((dtype*)oup + i) = *((dtype*)a + i) op \
+		*((dtype*)b + i);                        \
+	}
+
+#define ARRAY_SUM(arr, arrlen, dtype, sum) \
+	*(dtype*)sum = 0;                           \
+	for (i = 0; i < arrlen; ++i) {              \
+		*(dtype*)sum += *((dtype*)arr + i); \
+	}
+
+#define ARRAY_CAST_CASE(_DT, _srcdt, _dstdt) \
+case _DT:                                                          \
+	for (i = 0; i < arrlen; ++i)                               \
+		*((_dstdt*)dst + i) = (_dstdt)*((_srcdt*)src + i); \
+	break;
+
+#define CC_CPU_ARRAY_CAST_IMPLEMENTATION(dtype) \
+void cc_cpu_array_cast_ ## dtype(                                \
+	void *dst, const void *src, int arrlen, int dt)          \
+{                                                                \
+	cc_int32 i;                                              \
+	switch (dt) {                                            \
+	ARRAY_CAST_CASE(CC_UINT8, cc_uint8, cc_ ## dtype);       \
+	ARRAY_CAST_CASE(CC_UINT16, cc_uint16, cc_ ## dtype);     \
+	ARRAY_CAST_CASE(CC_UINT32, cc_uint32, cc_ ## dtype);     \
+	ARRAY_CAST_CASE(CC_UINT64, cc_uint64, cc_ ## dtype);     \
+	ARRAY_CAST_CASE(CC_INT8, cc_int8, cc_ ## dtype);         \
+	ARRAY_CAST_CASE(CC_INT16, cc_int16, cc_ ## dtype);       \
+	ARRAY_CAST_CASE(CC_INT32, cc_int32, cc_ ## dtype);       \
+	ARRAY_CAST_CASE(CC_INT64, cc_int64, cc_ ## dtype);       \
+	ARRAY_CAST_CASE(CC_FLOAT32, cc_float32, cc_ ## dtype);   \
+	ARRAY_CAST_CASE(CC_FLOAT64, cc_float64, cc_ ## dtype);   \
+	default:                                                 \
+		utlog_format(UTLOG_ERR,                          \
+			"cc_array: unsupported dtype %x\n", dt); \
+		break;                                           \
+	}                                                        \
+}
+
+#define ARRAY_SET_CASE(_DT, _dt) \
+case _DT:                                   \
+	ARRAY_ELEM_SET(arr, x, arrlen, _dt) \
+	break;
+void cc_cpu_array_set(void *arr, int arrlen, const void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_SET_CASE(CC_UINT8, cc_uint8);
+	ARRAY_SET_CASE(CC_UINT16, cc_uint16);
+	ARRAY_SET_CASE(CC_UINT32, cc_uint32);
+	ARRAY_SET_CASE(CC_UINT64, cc_uint64);
+	ARRAY_SET_CASE(CC_INT8, cc_int8);
+	ARRAY_SET_CASE(CC_INT16, cc_int16);
+	ARRAY_SET_CASE(CC_INT32, cc_int32);
+	ARRAY_SET_CASE(CC_INT64, cc_int64);
+	ARRAY_SET_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_SET_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_CLIP_CASE(_DT, _dt) \
+case _DT:                                     \
+	ARRAY_ELEM_CLIP(arr, min, max, arrlen, _dt); \
+	break;
+void cc_cpu_array_clip_by_value(
+	void *arr, int arrlen, const void *min, const void *max, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_CLIP_CASE(CC_UINT8, cc_uint8);
+	ARRAY_CLIP_CASE(CC_UINT16, cc_uint16);
+	ARRAY_CLIP_CASE(CC_UINT32, cc_uint32);
+	ARRAY_CLIP_CASE(CC_UINT64, cc_uint64);
+	ARRAY_CLIP_CASE(CC_INT8, cc_int8);
+	ARRAY_CLIP_CASE(CC_INT16, cc_int16);
+	ARRAY_CLIP_CASE(CC_INT32, cc_int32);
+	ARRAY_CLIP_CASE(CC_INT64, cc_int64);
+	ARRAY_CLIP_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_CLIP_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (uint8)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (uint16)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (uint32)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (uint64)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (int8)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (int16)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (int32)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (int64)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (float32)
+CC_CPU_ARRAY_CAST_IMPLEMENTATION  (float64)
+
+#define ARRAY_ADD_BY_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_SC_OPS(+, oup, a, x, arrlen, _dt); \
+	break;
+void cc_cpu_array_add_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_ADD_BY_CASE(CC_UINT8, cc_uint8);
+	ARRAY_ADD_BY_CASE(CC_UINT16, cc_uint16);
+	ARRAY_ADD_BY_CASE(CC_UINT32, cc_uint32);
+	ARRAY_ADD_BY_CASE(CC_UINT64, cc_uint64);
+	ARRAY_ADD_BY_CASE(CC_INT8, cc_int8);
+	ARRAY_ADD_BY_CASE(CC_INT16, cc_int16);
+	ARRAY_ADD_BY_CASE(CC_INT32, cc_int32);
+	ARRAY_ADD_BY_CASE(CC_INT64, cc_int64);
+	ARRAY_ADD_BY_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_ADD_BY_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_SUB_BY_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_SC_OPS(-, oup, a, x, arrlen, _dt); \
+	break;
+void cc_cpu_array_sub_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_SUB_BY_CASE(CC_UINT8, cc_uint8);
+	ARRAY_SUB_BY_CASE(CC_UINT16, cc_uint16);
+	ARRAY_SUB_BY_CASE(CC_UINT32, cc_uint32);
+	ARRAY_SUB_BY_CASE(CC_UINT64, cc_uint64);
+	ARRAY_SUB_BY_CASE(CC_INT8, cc_int8);
+	ARRAY_SUB_BY_CASE(CC_INT16, cc_int16);
+	ARRAY_SUB_BY_CASE(CC_INT32, cc_int32);
+	ARRAY_SUB_BY_CASE(CC_INT64, cc_int64);
+	ARRAY_SUB_BY_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_SUB_BY_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_MUL_BY_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_SC_OPS(*, oup, a, x, arrlen, _dt); \
+	break;
+void cc_cpu_array_mul_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_MUL_BY_CASE(CC_UINT8, cc_uint8);
+	ARRAY_MUL_BY_CASE(CC_UINT16, cc_uint16);
+	ARRAY_MUL_BY_CASE(CC_UINT32, cc_uint32);
+	ARRAY_MUL_BY_CASE(CC_UINT64, cc_uint64);
+	ARRAY_MUL_BY_CASE(CC_INT8, cc_int8);
+	ARRAY_MUL_BY_CASE(CC_INT16, cc_int16);
+	ARRAY_MUL_BY_CASE(CC_INT32, cc_int32);
+	ARRAY_MUL_BY_CASE(CC_INT64, cc_int64);
+	ARRAY_MUL_BY_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_MUL_BY_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_DIV_BY_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_SC_OPS(/, oup, a, x, arrlen, _dt); \
+	break;
+void cc_cpu_array_div_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_DIV_BY_CASE(CC_UINT8, cc_uint8);
+	ARRAY_DIV_BY_CASE(CC_UINT16, cc_uint16);
+	ARRAY_DIV_BY_CASE(CC_UINT32, cc_uint32);
+	ARRAY_DIV_BY_CASE(CC_UINT64, cc_uint64);
+	ARRAY_DIV_BY_CASE(CC_INT8, cc_int8);
+	ARRAY_DIV_BY_CASE(CC_INT16, cc_int16);
+	ARRAY_DIV_BY_CASE(CC_INT32, cc_int32);
+	ARRAY_DIV_BY_CASE(CC_INT64, cc_int64);
+	ARRAY_DIV_BY_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_DIV_BY_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_ADD_EW_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_EW_OPS(+, oup, a, b, arrlen, _dt); \
+	break;
+void cc_cpu_array_add_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_ADD_EW_CASE(CC_UINT8, cc_uint8);
+	ARRAY_ADD_EW_CASE(CC_UINT16, cc_uint16);
+	ARRAY_ADD_EW_CASE(CC_UINT32, cc_uint32);
+	ARRAY_ADD_EW_CASE(CC_UINT64, cc_uint64);
+	ARRAY_ADD_EW_CASE(CC_INT8, cc_int8);
+	ARRAY_ADD_EW_CASE(CC_INT16, cc_int16);
+	ARRAY_ADD_EW_CASE(CC_INT32, cc_int32);
+	ARRAY_ADD_EW_CASE(CC_INT64, cc_int64);
+	ARRAY_ADD_EW_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_ADD_EW_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_SUB_EW_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_EW_OPS(-, oup, a, b, arrlen, _dt); \
+	break;
+void cc_cpu_array_sub_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_SUB_EW_CASE(CC_UINT8, cc_uint8);
+	ARRAY_SUB_EW_CASE(CC_UINT16, cc_uint16);
+	ARRAY_SUB_EW_CASE(CC_UINT32, cc_uint32);
+	ARRAY_SUB_EW_CASE(CC_UINT64, cc_uint64);
+	ARRAY_SUB_EW_CASE(CC_INT8, cc_int8);
+	ARRAY_SUB_EW_CASE(CC_INT16, cc_int16);
+	ARRAY_SUB_EW_CASE(CC_INT32, cc_int32);
+	ARRAY_SUB_EW_CASE(CC_INT64, cc_int64);
+	ARRAY_SUB_EW_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_SUB_EW_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_MUL_EW_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_EW_OPS(*, oup, a, b, arrlen, _dt); \
+	break;
+void cc_cpu_array_mul_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_MUL_EW_CASE(CC_UINT8, cc_uint8);
+	ARRAY_MUL_EW_CASE(CC_UINT16, cc_uint16);
+	ARRAY_MUL_EW_CASE(CC_UINT32, cc_uint32);
+	ARRAY_MUL_EW_CASE(CC_UINT64, cc_uint64);
+	ARRAY_MUL_EW_CASE(CC_INT8, cc_int8);
+	ARRAY_MUL_EW_CASE(CC_INT16, cc_int16);
+	ARRAY_MUL_EW_CASE(CC_INT32, cc_int32);
+	ARRAY_MUL_EW_CASE(CC_INT64, cc_int64);
+	ARRAY_MUL_EW_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_MUL_EW_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_DIV_EW_CASE(_DT, _dt) \
+case _DT:                                        \
+	ARRAY_EW_OPS(/, oup, a, b, arrlen, _dt); \
+	break;
+void cc_cpu_array_div_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_DIV_EW_CASE(CC_UINT8, cc_uint8);
+	ARRAY_DIV_EW_CASE(CC_UINT16, cc_uint16);
+	ARRAY_DIV_EW_CASE(CC_UINT32, cc_uint32);
+	ARRAY_DIV_EW_CASE(CC_UINT64, cc_uint64);
+	ARRAY_DIV_EW_CASE(CC_INT8, cc_int8);
+	ARRAY_DIV_EW_CASE(CC_INT16, cc_int16);
+	ARRAY_DIV_EW_CASE(CC_INT32, cc_int32);
+	ARRAY_DIV_EW_CASE(CC_INT64, cc_int64);
+	ARRAY_DIV_EW_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_DIV_EW_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_SUM_CASE(_DT, _dt) \
+case _DT:                               \
+	ARRAY_SUM(arr, arrlen, _dt, x); \
+	break;
+void cc_cpu_array_sum(const void *arr, int arrlen, void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_SUM_CASE(CC_UINT8, cc_uint8);
+	ARRAY_SUM_CASE(CC_UINT16, cc_uint16);
+	ARRAY_SUM_CASE(CC_UINT32, cc_uint32);
+	ARRAY_SUM_CASE(CC_UINT64, cc_uint64);
+	ARRAY_SUM_CASE(CC_INT8, cc_int8);
+	ARRAY_SUM_CASE(CC_INT16, cc_int16);
+	ARRAY_SUM_CASE(CC_INT32, cc_int32);
+	ARRAY_SUM_CASE(CC_INT64, cc_int64);
+	ARRAY_SUM_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_SUM_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
+
+#define ARRAY_MEAN_CASE(_DT, _dt) \
+case _DT:                               \
+	ARRAY_SUM(arr, arrlen, _dt, x); \
+	*(_dt*)x /= arrlen;             \
+	break;
+void cc_cpu_array_mean(const void *arr, int arrlen, void *x, int dt)
+{
+	cc_int32 i;
+	switch (dt) {
+	ARRAY_MEAN_CASE(CC_UINT8, cc_uint8);
+	ARRAY_MEAN_CASE(CC_UINT16, cc_uint16);
+	ARRAY_MEAN_CASE(CC_UINT32, cc_uint32);
+	ARRAY_MEAN_CASE(CC_UINT64, cc_uint64);
+	ARRAY_MEAN_CASE(CC_INT8, cc_int8);
+	ARRAY_MEAN_CASE(CC_INT16, cc_int16);
+	ARRAY_MEAN_CASE(CC_INT32, cc_int32);
+	ARRAY_MEAN_CASE(CC_INT64, cc_int64);
+	ARRAY_MEAN_CASE(CC_FLOAT32, cc_float32);
+	ARRAY_MEAN_CASE(CC_FLOAT64, cc_float64);
+	default:
+		utlog_format(UTLOG_ERR,
+			"cc_array: unsupported dtype %x\n", dt);
+		break;
+	}
+}
diff --git a/src/cc_cpufn.h b/src/cc_cpufn.h
index 981afa8..fbf9360 100644
--- a/src/cc_cpufn.h
+++ b/src/cc_cpufn.h
@@ -30,6 +30,51 @@ void cc_cpu_fully_connected(const void *inp,
 void cc_cpu_batch_norm(void *inp,
 	cc_int32 len, const void *bnpara, cc_dtype dt);
 
+/*
+ * int <---> cc_int32
+ */
+
+#define CC_CPU_ARRAY_CAST_DEFINITION(dtype) \
+void cc_cpu_array_cast_ ## dtype(                        \
+	void *dst, const void *src, int arrlen, int dt);
+
+CC_CPU_ARRAY_CAST_DEFINITION  (uint8)
+CC_CPU_ARRAY_CAST_DEFINITION  (uint16)
+CC_CPU_ARRAY_CAST_DEFINITION  (uint32)
+CC_CPU_ARRAY_CAST_DEFINITION  (uint64)
+CC_CPU_ARRAY_CAST_DEFINITION  (int8)
+CC_CPU_ARRAY_CAST_DEFINITION  (int16)
+CC_CPU_ARRAY_CAST_DEFINITION  (int32)
+CC_CPU_ARRAY_CAST_DEFINITION  (int64)
+CC_CPU_ARRAY_CAST_DEFINITION  (float32)
+CC_CPU_ARRAY_CAST_DEFINITION  (float64)
+
+void cc_cpu_array_set(void *arr, int arrlen, const void *x, int dt);
+
+void cc_cpu_array_clip_by_value(void *arr,
+	int arrlen, const void *min, const void *max, int dt);
+
+void cc_cpu_array_add_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt);
+void cc_cpu_array_sub_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt);
+void cc_cpu_array_mul_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt);
+void cc_cpu_array_div_by(void *oup,
+	int arrlen, const void *a, const void *x, int dt);
+
+void cc_cpu_array_add_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt);
+void cc_cpu_array_sub_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt);
+void cc_cpu_array_mul_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt);
+void cc_cpu_array_div_ew(void *oup,
+	int arrlen, const void *a, const void *b, int dt);
+
+void cc_cpu_array_sum (const void *arr, int arrlen, void *x, int dt);
+void cc_cpu_array_mean(const void *arr, int arrlen, void *x, int dt);
+
 #ifdef __cplusplus
 	}
 #endif
diff --git a/src/cc_fmap2d.c b/src/cc_fmap2d.c
index d0f616f..992df32 100644
--- a/src/cc_fmap2d.c
+++ b/src/cc_fmap2d.c
@@ -6,12 +6,12 @@
 #endif
 
 #include "cc_assert.h"
+#include "cc_array.h"
 #include "cc_basic.h"
 #include "cc_fmap2d.h"
 #include "cc_tsrmgr.h"
 
 #include "global_fn_cfg.h"
-extern fn_array_add_by _array_add_by;
 
 cc_tensor_t *cc_fmap2d_bias(cc_tensor_t *inp,
 	const cc_tensor_t *bias, const char *name)
@@ -37,7 +37,7 @@ cc_tensor_t *cc_fmap2d_bias(cc_tensor_t *inp,
 	#pragma omp parallel for private(i)
 #endif
 	for (i = 0; i < bias->shape[CC_CNN2D_SHAPE_C]; ++i) {
-		_array_add_by(fmap->data + ch_mem_size * i,
+		cc_array_add_by(fmap->data + ch_mem_size * i,
 			ch_size, fmap->data + ch_mem_size * i,
 			bias->data + dt_size * i, *fmap->dtype);
 	}
diff --git a/src/cc_pad2d.c b/src/cc_pad2d.c
index e1df139..b8df403 100644
--- a/src/cc_pad2d.c
+++ b/src/cc_pad2d.c
@@ -41,6 +41,9 @@ cc_tensor_t *cc_pad2d(const cc_tensor_t *inp,
 			pad->shape[CC_CNN2D_SHAPE_H];
 	p_ch_mem_size  = p_ch_size * dtsize;
 	p_row_mem_size = pad->shape[CC_CNN2D_SHAPE_W] * dtsize;
+#ifdef ENABLE_OPENMP
+#pragma omp parallel for private(c, i, j)
+#endif
 	for (c = 0; c < inp->shape[CC_CNN2D_SHAPE_C]; ++c) {
 		for (i = 0; i < inp->shape[CC_CNN2D_SHAPE_H]; ++i) {
 			for (j = 0; j < inp->shape[CC_CNN2D_SHAPE_W]; ++j)
diff --git a/src/global_fn_cfg.c b/src/global_fn_cfg.c
index 92cc762..e72b6af 100644
--- a/src/global_fn_cfg.c
+++ b/src/global_fn_cfg.c
@@ -1,26 +1,29 @@
+#include "cc_cpufn.h"
 #include "global_fn_cfg.h"
 
 void __gfn_check__(void) {return;}
 
-fn_array_set    _array_set    = cc_array_set;
+fn_array_set    _array_set    = cc_cpu_array_set;
 
 fn_array_clip_by_value
-	_array_clip_by_value  = cc_array_clip_by_value;
+	_array_clip_by_value  = cc_cpu_array_clip_by_value;
 
-fn_array_add_by _array_add_by = cc_array_add_by;
-fn_array_sub_by _array_sub_by = cc_array_sub_by;
-fn_array_mul_by _array_mul_by = cc_array_mul_by;
-fn_array_div_by _array_div_by = cc_array_div_by;
+fn_array_add_by _array_add_by = cc_cpu_array_add_by;
+fn_array_sub_by _array_sub_by = cc_cpu_array_sub_by;
+fn_array_mul_by _array_mul_by = cc_cpu_array_mul_by;
+fn_array_div_by _array_div_by = cc_cpu_array_div_by;
 
-fn_array_add_ew _array_add_ew = cc_array_add_ew;
-fn_array_sub_ew _array_sub_ew = cc_array_sub_ew;
-fn_array_mul_ew _array_mul_ew = cc_array_mul_ew;
-fn_array_div_ew _array_div_ew = cc_array_div_ew;
+fn_array_add_ew _array_add_ew = cc_cpu_array_add_ew;
+fn_array_sub_ew _array_sub_ew = cc_cpu_array_sub_ew;
+fn_array_mul_ew _array_mul_ew = cc_cpu_array_mul_ew;
+fn_array_div_ew _array_div_ew = cc_cpu_array_div_ew;
 
+fn_array_sum    _array_sum    = cc_cpu_array_sum;
+fn_array_mean   _array_mean   = cc_cpu_array_mean;
 
 #define GLOBAL_FN_SET_ARRAY_CAST(dtype) \
 fn_array_cast_ ## dtype _array_cast_ ## dtype = \
-	cc_array_cast_ ## dtype;
+	cc_cpu_array_cast_ ## dtype;
 
 GLOBAL_FN_SET_ARRAY_CAST  (uint8)
 GLOBAL_FN_SET_ARRAY_CAST  (uint16)
diff --git a/src/global_fn_cfg.h b/src/global_fn_cfg.h
index 5a65ee9..40eabd0 100644
--- a/src/global_fn_cfg.h
+++ b/src/global_fn_cfg.h
@@ -10,38 +10,6 @@
 #endif
 
 #include "cc_dtype.h"
-#include "cc_cpufn.h"
-
-typedef void (*fn_activation_relu)(
-	void *inp, cc_int32 elems, cc_dtype dt);
-typedef void (*fn_activation_relu6)(
-	void *inp, cc_int32 elems, cc_dtype dt);
-
-typedef void (*fn_activation_softmax)(
-	void *inp, cc_int32 elems, cc_dtype dt);
-
-typedef void (*fn_max_pool2d)(const void *inp, void *oup,
-	cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt);
-
-typedef void (*fn_avg_pool2d)(const void *inp, void *oup,
-	cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt);
-
-typedef void (*fn_conv2d)(const void *inp, void *oup,
-	cc_int32 x,cc_int32 y, cc_int32 oup_x, cc_int32 oup_y,
-	cc_int32 sx, cc_int32 sy, const void *filter,
-	cc_int32 fw, cc_dtype dt);
-
-typedef void (*fn_fully_connected)(const void *inp,
-		void *oup, const void *w, const void *b,
-	cc_int32 iw, cc_int32 ow, cc_dtype dt);
-
-typedef void (*fn_batch_norm)(void *inp,
-	cc_int32 len, const void *bnpara, cc_dtype dt);
-
-/*
- * cc_array functions' cfg, we do not use a standard BLAS directly
- */
-#include "cc_array.h"
 
 typedef void (*fn_array_set)(
 	void *arr, int arrlen, const void *x, int dt);
@@ -67,20 +35,51 @@ typedef void (*fn_array_mul_ew)(void *oup,
 typedef void (*fn_array_div_ew)(void *oup,
 	int arrlen, const void *a, const void *b, int dt);
 
-#define GLOBAL_FN_DEF_ARRAY_CAST(dtype) \
+typedef void (*fn_array_sum )(
+	const void *arr, int arrlen, void *x, int dt);
+typedef void (*fn_array_mean)(
+	const void *arr, int arrlen, void *x, int dt);
+
+#define TYPEDEF_FN_ARRAY_CAST(dtype) \
 typedef void (*fn_array_cast_ ## dtype)(                \
 	void *dst, const void *src, int arrlen, int dt);
 
-GLOBAL_FN_DEF_ARRAY_CAST  (uint8)
-GLOBAL_FN_DEF_ARRAY_CAST  (uint16)
-GLOBAL_FN_DEF_ARRAY_CAST  (uint32)
-GLOBAL_FN_DEF_ARRAY_CAST  (uint64)
-GLOBAL_FN_DEF_ARRAY_CAST  (int8)
-GLOBAL_FN_DEF_ARRAY_CAST  (int16)
-GLOBAL_FN_DEF_ARRAY_CAST  (int32)
-GLOBAL_FN_DEF_ARRAY_CAST  (int64)
-GLOBAL_FN_DEF_ARRAY_CAST  (float32)
-GLOBAL_FN_DEF_ARRAY_CAST  (float64)
+TYPEDEF_FN_ARRAY_CAST  (uint8)
+TYPEDEF_FN_ARRAY_CAST  (uint16)
+TYPEDEF_FN_ARRAY_CAST  (uint32)
+TYPEDEF_FN_ARRAY_CAST  (uint64)
+TYPEDEF_FN_ARRAY_CAST  (int8)
+TYPEDEF_FN_ARRAY_CAST  (int16)
+TYPEDEF_FN_ARRAY_CAST  (int32)
+TYPEDEF_FN_ARRAY_CAST  (int64)
+TYPEDEF_FN_ARRAY_CAST  (float32)
+TYPEDEF_FN_ARRAY_CAST  (float64)
+
+typedef void (*fn_activation_relu)(
+	void *inp, cc_int32 elems, cc_dtype dt);
+typedef void (*fn_activation_relu6)(
+	void *inp, cc_int32 elems, cc_dtype dt);
+
+typedef void (*fn_activation_softmax)(
+	void *inp, cc_int32 elems, cc_dtype dt);
+
+typedef void (*fn_max_pool2d)(const void *inp, void *oup,
+	cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt);
+
+typedef void (*fn_avg_pool2d)(const void *inp, void *oup,
+	cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt);
+
+typedef void (*fn_conv2d)(const void *inp, void *oup,
+	cc_int32 x,cc_int32 y, cc_int32 oup_x, cc_int32 oup_y,
+	cc_int32 sx, cc_int32 sy, const void *filter,
+	cc_int32 fw, cc_dtype dt);
+
+typedef void (*fn_fully_connected)(const void *inp,
+		void *oup, const void *w, const void *b,
+	cc_int32 iw, cc_int32 ow, cc_dtype dt);
+
+typedef void (*fn_batch_norm)(void *inp,
+	cc_int32 len, const void *bnpara, cc_dtype dt);
 
 #ifdef __cplusplus
 	}
diff --git a/util/lua2cc.lua b/util/lua2cc.lua
index 37a22ac..90dd14d 100644
--- a/util/lua2cc.lua
+++ b/util/lua2cc.lua
@@ -277,10 +277,10 @@ reshape = function(args)
       if info.layerId - 1 < 1 then
         assert(nil, "must specify an input for the 1st layer")
       end
-      info.input = layerOutputs[info.layerId - 1]
+      info.input = string.format("@%d", info.layerId - 1)
     end
     local code = string.format(
-      "%s = cc_tensor_reshape(%s, __shape%d);",
+      "%s = cc_reshape(%s, __shape%d);",
       output, info.input, info.shapeId)
     layerOutputs[ret.layerId] = output
     return code