diff --git a/demo/lua2vgg16.lua b/demo/lua2vgg16.lua new file mode 100644 index 0000000..4ed7e53 --- /dev/null +++ b/demo/lua2vgg16.lua @@ -0,0 +1,50 @@ +loadfile("./util/lua2cc.lua")() + +network = { + networkName = "vgg16", + createScope = "vgg16", + parameterLv = 0, + inputLayers = {"in"}, + outputLayers = {"out"}, + l1 = conv2d ({input = "in", + stride = 1, padding = 1}), + l2 = relu ({}), + l3 = conv2d ({stride = 1, padding = 1}), + l4 = relu ({}), + l5 = maxPool2d ({stride = 2}), + l6 = conv2d ({stride = 1, padding = 1}), + l7 = relu ({}), + l8 = conv2d ({stride = 1, padding = 1}), + l9 = relu ({}), + l10 = maxPool2d ({stride = 2}), + l11 = conv2d ({stride = 1, padding = 1}), + l12 = relu ({}), + l13 = conv2d ({stride = 1, padding = 1}), + l14 = relu ({}), + l15 = conv2d ({stride = 1, padding = 1}), + l16 = relu ({}), + l17 = maxPool2d ({stride = 2}), + l18 = conv2d ({stride = 1, padding = 1}), + l19 = relu ({}), + l20 = conv2d ({stride = 1, padding = 1}), + l21 = relu ({}), + l22 = conv2d ({stride = 1, padding = 1}), + l23 = relu ({}), + l24 = maxPool2d ({stride = 2}), + l25 = conv2d ({stride = 1, padding = 1}), + l26 = relu ({}), + l27 = conv2d ({stride = 1, padding = 1}), + l28 = relu ({}), + l29 = conv2d ({stride = 1, padding = 1}), + l30 = relu ({}), + l31 = maxPool2d ({stride = 2}), + l32 = reshape ({shape = {-1, 1, 1}}), + l33 = fullyConnected ({}), + l34 = relu ({}), + l35 = fullyConnected ({}), + l36 = relu ({}), + l37 = fullyConnected ({}), + out = softmax ({input = "l37"}) +} + +ccCodeTranslator(network, {file = "vgg16.c"}) diff --git a/makefile b/makefile index 7176bfe..25486e4 100644 --- a/makefile +++ b/makefile @@ -15,7 +15,7 @@ DFLAG += # -g -fsanitize=address -fno-omit-frame-pointer CFLAG += # -std=c89 CFLAG += -Wall # -Wpedantic -OFLAG += -O3 +OFLAG += -O3 -march=native # Enable OpenMP OFLAG += -DENABLE_OPENMP -fopenmp @@ -127,7 +127,7 @@ $(OBJS_PATH)/build: all: $(APPS) $(CATCOON_A): $(ALL_O) - cd $(OBJS_PATH) && $(AR) $@ $^ && $(MV) $@ .. + cd $(OBJS_PATH) && $(AR) $@ $(ALL_O) && $(MV) $@ .. %.o: ./src/%.c $(CC) -c -o $(OBJS_PATH)/$@ $< $(CFLAG) $(INC) diff --git a/src/cc_array.c b/src/cc_array.c index 1b11222..e159e6a 100644 --- a/src/cc_array.c +++ b/src/cc_array.c @@ -6,120 +6,42 @@ #include "cc_dtype.h" #include "cc_array.h" -#define ARRAY_SC_OPS(op, oup, arr, elem, arrlen, dtype) \ - for (i = 0; i < arrlen; ++i) { \ - *((dtype*)oup + i) = *((dtype*)arr + i) op *(dtype*)elem; \ - } - -#define ARRAY_ELEM_SET(arr, elem, arrlen, dtype) \ - for (i = 0; i < arrlen; ++i) { \ - *((dtype*)arr + i) = *(dtype*)elem; \ - } - -#define ARRAY_ELEM_CLIP(arr, min, max, arrlen, dtype) \ - for (i = 0; i < arrlen; ++i) { \ - if (min) { \ - *((dtype*)arr + i) = \ - *((dtype*)arr + i) < *(dtype*)min ? \ - *(dtype*)min : *((dtype*)arr + i); \ - } \ - if (max) { \ - *((dtype*)arr + i) = \ - *((dtype*)arr + i) > *(dtype*)max ? \ - *(dtype*)max : *((dtype*)arr + i); \ - } \ - } - -#define ARRAY_EW_OPS(op, oup, a, b, arrlen, dtype) \ - for (i = 0; i < arrlen; ++i) { \ - *((dtype*)oup + i) = *((dtype*)a + i) op \ - *((dtype*)b + i); \ - } - -#define ARRAY_SUM(arr, arrlen, dtype, sum) \ - *(dtype*)sum = 0; \ - for (i = 0; i < arrlen; ++i) { \ - *(dtype*)sum += *((dtype*)arr + i); \ - } - -#define ARRAY_CAST_CASE(_DT, _srcdt, _dstdt) \ -case _DT: \ - for (i = 0; i < arrlen; ++i) \ - *((_dstdt*)dst + i) = (_dstdt)*((_srcdt*)src + i); \ - break; +#include "global_fn_cfg.h" +#define EXT_ARRAY_CAST_DEFINITION(dtype) \ +extern fn_array_cast_ ## dtype _array_cast_ ## dtype; + +EXT_ARRAY_CAST_DEFINITION (uint8) +EXT_ARRAY_CAST_DEFINITION (uint16) +EXT_ARRAY_CAST_DEFINITION (uint32) +EXT_ARRAY_CAST_DEFINITION (uint64) +EXT_ARRAY_CAST_DEFINITION (int8) +EXT_ARRAY_CAST_DEFINITION (int16) +EXT_ARRAY_CAST_DEFINITION (int32) +EXT_ARRAY_CAST_DEFINITION (int64) +EXT_ARRAY_CAST_DEFINITION (float32) +EXT_ARRAY_CAST_DEFINITION (float64) + +extern fn_array_set _array_set; +extern fn_array_clip_by_value _array_clip_by_value; + +extern fn_array_add_by _array_add_by; +extern fn_array_sub_by _array_sub_by; +extern fn_array_mul_by _array_mul_by; +extern fn_array_div_by _array_div_by; + +extern fn_array_add_ew _array_add_ew; +extern fn_array_sub_ew _array_sub_ew; +extern fn_array_mul_ew _array_mul_ew; +extern fn_array_div_ew _array_div_ew; + +extern fn_array_sum _array_sum; +extern fn_array_mean _array_mean; #define CC_ARRAY_CAST_IMPLEMENTATION(dtype) \ -void cc_array_cast_ ## dtype( \ - void *dst, const void *src, int arrlen, int dt) \ -{ \ - cc_int32 i; \ - switch (dt) { \ - ARRAY_CAST_CASE(CC_UINT8, cc_uint8, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_UINT16, cc_uint16, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_UINT32, cc_uint32, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_UINT64, cc_uint64, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_INT8, cc_int8, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_INT16, cc_int16, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_INT32, cc_int32, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_INT64, cc_int64, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_FLOAT32, cc_float32, cc_ ## dtype); \ - ARRAY_CAST_CASE(CC_FLOAT64, cc_float64, cc_ ## dtype); \ - default: \ - utlog_format(UTLOG_ERR, \ - "cc_array: unsupported dtype %x\n", dt); \ - break; \ - } \ -} - -#define ARRAY_SET_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_ELEM_SET(arr, x, arrlen, _dt) \ - break; -void cc_array_set(void *arr, int arrlen, const void *x, int dt) -{ - cc_int32 i; - switch (dt) { - ARRAY_SET_CASE(CC_UINT8, cc_uint8); - ARRAY_SET_CASE(CC_UINT16, cc_uint16); - ARRAY_SET_CASE(CC_UINT32, cc_uint32); - ARRAY_SET_CASE(CC_UINT64, cc_uint64); - ARRAY_SET_CASE(CC_INT8, cc_int8); - ARRAY_SET_CASE(CC_INT16, cc_int16); - ARRAY_SET_CASE(CC_INT32, cc_int32); - ARRAY_SET_CASE(CC_INT64, cc_int64); - ARRAY_SET_CASE(CC_FLOAT32, cc_float32); - ARRAY_SET_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } -} - -#define ARRAY_CLIP_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_ELEM_CLIP(arr, min, max, arrlen, _dt); \ - break; -void cc_array_clip_by_value( - void *arr, int arrlen, const void *min, const void *max, int dt) -{ - cc_int32 i; - switch (dt) { - ARRAY_CLIP_CASE(CC_UINT8, cc_uint8); - ARRAY_CLIP_CASE(CC_UINT16, cc_uint16); - ARRAY_CLIP_CASE(CC_UINT32, cc_uint32); - ARRAY_CLIP_CASE(CC_UINT64, cc_uint64); - ARRAY_CLIP_CASE(CC_INT8, cc_int8); - ARRAY_CLIP_CASE(CC_INT16, cc_int16); - ARRAY_CLIP_CASE(CC_INT32, cc_int32); - ARRAY_CLIP_CASE(CC_INT64, cc_int64); - ARRAY_CLIP_CASE(CC_FLOAT32, cc_float32); - ARRAY_CLIP_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } +void cc_array_cast_ ## dtype( \ + void *dst, const void *src, int arrlen, int dt) \ +{ \ + _array_cast_ ## dtype(dst, src, arrlen, dt); \ } CC_ARRAY_CAST_IMPLEMENTATION (uint8) @@ -133,263 +55,73 @@ CC_ARRAY_CAST_IMPLEMENTATION (int64) CC_ARRAY_CAST_IMPLEMENTATION (float32) CC_ARRAY_CAST_IMPLEMENTATION (float64) -#define ARRAY_ADD_BY_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_SC_OPS(+, oup, a, x, arrlen, _dt); \ - break; +void cc_array_set(void *arr, int arrlen, const void *x, int dt) +{ + _array_set(arr, arrlen, x, dt); +} + +void cc_array_clip_by_value(void *arr, + int arrlen, const void *min, const void *max, int dt) +{ + _array_clip_by_value(arr, arrlen, min, max, dt); +} + void cc_array_add_by(void *oup, int arrlen, const void *a, const void *x, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_ADD_BY_CASE(CC_UINT8, cc_uint8); - ARRAY_ADD_BY_CASE(CC_UINT16, cc_uint16); - ARRAY_ADD_BY_CASE(CC_UINT32, cc_uint32); - ARRAY_ADD_BY_CASE(CC_UINT64, cc_uint64); - ARRAY_ADD_BY_CASE(CC_INT8, cc_int8); - ARRAY_ADD_BY_CASE(CC_INT16, cc_int16); - ARRAY_ADD_BY_CASE(CC_INT32, cc_int32); - ARRAY_ADD_BY_CASE(CC_INT64, cc_int64); - ARRAY_ADD_BY_CASE(CC_FLOAT32, cc_float32); - ARRAY_ADD_BY_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_add_by(oup, arrlen, a, x, dt); } -#define ARRAY_SUB_BY_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_SC_OPS(-, oup, a, x, arrlen, _dt); \ - break; void cc_array_sub_by(void *oup, int arrlen, const void *a, const void *x, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_SUB_BY_CASE(CC_UINT8, cc_uint8); - ARRAY_SUB_BY_CASE(CC_UINT16, cc_uint16); - ARRAY_SUB_BY_CASE(CC_UINT32, cc_uint32); - ARRAY_SUB_BY_CASE(CC_UINT64, cc_uint64); - ARRAY_SUB_BY_CASE(CC_INT8, cc_int8); - ARRAY_SUB_BY_CASE(CC_INT16, cc_int16); - ARRAY_SUB_BY_CASE(CC_INT32, cc_int32); - ARRAY_SUB_BY_CASE(CC_INT64, cc_int64); - ARRAY_SUB_BY_CASE(CC_FLOAT32, cc_float32); - ARRAY_SUB_BY_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_sub_by(oup, arrlen, a, x, dt); } -#define ARRAY_MUL_BY_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_SC_OPS(*, oup, a, x, arrlen, _dt); \ - break; void cc_array_mul_by(void *oup, int arrlen, const void *a, const void *x, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_MUL_BY_CASE(CC_UINT8, cc_uint8); - ARRAY_MUL_BY_CASE(CC_UINT16, cc_uint16); - ARRAY_MUL_BY_CASE(CC_UINT32, cc_uint32); - ARRAY_MUL_BY_CASE(CC_UINT64, cc_uint64); - ARRAY_MUL_BY_CASE(CC_INT8, cc_int8); - ARRAY_MUL_BY_CASE(CC_INT16, cc_int16); - ARRAY_MUL_BY_CASE(CC_INT32, cc_int32); - ARRAY_MUL_BY_CASE(CC_INT64, cc_int64); - ARRAY_MUL_BY_CASE(CC_FLOAT32, cc_float32); - ARRAY_MUL_BY_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_mul_by(oup, arrlen, a, x, dt); } -#define ARRAY_DIV_BY_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_SC_OPS(/, oup, a, x, arrlen, _dt); \ - break; void cc_array_div_by(void *oup, int arrlen, const void *a, const void *x, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_DIV_BY_CASE(CC_UINT8, cc_uint8); - ARRAY_DIV_BY_CASE(CC_UINT16, cc_uint16); - ARRAY_DIV_BY_CASE(CC_UINT32, cc_uint32); - ARRAY_DIV_BY_CASE(CC_UINT64, cc_uint64); - ARRAY_DIV_BY_CASE(CC_INT8, cc_int8); - ARRAY_DIV_BY_CASE(CC_INT16, cc_int16); - ARRAY_DIV_BY_CASE(CC_INT32, cc_int32); - ARRAY_DIV_BY_CASE(CC_INT64, cc_int64); - ARRAY_DIV_BY_CASE(CC_FLOAT32, cc_float32); - ARRAY_DIV_BY_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_div_by(oup, arrlen, a, x, dt); } -#define ARRAY_ADD_EW_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_EW_OPS(+, oup, a, b, arrlen, _dt); \ - break; void cc_array_add_ew(void *oup, int arrlen, const void *a, const void *b, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_ADD_EW_CASE(CC_UINT8, cc_uint8); - ARRAY_ADD_EW_CASE(CC_UINT16, cc_uint16); - ARRAY_ADD_EW_CASE(CC_UINT32, cc_uint32); - ARRAY_ADD_EW_CASE(CC_UINT64, cc_uint64); - ARRAY_ADD_EW_CASE(CC_INT8, cc_int8); - ARRAY_ADD_EW_CASE(CC_INT16, cc_int16); - ARRAY_ADD_EW_CASE(CC_INT32, cc_int32); - ARRAY_ADD_EW_CASE(CC_INT64, cc_int64); - ARRAY_ADD_EW_CASE(CC_FLOAT32, cc_float32); - ARRAY_ADD_EW_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_add_ew(oup, arrlen, a, b, dt); } -#define ARRAY_SUB_EW_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_EW_OPS(-, oup, a, b, arrlen, _dt); \ - break; void cc_array_sub_ew(void *oup, int arrlen, const void *a, const void *b, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_SUB_EW_CASE(CC_UINT8, cc_uint8); - ARRAY_SUB_EW_CASE(CC_UINT16, cc_uint16); - ARRAY_SUB_EW_CASE(CC_UINT32, cc_uint32); - ARRAY_SUB_EW_CASE(CC_UINT64, cc_uint64); - ARRAY_SUB_EW_CASE(CC_INT8, cc_int8); - ARRAY_SUB_EW_CASE(CC_INT16, cc_int16); - ARRAY_SUB_EW_CASE(CC_INT32, cc_int32); - ARRAY_SUB_EW_CASE(CC_INT64, cc_int64); - ARRAY_SUB_EW_CASE(CC_FLOAT32, cc_float32); - ARRAY_SUB_EW_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_sub_ew(oup, arrlen, a, b, dt); } -#define ARRAY_MUL_EW_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_EW_OPS(*, oup, a, b, arrlen, _dt); \ - break; void cc_array_mul_ew(void *oup, int arrlen, const void *a, const void *b, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_MUL_EW_CASE(CC_UINT8, cc_uint8); - ARRAY_MUL_EW_CASE(CC_UINT16, cc_uint16); - ARRAY_MUL_EW_CASE(CC_UINT32, cc_uint32); - ARRAY_MUL_EW_CASE(CC_UINT64, cc_uint64); - ARRAY_MUL_EW_CASE(CC_INT8, cc_int8); - ARRAY_MUL_EW_CASE(CC_INT16, cc_int16); - ARRAY_MUL_EW_CASE(CC_INT32, cc_int32); - ARRAY_MUL_EW_CASE(CC_INT64, cc_int64); - ARRAY_MUL_EW_CASE(CC_FLOAT32, cc_float32); - ARRAY_MUL_EW_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_mul_ew(oup, arrlen, a, b, dt); } -#define ARRAY_DIV_EW_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_EW_OPS(/, oup, a, b, arrlen, _dt); \ - break; void cc_array_div_ew(void *oup, int arrlen, const void *a, const void *b, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_DIV_EW_CASE(CC_UINT8, cc_uint8); - ARRAY_DIV_EW_CASE(CC_UINT16, cc_uint16); - ARRAY_DIV_EW_CASE(CC_UINT32, cc_uint32); - ARRAY_DIV_EW_CASE(CC_UINT64, cc_uint64); - ARRAY_DIV_EW_CASE(CC_INT8, cc_int8); - ARRAY_DIV_EW_CASE(CC_INT16, cc_int16); - ARRAY_DIV_EW_CASE(CC_INT32, cc_int32); - ARRAY_DIV_EW_CASE(CC_INT64, cc_int64); - ARRAY_DIV_EW_CASE(CC_FLOAT32, cc_float32); - ARRAY_DIV_EW_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_div_ew(oup, arrlen, a, b, dt); } -#define ARRAY_SUM_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_SUM(arr, arrlen, _dt, x); \ - break; -void cc_array_sum(const void *arr, int arrlen, void *x, int dt) +void cc_array_sum (const void *arr, int arrlen, void *x, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_SUM_CASE(CC_UINT8, cc_uint8); - ARRAY_SUM_CASE(CC_UINT16, cc_uint16); - ARRAY_SUM_CASE(CC_UINT32, cc_uint32); - ARRAY_SUM_CASE(CC_UINT64, cc_uint64); - ARRAY_SUM_CASE(CC_INT8, cc_int8); - ARRAY_SUM_CASE(CC_INT16, cc_int16); - ARRAY_SUM_CASE(CC_INT32, cc_int32); - ARRAY_SUM_CASE(CC_INT64, cc_int64); - ARRAY_SUM_CASE(CC_FLOAT32, cc_float32); - ARRAY_SUM_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_sum(arr, arrlen, x, dt); } -#define ARRAY_MEAN_CASE(_DT, _dt) \ -case _DT: \ - ARRAY_SUM(arr, arrlen, _dt, x); \ - *(_dt*)x /= arrlen; \ - break; void cc_array_mean(const void *arr, int arrlen, void *x, int dt) { - cc_int32 i; - switch (dt) { - ARRAY_MEAN_CASE(CC_UINT8, cc_uint8); - ARRAY_MEAN_CASE(CC_UINT16, cc_uint16); - ARRAY_MEAN_CASE(CC_UINT32, cc_uint32); - ARRAY_MEAN_CASE(CC_UINT64, cc_uint64); - ARRAY_MEAN_CASE(CC_INT8, cc_int8); - ARRAY_MEAN_CASE(CC_INT16, cc_int16); - ARRAY_MEAN_CASE(CC_INT32, cc_int32); - ARRAY_MEAN_CASE(CC_INT64, cc_int64); - ARRAY_MEAN_CASE(CC_FLOAT32, cc_float32); - ARRAY_MEAN_CASE(CC_FLOAT64, cc_float64); - default: - utlog_format(UTLOG_ERR, - "cc_array: unsupported dtype %x\n", dt); - break; - } + _array_mean(arr, arrlen, x, dt); } #define PRINT_ARRAY_CASE(_DT, _dt) \ diff --git a/src/cc_array.h b/src/cc_array.h index 11e4119..288e134 100644 --- a/src/cc_array.h +++ b/src/cc_array.h @@ -10,7 +10,7 @@ */ #define CC_ARRAY_CAST_DEFINITION(dtype) \ -void cc_array_cast_ ## dtype( \ +void cc_array_cast_ ## dtype( \ void *dst, const void *src, int arrlen, int dt); CC_ARRAY_CAST_DEFINITION (uint8) diff --git a/src/cc_basic.c b/src/cc_basic.c index ff687ec..dc6bf07 100644 --- a/src/cc_basic.c +++ b/src/cc_basic.c @@ -11,34 +11,6 @@ #include "util_log.h" #include "cc_basic.h" -#include "global_fn_cfg.h" -#define EXT_ARRAY_CAST_DEFINITION(dtype) \ -extern fn_array_cast_ ## dtype _array_cast_ ## dtype; - -EXT_ARRAY_CAST_DEFINITION (uint8) -EXT_ARRAY_CAST_DEFINITION (uint16) -EXT_ARRAY_CAST_DEFINITION (uint32) -EXT_ARRAY_CAST_DEFINITION (uint64) -EXT_ARRAY_CAST_DEFINITION (int8) -EXT_ARRAY_CAST_DEFINITION (int16) -EXT_ARRAY_CAST_DEFINITION (int32) -EXT_ARRAY_CAST_DEFINITION (int64) -EXT_ARRAY_CAST_DEFINITION (float32) -EXT_ARRAY_CAST_DEFINITION (float64) - -extern fn_array_set _array_set; -extern fn_array_clip_by_value _array_clip_by_value; - -extern fn_array_add_by _array_add_by; -extern fn_array_sub_by _array_sub_by; -extern fn_array_mul_by _array_mul_by; -extern fn_array_div_by _array_div_by; - -extern fn_array_add_ew _array_add_ew; -extern fn_array_sub_ew _array_sub_ew; -extern fn_array_mul_ew _array_mul_ew; -extern fn_array_div_ew _array_div_ew; - static cc_int32 _calc_elems(const cc_int32 *shape) { cc_int32 elems; @@ -300,7 +272,7 @@ void cc_print(const cc_tensor_t *tensor) void cc_set_value(cc_tensor_t *tensor, void *v) { - _array_set(tensor->data, + cc_array_set(tensor->data, cc_elements(tensor), v, *tensor->dtype); } @@ -316,7 +288,7 @@ cc_tensor_t *cc_clip_by_value(cc_tensor_t *tensor, yield = tensor; else yield = cc_copy(tensor, name); - _array_clip_by_value(tensor->data, + cc_array_clip_by_value(tensor->data, cc_elements(tensor), min, max, *tensor->dtype); return yield; } @@ -335,43 +307,43 @@ cc_tensor_t *cc_cast(cc_tensor_t *tensor, cc_assert_ptr(cast = cc_create(tensor->shape, dtype, NULL)); switch (dtype) { case CC_INT8: - _array_cast_int8(cast->data, + cc_array_cast_int8(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_UINT8: - _array_cast_uint8(cast->data, + cc_array_cast_uint8(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_INT16: - _array_cast_int16(cast->data, + cc_array_cast_int16(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_UINT16: - _array_cast_uint16(cast->data, + cc_array_cast_uint16(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_INT32: - _array_cast_int32(cast->data, + cc_array_cast_int32(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_UINT32: - _array_cast_uint32(cast->data, + cc_array_cast_uint32(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_INT64: - _array_cast_int64(cast->data, + cc_array_cast_int64(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_UINT64: - _array_cast_uint64(cast->data, + cc_array_cast_uint64(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_FLOAT32: - _array_cast_float32(cast->data, + cc_array_cast_float32(cast->data, tensor->data, elems, *tensor->dtype); break; case CC_FLOAT64: - _array_cast_float64(cast->data, + cc_array_cast_float64(cast->data, tensor->data, elems, *tensor->dtype); break; default: @@ -411,19 +383,19 @@ cc_tensor_t *cc_scalar(cc_tensor_t *tensor, yield = cc_copy(tensor, name); switch (op) { case '+': - _array_add_by(yield->data, elems, + cc_array_add_by(yield->data, elems, yield->data, data, *tensor->dtype); break; case '-': - _array_sub_by(yield->data, elems, + cc_array_sub_by(yield->data, elems, yield->data, data, *tensor->dtype); break; case '*': - _array_mul_by(yield->data, elems, + cc_array_mul_by(yield->data, elems, yield->data, data, *tensor->dtype); break; case '/': - _array_div_by(yield->data, elems, + cc_array_div_by(yield->data, elems, yield->data, data, *tensor->dtype); break; default: @@ -458,19 +430,19 @@ cc_tensor_t *cc_elemwise(cc_tensor_t *a, yield = cc_copy(a, name); switch (op) { case '+': - _array_add_ew(yield->data, elems, + cc_array_add_ew(yield->data, elems, yield->data, b->data, *yield->dtype); break; case '-': - _array_sub_ew(yield->data, elems, + cc_array_sub_ew(yield->data, elems, yield->data, b->data, *yield->dtype); break; case '*': - _array_mul_ew(yield->data, elems, + cc_array_mul_ew(yield->data, elems, yield->data, b->data, *yield->dtype); break; case '/': - _array_div_ew(yield->data, elems, + cc_array_div_ew(yield->data, elems, yield->data, b->data, *yield->dtype); break; default: diff --git a/src/cc_conv2d.c b/src/cc_conv2d.c index ba1ba11..ce027c6 100644 --- a/src/cc_conv2d.c +++ b/src/cc_conv2d.c @@ -7,6 +7,7 @@ #endif #include "cc_assert.h" +#include "cc_array.h" #include "cc_basic.h" #include "cc_fmap2d.h" #include "cc_pad2d.h" @@ -17,8 +18,6 @@ #include "global_fn_cfg.h" extern fn_conv2d _conv2d; -extern fn_array_add_ew _array_add_ew; -extern fn_array_mul_by _array_mul_by; cc_int32 cc_conv2d_shape_calc( cc_int32 i, cc_int32 k, cc_int32 s, cc_int32 p) @@ -101,7 +100,7 @@ cc_tensor_t *cc_conv2d(const cc_tensor_t *inp, k_ch_mem_size * j), kernel->shape[CC_CONV2D_KERNEL_W], *kernel->dtype); - _array_add_ew(oup->data + o_ch_mem_size * i, + cc_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size, oup->data + o_ch_mem_size * i, omp_out_buf + omp_get_thread_num() * o_ch_mem_size, *oup->dtype); @@ -115,7 +114,7 @@ cc_tensor_t *cc_conv2d(const cc_tensor_t *inp, k_ch_mem_size * j), kernel->shape[CC_CONV2D_KERNEL_W], *kernel->dtype); - _array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size, + cc_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size, oup->data + o_ch_mem_size * i, omp_out_buf, *oup->dtype); #endif @@ -264,21 +263,21 @@ cc_tensor_t *cc_pw_conv2d(cc_tensor_t *inp, const cc_tensor_t *kernel, for (j = 0; j < kernel->shape[CC_CONV2D_KERNEL_I]; ++j) { #ifdef ENABLE_OPENMP - _array_mul_by( + cc_array_mul_by( omp_out_buf + omp_get_thread_num() * o_ch_mem_size, o_ch_size, inp->data + o_ch_mem_size * j, kernel->data + k_mem_size * i + k_ch_mem_size * j, *oup->dtype); - _array_add_ew(oup->data + o_ch_mem_size * i, + cc_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size, oup->data + o_ch_mem_size * i, omp_out_buf + omp_get_thread_num() * o_ch_mem_size, *oup->dtype); #else - _array_mul_by(omp_out_buf, o_ch_size, + cc_array_mul_by(omp_out_buf, o_ch_size, inp->data + o_ch_mem_size * j, kernel->data + k_mem_size * i + k_ch_mem_size * j, *oup->dtype); - _array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size, + cc_array_add_ew(oup->data + o_ch_mem_size * i, o_ch_size, oup->data + o_ch_mem_size * i, omp_out_buf, *oup->dtype); #endif diff --git a/src/cc_cpufn.c b/src/cc_cpufn.c index 55ab0da..b9d5ac4 100644 --- a/src/cc_cpufn.c +++ b/src/cc_cpufn.c @@ -321,21 +321,19 @@ static void cc_cpu_conv2d_ ## dt (cc_ ## dt *inp, cc_ ## dt *oup, \ cc_int32 x, cc_int32 y, cc_int32 oup_x, cc_int32 oup_y, \ cc_int32 sx, cc_int32 sy, cc_ ## dt *filter, cc_int32 fw) \ { \ - cc_int32 i, j, k, l, oup_i, oup_j; \ + cc_int32 i, j, k, l; \ cc_int32 half_fl = fw >> 1; \ cc_ ## dt sum; \ - for (i = half_fl; i < y - half_fl; i += sy) { \ - for (j = half_fl; j < x - half_fl; j += sx) { \ - sum = 0; \ - for (k = -half_fl; k <= half_fl; ++k) { \ - for (l = -half_fl; l <= half_fl; ++l) { \ - sum += *(inp + (i + k) * x + (j + l)) * \ - *(filter + (k + half_fl) * fw + (l + half_fl)); \ + for (i = half_fl; i < y - half_fl; i += sy) { \ + for (j = half_fl; j < x - half_fl; j += sx) { \ + sum = 0; \ + for (k = -half_fl; k <= half_fl; ++k) { \ + for (l = -half_fl; l <= half_fl; ++l) { \ + sum += *(inp + (i + k) * x + (j + l)) * \ + *(filter + (k + half_fl) * fw + (l + half_fl)); \ } \ } \ - oup_i = ((i - half_fl) / sy); \ - oup_j = ((j - half_fl) / sx); \ - *(oup + oup_i * oup_x + oup_j) = sum; \ + *oup++ = sum; \ } \ } \ } @@ -555,3 +553,389 @@ void cc_cpu_batch_norm(void *inp, "cc_cpufn: unsupported dtype %x\n", dt); } } + +#define ARRAY_SC_OPS(op, oup, arr, elem, arrlen, dtype) \ + for (i = 0; i < arrlen; ++i) { \ + *((dtype*)oup + i) = *((dtype*)arr + i) op *(dtype*)elem; \ + } + +#define ARRAY_ELEM_SET(arr, elem, arrlen, dtype) \ + for (i = 0; i < arrlen; ++i) { \ + *((dtype*)arr + i) = *(dtype*)elem; \ + } + +#define ARRAY_ELEM_CLIP(arr, min, max, arrlen, dtype) \ + for (i = 0; i < arrlen; ++i) { \ + if (min) { \ + *((dtype*)arr + i) = \ + *((dtype*)arr + i) < *(dtype*)min ? \ + *(dtype*)min : *((dtype*)arr + i); \ + } \ + if (max) { \ + *((dtype*)arr + i) = \ + *((dtype*)arr + i) > *(dtype*)max ? \ + *(dtype*)max : *((dtype*)arr + i); \ + } \ + } + +#define ARRAY_EW_OPS(op, oup, a, b, arrlen, dtype) \ + for (i = 0; i < arrlen; ++i) { \ + *((dtype*)oup + i) = *((dtype*)a + i) op \ + *((dtype*)b + i); \ + } + +#define ARRAY_SUM(arr, arrlen, dtype, sum) \ + *(dtype*)sum = 0; \ + for (i = 0; i < arrlen; ++i) { \ + *(dtype*)sum += *((dtype*)arr + i); \ + } + +#define ARRAY_CAST_CASE(_DT, _srcdt, _dstdt) \ +case _DT: \ + for (i = 0; i < arrlen; ++i) \ + *((_dstdt*)dst + i) = (_dstdt)*((_srcdt*)src + i); \ + break; + +#define CC_CPU_ARRAY_CAST_IMPLEMENTATION(dtype) \ +void cc_cpu_array_cast_ ## dtype( \ + void *dst, const void *src, int arrlen, int dt) \ +{ \ + cc_int32 i; \ + switch (dt) { \ + ARRAY_CAST_CASE(CC_UINT8, cc_uint8, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_UINT16, cc_uint16, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_UINT32, cc_uint32, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_UINT64, cc_uint64, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_INT8, cc_int8, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_INT16, cc_int16, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_INT32, cc_int32, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_INT64, cc_int64, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_FLOAT32, cc_float32, cc_ ## dtype); \ + ARRAY_CAST_CASE(CC_FLOAT64, cc_float64, cc_ ## dtype); \ + default: \ + utlog_format(UTLOG_ERR, \ + "cc_array: unsupported dtype %x\n", dt); \ + break; \ + } \ +} + +#define ARRAY_SET_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_ELEM_SET(arr, x, arrlen, _dt) \ + break; +void cc_cpu_array_set(void *arr, int arrlen, const void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_SET_CASE(CC_UINT8, cc_uint8); + ARRAY_SET_CASE(CC_UINT16, cc_uint16); + ARRAY_SET_CASE(CC_UINT32, cc_uint32); + ARRAY_SET_CASE(CC_UINT64, cc_uint64); + ARRAY_SET_CASE(CC_INT8, cc_int8); + ARRAY_SET_CASE(CC_INT16, cc_int16); + ARRAY_SET_CASE(CC_INT32, cc_int32); + ARRAY_SET_CASE(CC_INT64, cc_int64); + ARRAY_SET_CASE(CC_FLOAT32, cc_float32); + ARRAY_SET_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_CLIP_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_ELEM_CLIP(arr, min, max, arrlen, _dt); \ + break; +void cc_cpu_array_clip_by_value( + void *arr, int arrlen, const void *min, const void *max, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_CLIP_CASE(CC_UINT8, cc_uint8); + ARRAY_CLIP_CASE(CC_UINT16, cc_uint16); + ARRAY_CLIP_CASE(CC_UINT32, cc_uint32); + ARRAY_CLIP_CASE(CC_UINT64, cc_uint64); + ARRAY_CLIP_CASE(CC_INT8, cc_int8); + ARRAY_CLIP_CASE(CC_INT16, cc_int16); + ARRAY_CLIP_CASE(CC_INT32, cc_int32); + ARRAY_CLIP_CASE(CC_INT64, cc_int64); + ARRAY_CLIP_CASE(CC_FLOAT32, cc_float32); + ARRAY_CLIP_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +CC_CPU_ARRAY_CAST_IMPLEMENTATION (uint8) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (uint16) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (uint32) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (uint64) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (int8) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (int16) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (int32) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (int64) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (float32) +CC_CPU_ARRAY_CAST_IMPLEMENTATION (float64) + +#define ARRAY_ADD_BY_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_SC_OPS(+, oup, a, x, arrlen, _dt); \ + break; +void cc_cpu_array_add_by(void *oup, + int arrlen, const void *a, const void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_ADD_BY_CASE(CC_UINT8, cc_uint8); + ARRAY_ADD_BY_CASE(CC_UINT16, cc_uint16); + ARRAY_ADD_BY_CASE(CC_UINT32, cc_uint32); + ARRAY_ADD_BY_CASE(CC_UINT64, cc_uint64); + ARRAY_ADD_BY_CASE(CC_INT8, cc_int8); + ARRAY_ADD_BY_CASE(CC_INT16, cc_int16); + ARRAY_ADD_BY_CASE(CC_INT32, cc_int32); + ARRAY_ADD_BY_CASE(CC_INT64, cc_int64); + ARRAY_ADD_BY_CASE(CC_FLOAT32, cc_float32); + ARRAY_ADD_BY_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_SUB_BY_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_SC_OPS(-, oup, a, x, arrlen, _dt); \ + break; +void cc_cpu_array_sub_by(void *oup, + int arrlen, const void *a, const void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_SUB_BY_CASE(CC_UINT8, cc_uint8); + ARRAY_SUB_BY_CASE(CC_UINT16, cc_uint16); + ARRAY_SUB_BY_CASE(CC_UINT32, cc_uint32); + ARRAY_SUB_BY_CASE(CC_UINT64, cc_uint64); + ARRAY_SUB_BY_CASE(CC_INT8, cc_int8); + ARRAY_SUB_BY_CASE(CC_INT16, cc_int16); + ARRAY_SUB_BY_CASE(CC_INT32, cc_int32); + ARRAY_SUB_BY_CASE(CC_INT64, cc_int64); + ARRAY_SUB_BY_CASE(CC_FLOAT32, cc_float32); + ARRAY_SUB_BY_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_MUL_BY_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_SC_OPS(*, oup, a, x, arrlen, _dt); \ + break; +void cc_cpu_array_mul_by(void *oup, + int arrlen, const void *a, const void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_MUL_BY_CASE(CC_UINT8, cc_uint8); + ARRAY_MUL_BY_CASE(CC_UINT16, cc_uint16); + ARRAY_MUL_BY_CASE(CC_UINT32, cc_uint32); + ARRAY_MUL_BY_CASE(CC_UINT64, cc_uint64); + ARRAY_MUL_BY_CASE(CC_INT8, cc_int8); + ARRAY_MUL_BY_CASE(CC_INT16, cc_int16); + ARRAY_MUL_BY_CASE(CC_INT32, cc_int32); + ARRAY_MUL_BY_CASE(CC_INT64, cc_int64); + ARRAY_MUL_BY_CASE(CC_FLOAT32, cc_float32); + ARRAY_MUL_BY_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_DIV_BY_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_SC_OPS(/, oup, a, x, arrlen, _dt); \ + break; +void cc_cpu_array_div_by(void *oup, + int arrlen, const void *a, const void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_DIV_BY_CASE(CC_UINT8, cc_uint8); + ARRAY_DIV_BY_CASE(CC_UINT16, cc_uint16); + ARRAY_DIV_BY_CASE(CC_UINT32, cc_uint32); + ARRAY_DIV_BY_CASE(CC_UINT64, cc_uint64); + ARRAY_DIV_BY_CASE(CC_INT8, cc_int8); + ARRAY_DIV_BY_CASE(CC_INT16, cc_int16); + ARRAY_DIV_BY_CASE(CC_INT32, cc_int32); + ARRAY_DIV_BY_CASE(CC_INT64, cc_int64); + ARRAY_DIV_BY_CASE(CC_FLOAT32, cc_float32); + ARRAY_DIV_BY_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_ADD_EW_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_EW_OPS(+, oup, a, b, arrlen, _dt); \ + break; +void cc_cpu_array_add_ew(void *oup, + int arrlen, const void *a, const void *b, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_ADD_EW_CASE(CC_UINT8, cc_uint8); + ARRAY_ADD_EW_CASE(CC_UINT16, cc_uint16); + ARRAY_ADD_EW_CASE(CC_UINT32, cc_uint32); + ARRAY_ADD_EW_CASE(CC_UINT64, cc_uint64); + ARRAY_ADD_EW_CASE(CC_INT8, cc_int8); + ARRAY_ADD_EW_CASE(CC_INT16, cc_int16); + ARRAY_ADD_EW_CASE(CC_INT32, cc_int32); + ARRAY_ADD_EW_CASE(CC_INT64, cc_int64); + ARRAY_ADD_EW_CASE(CC_FLOAT32, cc_float32); + ARRAY_ADD_EW_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_SUB_EW_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_EW_OPS(-, oup, a, b, arrlen, _dt); \ + break; +void cc_cpu_array_sub_ew(void *oup, + int arrlen, const void *a, const void *b, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_SUB_EW_CASE(CC_UINT8, cc_uint8); + ARRAY_SUB_EW_CASE(CC_UINT16, cc_uint16); + ARRAY_SUB_EW_CASE(CC_UINT32, cc_uint32); + ARRAY_SUB_EW_CASE(CC_UINT64, cc_uint64); + ARRAY_SUB_EW_CASE(CC_INT8, cc_int8); + ARRAY_SUB_EW_CASE(CC_INT16, cc_int16); + ARRAY_SUB_EW_CASE(CC_INT32, cc_int32); + ARRAY_SUB_EW_CASE(CC_INT64, cc_int64); + ARRAY_SUB_EW_CASE(CC_FLOAT32, cc_float32); + ARRAY_SUB_EW_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_MUL_EW_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_EW_OPS(*, oup, a, b, arrlen, _dt); \ + break; +void cc_cpu_array_mul_ew(void *oup, + int arrlen, const void *a, const void *b, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_MUL_EW_CASE(CC_UINT8, cc_uint8); + ARRAY_MUL_EW_CASE(CC_UINT16, cc_uint16); + ARRAY_MUL_EW_CASE(CC_UINT32, cc_uint32); + ARRAY_MUL_EW_CASE(CC_UINT64, cc_uint64); + ARRAY_MUL_EW_CASE(CC_INT8, cc_int8); + ARRAY_MUL_EW_CASE(CC_INT16, cc_int16); + ARRAY_MUL_EW_CASE(CC_INT32, cc_int32); + ARRAY_MUL_EW_CASE(CC_INT64, cc_int64); + ARRAY_MUL_EW_CASE(CC_FLOAT32, cc_float32); + ARRAY_MUL_EW_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_DIV_EW_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_EW_OPS(/, oup, a, b, arrlen, _dt); \ + break; +void cc_cpu_array_div_ew(void *oup, + int arrlen, const void *a, const void *b, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_DIV_EW_CASE(CC_UINT8, cc_uint8); + ARRAY_DIV_EW_CASE(CC_UINT16, cc_uint16); + ARRAY_DIV_EW_CASE(CC_UINT32, cc_uint32); + ARRAY_DIV_EW_CASE(CC_UINT64, cc_uint64); + ARRAY_DIV_EW_CASE(CC_INT8, cc_int8); + ARRAY_DIV_EW_CASE(CC_INT16, cc_int16); + ARRAY_DIV_EW_CASE(CC_INT32, cc_int32); + ARRAY_DIV_EW_CASE(CC_INT64, cc_int64); + ARRAY_DIV_EW_CASE(CC_FLOAT32, cc_float32); + ARRAY_DIV_EW_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_SUM_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_SUM(arr, arrlen, _dt, x); \ + break; +void cc_cpu_array_sum(const void *arr, int arrlen, void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_SUM_CASE(CC_UINT8, cc_uint8); + ARRAY_SUM_CASE(CC_UINT16, cc_uint16); + ARRAY_SUM_CASE(CC_UINT32, cc_uint32); + ARRAY_SUM_CASE(CC_UINT64, cc_uint64); + ARRAY_SUM_CASE(CC_INT8, cc_int8); + ARRAY_SUM_CASE(CC_INT16, cc_int16); + ARRAY_SUM_CASE(CC_INT32, cc_int32); + ARRAY_SUM_CASE(CC_INT64, cc_int64); + ARRAY_SUM_CASE(CC_FLOAT32, cc_float32); + ARRAY_SUM_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} + +#define ARRAY_MEAN_CASE(_DT, _dt) \ +case _DT: \ + ARRAY_SUM(arr, arrlen, _dt, x); \ + *(_dt*)x /= arrlen; \ + break; +void cc_cpu_array_mean(const void *arr, int arrlen, void *x, int dt) +{ + cc_int32 i; + switch (dt) { + ARRAY_MEAN_CASE(CC_UINT8, cc_uint8); + ARRAY_MEAN_CASE(CC_UINT16, cc_uint16); + ARRAY_MEAN_CASE(CC_UINT32, cc_uint32); + ARRAY_MEAN_CASE(CC_UINT64, cc_uint64); + ARRAY_MEAN_CASE(CC_INT8, cc_int8); + ARRAY_MEAN_CASE(CC_INT16, cc_int16); + ARRAY_MEAN_CASE(CC_INT32, cc_int32); + ARRAY_MEAN_CASE(CC_INT64, cc_int64); + ARRAY_MEAN_CASE(CC_FLOAT32, cc_float32); + ARRAY_MEAN_CASE(CC_FLOAT64, cc_float64); + default: + utlog_format(UTLOG_ERR, + "cc_array: unsupported dtype %x\n", dt); + break; + } +} diff --git a/src/cc_cpufn.h b/src/cc_cpufn.h index 981afa8..fbf9360 100644 --- a/src/cc_cpufn.h +++ b/src/cc_cpufn.h @@ -30,6 +30,51 @@ void cc_cpu_fully_connected(const void *inp, void cc_cpu_batch_norm(void *inp, cc_int32 len, const void *bnpara, cc_dtype dt); +/* + * int <---> cc_int32 + */ + +#define CC_CPU_ARRAY_CAST_DEFINITION(dtype) \ +void cc_cpu_array_cast_ ## dtype( \ + void *dst, const void *src, int arrlen, int dt); + +CC_CPU_ARRAY_CAST_DEFINITION (uint8) +CC_CPU_ARRAY_CAST_DEFINITION (uint16) +CC_CPU_ARRAY_CAST_DEFINITION (uint32) +CC_CPU_ARRAY_CAST_DEFINITION (uint64) +CC_CPU_ARRAY_CAST_DEFINITION (int8) +CC_CPU_ARRAY_CAST_DEFINITION (int16) +CC_CPU_ARRAY_CAST_DEFINITION (int32) +CC_CPU_ARRAY_CAST_DEFINITION (int64) +CC_CPU_ARRAY_CAST_DEFINITION (float32) +CC_CPU_ARRAY_CAST_DEFINITION (float64) + +void cc_cpu_array_set(void *arr, int arrlen, const void *x, int dt); + +void cc_cpu_array_clip_by_value(void *arr, + int arrlen, const void *min, const void *max, int dt); + +void cc_cpu_array_add_by(void *oup, + int arrlen, const void *a, const void *x, int dt); +void cc_cpu_array_sub_by(void *oup, + int arrlen, const void *a, const void *x, int dt); +void cc_cpu_array_mul_by(void *oup, + int arrlen, const void *a, const void *x, int dt); +void cc_cpu_array_div_by(void *oup, + int arrlen, const void *a, const void *x, int dt); + +void cc_cpu_array_add_ew(void *oup, + int arrlen, const void *a, const void *b, int dt); +void cc_cpu_array_sub_ew(void *oup, + int arrlen, const void *a, const void *b, int dt); +void cc_cpu_array_mul_ew(void *oup, + int arrlen, const void *a, const void *b, int dt); +void cc_cpu_array_div_ew(void *oup, + int arrlen, const void *a, const void *b, int dt); + +void cc_cpu_array_sum (const void *arr, int arrlen, void *x, int dt); +void cc_cpu_array_mean(const void *arr, int arrlen, void *x, int dt); + #ifdef __cplusplus } #endif diff --git a/src/cc_fmap2d.c b/src/cc_fmap2d.c index d0f616f..992df32 100644 --- a/src/cc_fmap2d.c +++ b/src/cc_fmap2d.c @@ -6,12 +6,12 @@ #endif #include "cc_assert.h" +#include "cc_array.h" #include "cc_basic.h" #include "cc_fmap2d.h" #include "cc_tsrmgr.h" #include "global_fn_cfg.h" -extern fn_array_add_by _array_add_by; cc_tensor_t *cc_fmap2d_bias(cc_tensor_t *inp, const cc_tensor_t *bias, const char *name) @@ -37,7 +37,7 @@ cc_tensor_t *cc_fmap2d_bias(cc_tensor_t *inp, #pragma omp parallel for private(i) #endif for (i = 0; i < bias->shape[CC_CNN2D_SHAPE_C]; ++i) { - _array_add_by(fmap->data + ch_mem_size * i, + cc_array_add_by(fmap->data + ch_mem_size * i, ch_size, fmap->data + ch_mem_size * i, bias->data + dt_size * i, *fmap->dtype); } diff --git a/src/cc_pad2d.c b/src/cc_pad2d.c index e1df139..b8df403 100644 --- a/src/cc_pad2d.c +++ b/src/cc_pad2d.c @@ -41,6 +41,9 @@ cc_tensor_t *cc_pad2d(const cc_tensor_t *inp, pad->shape[CC_CNN2D_SHAPE_H]; p_ch_mem_size = p_ch_size * dtsize; p_row_mem_size = pad->shape[CC_CNN2D_SHAPE_W] * dtsize; +#ifdef ENABLE_OPENMP +#pragma omp parallel for private(c, i, j) +#endif for (c = 0; c < inp->shape[CC_CNN2D_SHAPE_C]; ++c) { for (i = 0; i < inp->shape[CC_CNN2D_SHAPE_H]; ++i) { for (j = 0; j < inp->shape[CC_CNN2D_SHAPE_W]; ++j) diff --git a/src/global_fn_cfg.c b/src/global_fn_cfg.c index 92cc762..e72b6af 100644 --- a/src/global_fn_cfg.c +++ b/src/global_fn_cfg.c @@ -1,26 +1,29 @@ +#include "cc_cpufn.h" #include "global_fn_cfg.h" void __gfn_check__(void) {return;} -fn_array_set _array_set = cc_array_set; +fn_array_set _array_set = cc_cpu_array_set; fn_array_clip_by_value - _array_clip_by_value = cc_array_clip_by_value; + _array_clip_by_value = cc_cpu_array_clip_by_value; -fn_array_add_by _array_add_by = cc_array_add_by; -fn_array_sub_by _array_sub_by = cc_array_sub_by; -fn_array_mul_by _array_mul_by = cc_array_mul_by; -fn_array_div_by _array_div_by = cc_array_div_by; +fn_array_add_by _array_add_by = cc_cpu_array_add_by; +fn_array_sub_by _array_sub_by = cc_cpu_array_sub_by; +fn_array_mul_by _array_mul_by = cc_cpu_array_mul_by; +fn_array_div_by _array_div_by = cc_cpu_array_div_by; -fn_array_add_ew _array_add_ew = cc_array_add_ew; -fn_array_sub_ew _array_sub_ew = cc_array_sub_ew; -fn_array_mul_ew _array_mul_ew = cc_array_mul_ew; -fn_array_div_ew _array_div_ew = cc_array_div_ew; +fn_array_add_ew _array_add_ew = cc_cpu_array_add_ew; +fn_array_sub_ew _array_sub_ew = cc_cpu_array_sub_ew; +fn_array_mul_ew _array_mul_ew = cc_cpu_array_mul_ew; +fn_array_div_ew _array_div_ew = cc_cpu_array_div_ew; +fn_array_sum _array_sum = cc_cpu_array_sum; +fn_array_mean _array_mean = cc_cpu_array_mean; #define GLOBAL_FN_SET_ARRAY_CAST(dtype) \ fn_array_cast_ ## dtype _array_cast_ ## dtype = \ - cc_array_cast_ ## dtype; + cc_cpu_array_cast_ ## dtype; GLOBAL_FN_SET_ARRAY_CAST (uint8) GLOBAL_FN_SET_ARRAY_CAST (uint16) diff --git a/src/global_fn_cfg.h b/src/global_fn_cfg.h index 5a65ee9..40eabd0 100644 --- a/src/global_fn_cfg.h +++ b/src/global_fn_cfg.h @@ -10,38 +10,6 @@ #endif #include "cc_dtype.h" -#include "cc_cpufn.h" - -typedef void (*fn_activation_relu)( - void *inp, cc_int32 elems, cc_dtype dt); -typedef void (*fn_activation_relu6)( - void *inp, cc_int32 elems, cc_dtype dt); - -typedef void (*fn_activation_softmax)( - void *inp, cc_int32 elems, cc_dtype dt); - -typedef void (*fn_max_pool2d)(const void *inp, void *oup, - cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt); - -typedef void (*fn_avg_pool2d)(const void *inp, void *oup, - cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt); - -typedef void (*fn_conv2d)(const void *inp, void *oup, - cc_int32 x,cc_int32 y, cc_int32 oup_x, cc_int32 oup_y, - cc_int32 sx, cc_int32 sy, const void *filter, - cc_int32 fw, cc_dtype dt); - -typedef void (*fn_fully_connected)(const void *inp, - void *oup, const void *w, const void *b, - cc_int32 iw, cc_int32 ow, cc_dtype dt); - -typedef void (*fn_batch_norm)(void *inp, - cc_int32 len, const void *bnpara, cc_dtype dt); - -/* - * cc_array functions' cfg, we do not use a standard BLAS directly - */ -#include "cc_array.h" typedef void (*fn_array_set)( void *arr, int arrlen, const void *x, int dt); @@ -67,20 +35,51 @@ typedef void (*fn_array_mul_ew)(void *oup, typedef void (*fn_array_div_ew)(void *oup, int arrlen, const void *a, const void *b, int dt); -#define GLOBAL_FN_DEF_ARRAY_CAST(dtype) \ +typedef void (*fn_array_sum )( + const void *arr, int arrlen, void *x, int dt); +typedef void (*fn_array_mean)( + const void *arr, int arrlen, void *x, int dt); + +#define TYPEDEF_FN_ARRAY_CAST(dtype) \ typedef void (*fn_array_cast_ ## dtype)( \ void *dst, const void *src, int arrlen, int dt); -GLOBAL_FN_DEF_ARRAY_CAST (uint8) -GLOBAL_FN_DEF_ARRAY_CAST (uint16) -GLOBAL_FN_DEF_ARRAY_CAST (uint32) -GLOBAL_FN_DEF_ARRAY_CAST (uint64) -GLOBAL_FN_DEF_ARRAY_CAST (int8) -GLOBAL_FN_DEF_ARRAY_CAST (int16) -GLOBAL_FN_DEF_ARRAY_CAST (int32) -GLOBAL_FN_DEF_ARRAY_CAST (int64) -GLOBAL_FN_DEF_ARRAY_CAST (float32) -GLOBAL_FN_DEF_ARRAY_CAST (float64) +TYPEDEF_FN_ARRAY_CAST (uint8) +TYPEDEF_FN_ARRAY_CAST (uint16) +TYPEDEF_FN_ARRAY_CAST (uint32) +TYPEDEF_FN_ARRAY_CAST (uint64) +TYPEDEF_FN_ARRAY_CAST (int8) +TYPEDEF_FN_ARRAY_CAST (int16) +TYPEDEF_FN_ARRAY_CAST (int32) +TYPEDEF_FN_ARRAY_CAST (int64) +TYPEDEF_FN_ARRAY_CAST (float32) +TYPEDEF_FN_ARRAY_CAST (float64) + +typedef void (*fn_activation_relu)( + void *inp, cc_int32 elems, cc_dtype dt); +typedef void (*fn_activation_relu6)( + void *inp, cc_int32 elems, cc_dtype dt); + +typedef void (*fn_activation_softmax)( + void *inp, cc_int32 elems, cc_dtype dt); + +typedef void (*fn_max_pool2d)(const void *inp, void *oup, + cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt); + +typedef void (*fn_avg_pool2d)(const void *inp, void *oup, + cc_int32 x, cc_int32 y, cc_int32 s, cc_dtype dt); + +typedef void (*fn_conv2d)(const void *inp, void *oup, + cc_int32 x,cc_int32 y, cc_int32 oup_x, cc_int32 oup_y, + cc_int32 sx, cc_int32 sy, const void *filter, + cc_int32 fw, cc_dtype dt); + +typedef void (*fn_fully_connected)(const void *inp, + void *oup, const void *w, const void *b, + cc_int32 iw, cc_int32 ow, cc_dtype dt); + +typedef void (*fn_batch_norm)(void *inp, + cc_int32 len, const void *bnpara, cc_dtype dt); #ifdef __cplusplus } diff --git a/util/lua2cc.lua b/util/lua2cc.lua index 37a22ac..90dd14d 100644 --- a/util/lua2cc.lua +++ b/util/lua2cc.lua @@ -277,10 +277,10 @@ reshape = function(args) if info.layerId - 1 < 1 then assert(nil, "must specify an input for the 1st layer") end - info.input = layerOutputs[info.layerId - 1] + info.input = string.format("@%d", info.layerId - 1) end local code = string.format( - "%s = cc_tensor_reshape(%s, __shape%d);", + "%s = cc_reshape(%s, __shape%d);", output, info.input, info.shapeId) layerOutputs[ret.layerId] = output return code