-
Notifications
You must be signed in to change notification settings - Fork 1
/
riscv-asm.c
72 lines (63 loc) · 2.21 KB
/
riscv-asm.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <assert.h>
#include <sys/time.h>
#define INT8_MACS 32
#define INT32_LANES 4
/*
 * This kernel dynamically shifts between RVV modes (vsetvli).
 *
 * On TH1520, vsetvli internal reconfiguration is VERY EXPENSIVE
 * and this kernel drags down to ~1/5 of its peak performance.
 *
 * Computes INT32_LANES dot products of INT8_MACS elements each:
 *
 *   output[i] = sum over j of data[j] * kernel[i * INT8_MACS + j]
 *
 * output: receives INT32_LANES 32-bit results.
 * data:   INT8_MACS unsigned 8-bit values, shared by every lane.
 * kernel: INT32_LANES rows of INT8_MACS signed 8-bit values, stored
 *         back to back.
 *
 * Always returns 0.
 *
 * On RISC-V targets the work is done by the vector assembly below
 * (RVV 0.7.1 mnemonics); on any other target a scalar loop computing the
 * same sums is compiled instead, so results can be cross-checked.
 */
int32_t __attribute__((noinline))
dot_vec_dynamic(int32_t* output,
                const uint8_t* data,
                const int8_t* kernel) {
#if defined(__riscv)
  // LMUL=2, instructions with odd register are illegal instructions
  // LMUL=4, vector registers are incremented by 4, else illegal instructions
  // LMUL=8, only v0, v8, v16, and v24 are valid vector registers
  //
  // Register-group multipliers: number of 128-bit vector registers needed
  // to hold `count` elements of `sew` bits, i.e. ceil(count * sew / 128).
  // They are enum constants (integer constant expressions) so the "I"
  // immediate constraints are satisfiable at every optimization level.
  enum {
    VLEN_BITS = 128,
    E8_LMUL   = (INT8_MACS   *  8 + VLEN_BITS - 1) / VLEN_BITS,
    E16_LMUL  = (INT8_MACS   * 16 + VLEN_BITS - 1) / VLEN_BITS,
    E32_LMUL  = (INT32_LANES * 32 + VLEN_BITS - 1) / VLEN_BITS
  };

  // The assembly advances the kernel pointer row by row, so it must be a
  // read-write operand; input-only operands may not be modified.
  const int8_t* kern_row = kernel;

  __asm__ __volatile__ (
    "  li          a4, %[n_elem]                   \n"
    "  li          a5, %[n_lane]                   \n"
    // init: a7 is the lane counter / slide offset
    "  li          a7, 0                           \n"
    "  vmv.s.x     v0, zero                        \n"
    // load data (unsigned bytes) once; reused for every lane
    "  vsetvli     t4, a4, e8, m%[e8m], d1         \n"
    "  vlbu.v      v8, (%[data])                   \n"
    // multiply-accumulate, one kernel row per iteration
    "1:                                            \n"
    "  vsetvli     t5, a4, e8, m%[e8m], d1         \n"
    "  vlb.v       v0, (%[kern])                   \n"
    // signed kernel x unsigned data, widened to 16 bits
    "  vwmulsu.vv  v16, v0, v8                     \n"
    "  vsetvli     t4, a4, e16, m%[e16m], d1       \n"
    "  vmv.s.x     v0, zero                        \n"
    // widening reduction of the 16-bit products into v0[0]
    "  vwredsum.vs v0, v16, v0                     \n"
    "  add         %[kern], %[kern], a4            \n"
    "  vsetvli     t5, a5, e32, m%[e32m], d1       \n"
    // place this lane's sum at index a7 of the result vector
    "  vslideup.vx v24, v0, a7                     \n"
    "  addi        a7, a7, 1                       \n"
    "  bne         a7, a5, 1b                      \n"
    // store
    "  vsw.v       v24, (%[outb])                  \n"
    : [kern]   "+&r" (kern_row)
    : [n_elem] "I" (INT8_MACS),
      [n_lane] "I" (INT32_LANES),
      [data]   "r" (data),
      [outb]   "r" (output),
      [e8m]    "I" (E8_LMUL),
      [e16m]   "I" (E16_LMUL),
      [e32m]   "I" (E32_LMUL)
    // Scratch scalar registers, every vector register of the groups used
    // above, and the memory accessed through the pointer operands.
    : "a4", "a5", "a7", "t4", "t5",
      "v0", "v1", "v8", "v9",
      "v16", "v17", "v18", "v19", "v24",
      "memory"
  );
#else
  // Portable reference path for non-RISC-V builds.
  for (int lane = 0; lane < INT32_LANES; lane++) {
    const int8_t* row = kernel + lane * INT8_MACS;
    int32_t acc = 0;
    for (int j = 0; j < INT8_MACS; j++) {
      acc += (int32_t)row[j] * (int32_t)data[j];
    }
    output[lane] = acc;
  }
#endif
  return 0;
}