-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCPU jobs; AVX512.cpp
154 lines (148 loc) · 13.5 KB
/
CPU jobs; AVX512.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/************************************************************
* File: CPU jobs; AVX512.cpp Created: 2025/01/23 *
* Last mod.: 2025/02/17 *
* *
* Desc: *
* *
* MIT license Copyright (c) David William Bull *
************************************************************/
#include <typedefs.h>
// Fallback definition of the manual 4x unrolling helper: pastes its argument four
// times back to back. Skipped when UNLOOPx4 is already defined (e.g. by an earlier include).
#if !defined(UNLOOPx4)
#define UNLOOPx4(body) body body body body
#endif // UNLOOPx4
// SIMD AVX512 operations only.
// Updates the eight packed doubles in `x` in place: 16 passes, each running a chain of
// four stages four times over (UNLOOPx4) followed by one quadratic rescale.
void JobAVX512(fl64x8 &x) {
   // One stage, lane-wise: sqrt(scale) / (|1 - fourth_root(v / divisor)| + bias).
   const auto stage = [](const fl64x8 &v, const double scale, const double divisor, const double bias) -> fl64x8 {
      const fl64x8 fourthRoot = _mm512_sqrt_pd(_mm512_sqrt_pd(_mm512_div_pd(v, _mm512_set1_pd(divisor))));
      const fl64x8 distance   = _mm512_abs_pd(_mm512_sub_pd(_mm512_set1_pd(1.0), fourthRoot));
      const fl64x8 denom      = _mm512_add_pd(distance, _mm512_set1_pd(bias));
      return _mm512_div_pd(_mm512_sqrt_pd(_mm512_set1_pd(scale)), denom);
   };

   for(ui8 pass = 0; pass < 16; ++pass) {
      UNLOOPx4(
         x = stage(x, 1.12, 2.01,  0.0001);
         x = stage(x, 0.91, 2.011, 0.001);
         x = stage(x, 1.15, 2.01,  0.01);
         x = stage(x, 0.85, 2.009, 0.1);
      )
      // x = x * (x * 1.01010101010101 + 0.00021)
      const fl64x8 affine = _mm512_add_pd(_mm512_mul_pd(x, _mm512_set1_pd(1.01010101010101)), _mm512_set1_pd(0.00021));
      x = _mm512_mul_pd(x, affine);
   }
}
// SIMD AVX512 operations interleaved with scalar integer (ALU) operations.
// Updates the eight packed doubles in `x` and the integer `y` in place: 16 passes, each
// running four (vector stage, integer step) pairs four times over (UNLOOPx4), followed by
// one quadratic rescale of `x`. `x` and `y` do not feed into each other.
void JobALU_AVX512(fl64x8 &x, si64& y) {
   // One vector stage, lane-wise: sqrt(scale) / (|1 - fourth_root(v / divisor)| + bias).
   const auto stage = [](const fl64x8 &v, const double scale, const double divisor, const double bias) -> fl64x8 {
      const fl64x8 fourthRoot = _mm512_sqrt_pd(_mm512_sqrt_pd(_mm512_div_pd(v, _mm512_set1_pd(divisor))));
      const fl64x8 distance   = _mm512_abs_pd(_mm512_sub_pd(_mm512_set1_pd(1.0), fourthRoot));
      const fl64x8 denom      = _mm512_add_pd(distance, _mm512_set1_pd(bias));
      return _mm512_div_pd(_mm512_sqrt_pd(_mm512_set1_pd(scale)), denom);
   };

   // One integer step: y = (~shift(y * factor)) / divisor - offset.
   const auto mix = [&y](const ui8 i, const unsigned long long factor, const si64 divisor, const si64 offset) {
      // Unsigned multiply, so wrap-around is well defined.
      y *= factor;
      // The left shift is done on the unsigned representation: shifting a negative signed
      // value left is undefined before C++20, and y is routinely negative here. The result
      // is the two's-complement value C++20 specifies. Assumes si64 is a 64-bit signed
      // integer - TODO confirm against typedefs.h.
      // NOTE(review): this loop stops at 16, so `i < 32` is always true and the right-shift
      // arm is never taken; kept unchanged because the intended threshold is unclear.
      const si64 shifted = i < 32 ? static_cast<si64>(static_cast<unsigned long long>(y) << 1) : y >> 1;
      y = (shifted ^ -1) / divisor - offset;
   };

   for(ui8 pass = 0; pass < 16; ++pass) {
      UNLOOPx4(
         x = stage(x, 1.12, 2.01,  0.0001);   mix(pass, 789ull / 13 + 501, 7, 294939);
         x = stage(x, 0.91, 2.011, 0.001);    mix(pass, 791ull / 14 + 502, 9, 294941);
         x = stage(x, 1.15, 2.01,  0.01);     mix(pass, 789ull / 13 + 501, 7, 294939);
         x = stage(x, 0.85, 2.009, 0.1);      mix(pass, 787ull / 11 + 500, 5, 294937);
      )
      // x = x * (x * 1.01010101010101 + 0.00021)
      const fl64x8 affine = _mm512_add_pd(_mm512_mul_pd(x, _mm512_set1_pd(1.01010101010101)), _mm512_set1_pd(0.00021));
      x = _mm512_mul_pd(x, affine);
   }
}
// AVX512 workload over four vectors addressed through a pointer.
// Updates x[0]..x[3] in place (so `x` must point to at least four elements): 16 passes,
// each running four stages four times over (UNLOOPx4) followed by one quadratic rescale.
// Within every stage the slots are visited in the order 0, 2, 1, 3.
void JobMemAVX512(fl64x8ptrc x) {
   // One stage, lane-wise: sqrt(scale) / (|1 - fourth_root(v / divisor)| + bias).
   const auto stage = [](const fl64x8 &v, const double scale, const double divisor, const double bias) -> fl64x8 {
      const fl64x8 fourthRoot = _mm512_sqrt_pd(_mm512_sqrt_pd(_mm512_div_pd(v, _mm512_set1_pd(divisor))));
      const fl64x8 distance   = _mm512_abs_pd(_mm512_sub_pd(_mm512_set1_pd(1.0), fourthRoot));
      const fl64x8 denom      = _mm512_add_pd(distance, _mm512_set1_pd(bias));
      return _mm512_div_pd(_mm512_sqrt_pd(_mm512_set1_pd(scale)), denom);
   };

   // Applies one stage to all four slots.
   const auto sweep = [&](const double scale, const double divisor, const double bias) {
      x[0] = stage(x[0], scale, divisor, bias);
      x[2] = stage(x[2], scale, divisor, bias);
      x[1] = stage(x[1], scale, divisor, bias);
      x[3] = stage(x[3], scale, divisor, bias);
   };

   // v * (v * 1.01010101010101 + 0.00021)
   const auto rescale = [](const fl64x8 &v) -> fl64x8 {
      const fl64x8 affine = _mm512_add_pd(_mm512_mul_pd(v, _mm512_set1_pd(1.01010101010101)), _mm512_set1_pd(0.00021));
      return _mm512_mul_pd(v, affine);
   };

   for(ui8 pass = 0; pass < 16; ++pass) {
      UNLOOPx4(
         sweep(1.12, 2.01,  0.0001);
         sweep(0.91, 2.011, 0.001);
         sweep(1.15, 2.01,  0.01);
         sweep(0.85, 2.009, 0.1);
      )
      x[0] = rescale(x[0]);
      x[2] = rescale(x[2]);
      x[1] = rescale(x[1]);
      x[3] = rescale(x[3]);
   }
}
// AVX512 plus scalar integer (ALU) workload over four slots addressed through pointers.
// Updates x[0]..x[3] and y[0]..y[3] in place (both pointers must address at least four
// elements): 16 passes, each running four stages four times over (UNLOOPx4) followed by
// one quadratic rescale of the vectors. Within every stage the slots are visited in the
// order 0, 2, 1, 3, and each vector update is followed by the integer update of the same slot.
void JobMemALU_AVX512(fl64x8ptrc x, si64ptrc y) {
   // One vector stage, lane-wise: sqrt(scale) / (|1 - fourth_root(v / divisor)| + bias).
   const auto stage = [](const fl64x8 &v, const double scale, const double divisor, const double bias) -> fl64x8 {
      const fl64x8 fourthRoot = _mm512_sqrt_pd(_mm512_sqrt_pd(_mm512_div_pd(v, _mm512_set1_pd(divisor))));
      const fl64x8 distance   = _mm512_abs_pd(_mm512_sub_pd(_mm512_set1_pd(1.0), fourthRoot));
      const fl64x8 denom      = _mm512_add_pd(distance, _mm512_set1_pd(bias));
      return _mm512_div_pd(_mm512_sqrt_pd(_mm512_set1_pd(scale)), denom);
   };

   // One integer step on a single slot: v = (~shift(v * factor)) / divisor - offset.
   const auto mix = [](si64 &v, const ui8 i, const unsigned long long factor, const si64 divisor, const si64 offset) {
      // Unsigned multiply, so wrap-around is well defined.
      v *= factor;
      // The left shift is done on the unsigned representation: shifting a negative signed
      // value left is undefined before C++20, and v is routinely negative here. The result
      // is the two's-complement value C++20 specifies. Assumes si64 is a 64-bit signed
      // integer - TODO confirm against typedefs.h.
      // NOTE(review): this loop stops at 16, so `i < 32` is always true and the right-shift
      // arm is never taken; kept unchanged because the intended threshold is unclear.
      const si64 shifted = i < 32 ? static_cast<si64>(static_cast<unsigned long long>(v) << 1) : v >> 1;
      v = (shifted ^ -1) / divisor - offset;
   };

   // Applies one vector stage and one integer step to all four slots.
   const auto sweep = [&](const ui8 i, const double scale, const double divisor, const double bias,
                          const unsigned long long factor, const si64 intDivisor, const si64 offset) {
      x[0] = stage(x[0], scale, divisor, bias);   mix(y[0], i, factor, intDivisor, offset);
      x[2] = stage(x[2], scale, divisor, bias);   mix(y[2], i, factor, intDivisor, offset);
      x[1] = stage(x[1], scale, divisor, bias);   mix(y[1], i, factor, intDivisor, offset);
      x[3] = stage(x[3], scale, divisor, bias);   mix(y[3], i, factor, intDivisor, offset);
   };

   // v * (v * 1.01010101010101 + 0.00021)
   const auto rescale = [](const fl64x8 &v) -> fl64x8 {
      const fl64x8 affine = _mm512_add_pd(_mm512_mul_pd(v, _mm512_set1_pd(1.01010101010101)), _mm512_set1_pd(0.00021));
      return _mm512_mul_pd(v, affine);
   };

   for(ui8 pass = 0; pass < 16; ++pass) {
      UNLOOPx4(
         sweep(pass, 1.12, 2.01,  0.0001, 789ull / 13 + 501, 7, 294939);
         sweep(pass, 0.91, 2.011, 0.001,  791ull / 14 + 502, 9, 294941);
         sweep(pass, 1.15, 2.01,  0.01,   789ull / 13 + 501, 7, 294939);
         sweep(pass, 0.85, 2.009, 0.1,    787ull / 11 + 500, 5, 294937);
      )
      x[0] = rescale(x[0]);
      x[2] = rescale(x[2]);
      x[1] = rescale(x[1]);
      x[3] = rescale(x[3]);
   }
}