-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCPU jobs; AVX.cpp
158 lines (151 loc) · 13.7 KB
/
CPU jobs; AVX.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
/************************************************************
* File: CPU jobs; AVX.cpp Created: 2025/01/23 *
* Last mod.: 2025/02/17 *
* *
* Desc: *
* *
* MIT license Copyright (c) David William Bull *
************************************************************/
#include <typedefs.h>
// Pastes `code` four times back to back (manual 4x unroll of a statement list).
// Any comma in `code` has to sit inside parentheses, otherwise the
// preprocessor treats it as an argument separator.
#ifndef UNLOOPx4
#define UNLOOPx4(code) code code code code
#endif
// Per-lane absolute value of four packed doubles: each 64-bit lane is ANDed
// with 0x7FFFFFFFFFFFFFFF, i.e. every bit except the topmost (sign) bit is kept.
// The integer mask is reinterpreted with _mm256_castsi256_pd rather than a
// reference cast on the temporary returned by _mm256_set1_epi64x; the latter
// is not valid standard C++ and only builds where the compiler allows it as
// an extension.
#ifndef _mm256_abs_pd
#define _mm256_abs_pd(input) _mm256_and_pd(_mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF)), (input))
#endif
// SIMD AVX operations only.
// Rewrites x in place, lane by lane, through a fixed chain of packed-double
// intrinsics:
//   stage : x = sqrt(num) / (|1 - sqrt(sqrt(x / den))| + eps)
//   pass  : the four stages listed below, pasted 4x by UNLOOPx4, followed by
//           x = x * (x * 1.01010101010101 + 0.00021)
// A call runs 16 passes.
// NOTE(review): the constants look arbitrary - presumably this is a synthetic
// load for the "CPU jobs" the file is named after; confirm with the caller.
//
// The stage is a macro (not a function) so that every use expands to exactly
// the intrinsic expression a hand-written statement would contain.
#define JOB_AVX_STAGE(v, num, den, eps) \
    v = _mm256_div_pd(_mm256_sqrt_pd(_mm256_set1_pd(num)), _mm256_add_pd(_mm256_abs_pd(_mm256_sub_pd(_mm256_set1_pd(1.0), \
        _mm256_sqrt_pd(_mm256_sqrt_pd(_mm256_div_pd(v, _mm256_set1_pd(den)))))), _mm256_set1_pd(eps)))
void JobAVX2(fl64x4& x) {
    for(ui8 pass = 0; pass < 16; ++pass) {
        UNLOOPx4(
            JOB_AVX_STAGE(x, 1.12, 2.01,  0.0001);
            JOB_AVX_STAGE(x, 0.91, 2.011, 0.001);
            JOB_AVX_STAGE(x, 1.15, 2.01,  0.01);
            JOB_AVX_STAGE(x, 0.85, 2.009, 0.1);
        )
        // Tail of the pass: x = x * (x * 1.01010101010101 + 0.00021)
        x = _mm256_mul_pd(
            x,
            _mm256_add_pd(_mm256_mul_pd(x, _mm256_set1_pd(1.01010101010101)), _mm256_set1_pd(0.00021)));
    }
}
#undef JOB_AVX_STAGE
// ALU + SIMD AVX2 operations only.
// Same packed-double chain as JobAVX2 on x, with one scalar integer step on y
// interleaved after every stage:
//   stage : x = sqrt(num) / (|1 - sqrt(sqrt(x / den))| + eps)         (per lane)
//   step  : y = y * mul;  y = (~(y << 1)) / div - sub                 (`^ -1` is a bitwise NOT)
//   pass  : the four stage/step pairs below, pasted 4x by UNLOOPx4, followed by
//           x = x * (x * 1.01010101010101 + 0.00021)   (y is not touched by the tail)
// A call runs 16 passes.
// The multipliers are integer constant expressions: 789ull / 13 + 501 == 561,
// 791ull / 14 + 502 == 558 and 787ull / 11 + 500 == 571. Because they are
// unsigned, the multiplication is carried out in unsigned arithmetic.
// NOTE(review): the constants look arbitrary - presumably a synthetic load;
// confirm with the caller.
//
// Both helpers are macros (not functions) so they expand inline exactly where used.
#define JOB_AVX_STAGE(v, num, den, eps) \
    v = _mm256_div_pd(_mm256_sqrt_pd(_mm256_set1_pd(num)), _mm256_add_pd(_mm256_abs_pd(_mm256_sub_pd(_mm256_set1_pd(1.0), \
        _mm256_sqrt_pd(_mm256_sqrt_pd(_mm256_div_pd(v, _mm256_set1_pd(den)))))), _mm256_set1_pd(eps)))
// The left shift is performed on the unsigned representation and converted
// back: y can be negative here, and shifting a negative signed value left is
// undefined behaviour before C++20. The resulting bit pattern is the same.
// NOTE(review): iter only takes the values 0..15 in the loop below, so the
// `< 32` test is always true and the right-shift arm never runs; kept as
// written - confirm whether a different threshold was intended.
#define JOB_ALU_STEP(v, iter, mul, div, sub) \
    v *= mul; \
    v = (((iter) < 32 ? static_cast<si64>(static_cast<unsigned long long>(v) << 1) : v >> 1) ^ -1) / div - sub
void JobALU_AVX2(fl64x4& x, si64& y) {
    for(ui8 i = 0; i < 16; ++i) {
        UNLOOPx4(
            JOB_AVX_STAGE(x, 1.12, 2.01,  0.0001);
            JOB_ALU_STEP(y, i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x, 0.91, 2.011, 0.001);
            JOB_ALU_STEP(y, i, 791ull / 14 + 502, 9, 294941);
            JOB_AVX_STAGE(x, 1.15, 2.01,  0.01);
            JOB_ALU_STEP(y, i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x, 0.85, 2.009, 0.1);
            JOB_ALU_STEP(y, i, 787ull / 11 + 500, 5, 294937);
        )
        // Tail of the pass: x = x * (x * 1.01010101010101 + 0.00021)
        x = _mm256_mul_pd(x, _mm256_add_pd(_mm256_mul_pd(x, _mm256_set1_pd(1.01010101010101)), _mm256_set1_pd(0.00021)));
    }
}
#undef JOB_ALU_STEP
#undef JOB_AVX_STAGE
// Array variant of the packed-double job: instead of one value it reads and
// writes the four elements x[0]..x[3], so x must address at least four
// elements (fl64x4ptrc is declared outside this file section - presumably a
// const pointer to fl64x4; confirm).
//   stage : v = sqrt(num) / (|1 - sqrt(sqrt(v / den))| + eps)          (per lane)
//   tail  : v = v * (v * 1.01010101010101 + 0.00021)
// Within a pass each stage is applied to the elements in the order
// x[0], x[2], x[1], x[3] before the next stage starts; the four-stage group is
// pasted 4x by UNLOOPx4 and then the tail is applied in the same element order.
// A call runs 16 passes. The elements never feed into one another.
//
// Stage and tail are macros (not functions) so that every use expands to
// exactly the intrinsic expression a hand-written statement would contain.
#define JOB_AVX_STAGE(v, num, den, eps) \
    v = _mm256_div_pd(_mm256_sqrt_pd(_mm256_set1_pd(num)), _mm256_add_pd(_mm256_abs_pd(_mm256_sub_pd(_mm256_set1_pd(1.0), \
        _mm256_sqrt_pd(_mm256_sqrt_pd(_mm256_div_pd(v, _mm256_set1_pd(den)))))), _mm256_set1_pd(eps)))
#define JOB_AVX_TAIL(v) \
    v = _mm256_mul_pd(v, _mm256_add_pd(_mm256_mul_pd(v, _mm256_set1_pd(1.01010101010101)), _mm256_set1_pd(0.00021)))
void JobMemAVX2(fl64x4ptrc x) {
    for(ui8 pass = 0; pass < 16; ++pass) {
        UNLOOPx4(
            JOB_AVX_STAGE(x[0], 1.12, 2.01, 0.0001);
            JOB_AVX_STAGE(x[2], 1.12, 2.01, 0.0001);
            JOB_AVX_STAGE(x[1], 1.12, 2.01, 0.0001);
            JOB_AVX_STAGE(x[3], 1.12, 2.01, 0.0001);

            JOB_AVX_STAGE(x[0], 0.91, 2.011, 0.001);
            JOB_AVX_STAGE(x[2], 0.91, 2.011, 0.001);
            JOB_AVX_STAGE(x[1], 0.91, 2.011, 0.001);
            JOB_AVX_STAGE(x[3], 0.91, 2.011, 0.001);

            JOB_AVX_STAGE(x[0], 1.15, 2.01, 0.01);
            JOB_AVX_STAGE(x[2], 1.15, 2.01, 0.01);
            JOB_AVX_STAGE(x[1], 1.15, 2.01, 0.01);
            JOB_AVX_STAGE(x[3], 1.15, 2.01, 0.01);

            JOB_AVX_STAGE(x[0], 0.85, 2.009, 0.1);
            JOB_AVX_STAGE(x[2], 0.85, 2.009, 0.1);
            JOB_AVX_STAGE(x[1], 0.85, 2.009, 0.1);
            JOB_AVX_STAGE(x[3], 0.85, 2.009, 0.1);
        )
        JOB_AVX_TAIL(x[0]);
        JOB_AVX_TAIL(x[2]);
        JOB_AVX_TAIL(x[1]);
        JOB_AVX_TAIL(x[3]);
    }
}
#undef JOB_AVX_TAIL
#undef JOB_AVX_STAGE
// Array variant of the combined job: works on x[0]..x[3] and y[0]..y[3], so
// both pointers must address at least four elements (fl64x4ptrc / si64ptrc are
// declared outside this file section - presumably const pointers to fl64x4 and
// to a 64-bit signed integer; confirm).
//   stage : v = sqrt(num) / (|1 - sqrt(sqrt(v / den))| + eps)          (per lane)
//   step  : w = w * mul;  w = (~(w << 1)) / div - sub                  (`^ -1` is a bitwise NOT)
//   tail  : v = v * (v * 1.01010101010101 + 0.00021)                   (x only)
// Within a pass each stage/step pair is applied to index 0, 2, 1, 3 (in that
// order) before the next pair starts; the four-pair group is pasted 4x by
// UNLOOPx4 and then the tail is applied to x[0], x[2], x[1], x[3].
// A call runs 16 passes.
// The multipliers are integer constant expressions: 789ull / 13 + 501 == 561,
// 791ull / 14 + 502 == 558 and 787ull / 11 + 500 == 571. Because they are
// unsigned, the multiplication is carried out in unsigned arithmetic.
//
// All helpers are macros (not functions) so they expand inline exactly where used.
#define JOB_AVX_STAGE(v, num, den, eps) \
    v = _mm256_div_pd(_mm256_sqrt_pd(_mm256_set1_pd(num)), _mm256_add_pd(_mm256_abs_pd(_mm256_sub_pd(_mm256_set1_pd(1.0), \
        _mm256_sqrt_pd(_mm256_sqrt_pd(_mm256_div_pd(v, _mm256_set1_pd(den)))))), _mm256_set1_pd(eps)))
#define JOB_AVX_TAIL(v) \
    v = _mm256_mul_pd(v, _mm256_add_pd(_mm256_mul_pd(v, _mm256_set1_pd(1.01010101010101)), _mm256_set1_pd(0.00021)))
// The left shift is performed on the unsigned representation and converted
// back: the value can be negative here, and shifting a negative signed value
// left is undefined behaviour before C++20. The resulting bit pattern is the
// same. The cast back assumes the elements of y are 64-bit signed integers
// - TODO confirm against the si64ptrc typedef.
// NOTE(review): iter only takes the values 0..15 in the loop below, so the
// `< 32` test is always true and the right-shift arm never runs; kept as
// written - confirm whether a different threshold was intended.
#define JOB_ALU_STEP(w, iter, mul, div, sub) \
    w *= mul; \
    w = (((iter) < 32 ? static_cast<long long>(static_cast<unsigned long long>(w) << 1) : w >> 1) ^ -1) / div - sub
void JobMemALU_AVX2(fl64x4ptrc x, si64ptrc y) {
    for(ui8 i = 0; i < 16; ++i) {
        UNLOOPx4(
            JOB_AVX_STAGE(x[0], 1.12, 2.01, 0.0001);
            JOB_ALU_STEP(y[0], i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x[2], 1.12, 2.01, 0.0001);
            JOB_ALU_STEP(y[2], i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x[1], 1.12, 2.01, 0.0001);
            JOB_ALU_STEP(y[1], i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x[3], 1.12, 2.01, 0.0001);
            JOB_ALU_STEP(y[3], i, 789ull / 13 + 501, 7, 294939);

            JOB_AVX_STAGE(x[0], 0.91, 2.011, 0.001);
            JOB_ALU_STEP(y[0], i, 791ull / 14 + 502, 9, 294941);
            JOB_AVX_STAGE(x[2], 0.91, 2.011, 0.001);
            JOB_ALU_STEP(y[2], i, 791ull / 14 + 502, 9, 294941);
            JOB_AVX_STAGE(x[1], 0.91, 2.011, 0.001);
            JOB_ALU_STEP(y[1], i, 791ull / 14 + 502, 9, 294941);
            JOB_AVX_STAGE(x[3], 0.91, 2.011, 0.001);
            JOB_ALU_STEP(y[3], i, 791ull / 14 + 502, 9, 294941);

            JOB_AVX_STAGE(x[0], 1.15, 2.01, 0.01);
            JOB_ALU_STEP(y[0], i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x[2], 1.15, 2.01, 0.01);
            JOB_ALU_STEP(y[2], i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x[1], 1.15, 2.01, 0.01);
            JOB_ALU_STEP(y[1], i, 789ull / 13 + 501, 7, 294939);
            JOB_AVX_STAGE(x[3], 1.15, 2.01, 0.01);
            JOB_ALU_STEP(y[3], i, 789ull / 13 + 501, 7, 294939);

            JOB_AVX_STAGE(x[0], 0.85, 2.009, 0.1);
            JOB_ALU_STEP(y[0], i, 787ull / 11 + 500, 5, 294937);
            JOB_AVX_STAGE(x[2], 0.85, 2.009, 0.1);
            JOB_ALU_STEP(y[2], i, 787ull / 11 + 500, 5, 294937);
            JOB_AVX_STAGE(x[1], 0.85, 2.009, 0.1);
            JOB_ALU_STEP(y[1], i, 787ull / 11 + 500, 5, 294937);
            JOB_AVX_STAGE(x[3], 0.85, 2.009, 0.1);
            JOB_ALU_STEP(y[3], i, 787ull / 11 + 500, 5, 294937);
        )
        JOB_AVX_TAIL(x[0]);
        JOB_AVX_TAIL(x[2]);
        JOB_AVX_TAIL(x[1]);
        JOB_AVX_TAIL(x[3]);
    }
}
#undef JOB_ALU_STEP
#undef JOB_AVX_TAIL
#undef JOB_AVX_STAGE