Skip to content

Commit

Permalink
rewrite EF and arithmetic to avoid uint32_t
Browse files Browse the repository at this point in the history
Co-authored-by: Ward Beullens <ward@beullens.com>
Co-authored-by: Fabio Campos <campos@sopmac.de>
Co-authored-by: Sofía Celi <sceli@brave.com>
Co-authored-by: Basil Hess <bhe@zurich.ibm.com>
Co-authored-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
  • Loading branch information
5 people committed Oct 24, 2023
1 parent d81b689 commit 442f21e
Show file tree
Hide file tree
Showing 19 changed files with 521 additions and 503 deletions.
5 changes: 4 additions & 1 deletion .cmake/flags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ if(MSVC)
else()
set(STRICT_OPTIONS_CXX "${STRICT_OPTIONS_CXX} -std=c++14 -O2")
set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wall -Wuninitialized -Wno-deprecated-declarations -Wno-missing-field-initializers")
set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -O3 -Wstrict-prototypes -Wno-error=strict-prototypes -fvisibility=hidden -funroll-loops -Wno-error=implicit-function-declaration -Wno-error=attributes")
if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -O3")
endif()
set(STRICT_OPTIONS_C "${STRICT_OPTIONS_C} -Wstrict-prototypes -Wno-error=strict-prototypes -fvisibility=hidden -funroll-loops -Wno-error=implicit-function-declaration -Wno-error=attributes")
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wno-error=unknown-warning-option -Qunused-arguments -Wno-tautological-compare")
set(STRICT_OPTIONS_CPP "${STRICT_OPTIONS_CPP} -Wno-unused-function -Wno-pass-failed")
Expand Down
34 changes: 24 additions & 10 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,30 +92,44 @@ jobs:
rm -rf build
cmake -Bbuild -DENABLE_CT_TESTING=ON -DCMAKE_BUILD_TYPE=Debug -DMAYO_BUILD_TYPE=${{ matrix.mayo_build_type }} -DMAYO_MARCH="-march=haswell -maes"
cmake --build build
valgrind --track-origins=yes build/test/mayo_test_scheme_MAYO_1
valgrind --track-origins=yes build/test/mayo_test_scheme_MAYO_2
valgrind --track-origins=yes build/test/mayo_test_scheme_MAYO_3
valgrind --track-origins=yes build/test/mayo_test_scheme_MAYO_5
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_1
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_2
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_3
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_5
if: matrix.mayo_build_type == 'opt' || matrix.mayo_build_type == 'avx2'

- name: CT-Tests
run: |
rm -rf build
cmake -Bbuild -DENABLE_CT_TESTING=ON -DCMAKE_BUILD_TYPE=Debug -DMAYO_BUILD_TYPE=${{ matrix.mayo_build_type }} -DMAYO_MARCH="-march=haswell -maes"
cmake --build build
valgrind --track-origins=yes build/test/mayo_test_scheme MAYO_1
valgrind --track-origins=yes build/test/mayo_test_scheme MAYO_2
valgrind --track-origins=yes build/test/mayo_test_scheme MAYO_3
valgrind --track-origins=yes build/test/mayo_test_scheme MAYO_5
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_1
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_2
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_3
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_5
if: matrix.mayo_build_type == 'ref'

- name: Memcheck
run: |
rm -rf build
cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DMAYO_BUILD_TYPE=${{ matrix.mayo_build_type }} -DMAYO_MARCH="-march=haswell -maes"
cmake --build build
ctest -T memcheck --test-dir build
# valgrind-ci unit_tests_valgrind.xml --summary
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_1
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_2
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_3
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme MAYO_5
if: matrix.mayo_build_type == 'ref'

- name: Memcheck
run: |
rm -rf build
cmake -Bbuild -DCMAKE_BUILD_TYPE=Debug -DMAYO_BUILD_TYPE=${{ matrix.mayo_build_type }} -DMAYO_MARCH="-march=haswell -maes"
cmake --build build
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_1
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_2
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_3
valgrind --error-exitcode=1 --track-origins=yes build/test/mayo_test_scheme_MAYO_5
if: matrix.mayo_build_type == 'opt' || matrix.mayo_build_type == 'avx2'

- name: Address Sanitizer ASAN
run: |
Expand Down
2 changes: 1 addition & 1 deletion include/mayo.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ typedef struct {
} mayo_params_t;

typedef struct sk_t {
uint32_t p[P1_BYTES_MAX/4 + P2_BYTES_MAX/4];
uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
uint8_t o[O_BYTES_MAX];
} sk_t;

Expand Down
12 changes: 12 additions & 0 deletions include/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,23 @@ static inline uint32_t ct_is_greater_than(int a, int b) {
return (uint32_t) (diff >> (8*sizeof(uint32_t)-1));
}

// Constant-time "a > b" test that yields a 64-bit mask.
// a > b  <=>  b - a is negative. The subtraction is carried out in int64_t,
// so it cannot overflow as long as int is narrower than 64 bits.
// returns 0xFFFFFFFFFFFFFFFF if a > b, 0x0000000000000000 otherwise
static inline uint64_t ct_64_is_greater_than(int a, int b) {
    const int64_t diff = ((int64_t) b) - ((int64_t) a);
    // Extract the sign bit as an unsigned 0/1 and stretch it into a full mask.
    // This avoids right-shifting a negative signed value, whose result is
    // implementation-defined in C.
    return (uint64_t) 0 - (((uint64_t) diff) >> (8*sizeof(uint64_t)-1));
}

// Constant-time inequality test that yields a 32-bit mask.
// if a == b -> 0x00000000, else 0xFFFFFFFF
static inline uint32_t ct_compare_32(int a, int b) {
    // Work on the unsigned bit pattern of the XOR: negating the signed value
    // gives the wrong sign when (a ^ b) is negative (e.g. a = -1, b = 0) and
    // overflows for INT_MIN.
    const uint32_t x = (uint32_t)(a ^ b);
    // (x | -x) has its top bit set exactly when x != 0; stretch that bit
    // into a full mask using well-defined unsigned arithmetic.
    return (uint32_t) 0 - ((x | ((uint32_t) 0 - x)) >> (8*sizeof(uint32_t)-1));
}

// Constant-time inequality test that yields a 64-bit mask.
// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF
static inline uint64_t ct_compare_64(int a, int b) {
    // Zero-extend the XOR instead of sign-extending it: a negative (a ^ b)
    // (e.g. a = -1, b = 0) would otherwise become positive after negation
    // and be reported as "equal".
    const uint64_t x = (uint64_t)(unsigned int)(a ^ b);
    // (x | -x) has its top bit set exactly when x != 0; stretch that bit
    // into a full mask using well-defined unsigned arithmetic.
    return (uint64_t) 0 - ((x | ((uint64_t) 0 - x)) >> (8*sizeof(uint64_t)-1));
}

// if a == b -> 0x00, else 0xFF
static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
Expand Down
5 changes: 1 addition & 4 deletions src/AVX2/arithmetic_128.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,7 @@ inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc)
}

static
inline void multiply_bins_128(uint32_t *bins_32, uint32_t *out_32) {

uint64_t *bins = (uint64_t *) bins_32;
uint64_t *out = (uint64_t *) out_32;
inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {

m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8);
m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
Expand Down
5 changes: 1 addition & 4 deletions src/AVX2/arithmetic_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,7 @@ static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
}

static
inline void multiply_bins_64(uint32_t *bins_32, uint32_t *out_32) {

uint64_t *bins = (uint64_t *) bins_32;
uint64_t *out = (uint64_t *) out_32;
inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {

m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
Expand Down
5 changes: 1 addition & 4 deletions src/AVX2/arithmetic_96.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,7 @@ inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
}

static
inline void multiply_bins_96(uint32_t *bins_32, uint32_t *out_32) {

uint64_t *bins = (uint64_t *) bins_32;
uint64_t *out = (uint64_t *) out_32;
inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {

m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
Expand Down
20 changes: 10 additions & 10 deletions src/AVX2/shuffle_arithmetic_128.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

// P1*0 -> P1: v x v, O: v x o
static
inline void mayo_5_P1_times_O_avx2(const uint32_t *_P1, __m256i *O_multabs, uint32_t *_acc){
inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){

const __m256i *P1 = (__m256i *) _P1;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -57,7 +57,7 @@ inline void mayo_5_P1_times_O_avx2(const uint32_t *_P1, __m256i *O_multabs, uint
}

static
inline void mayo_5_Ot_times_P1O_P2_avx2(const uint32_t *_P1O_P2, __m256i *O_multabs, uint32_t *_acc){
inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
__m256i *acc = (__m256i *) _acc;
const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
Expand Down Expand Up @@ -99,7 +99,7 @@ inline void mayo_5_Ot_times_P1O_P2_avx2(const uint32_t *_P1O_P2, __m256i *O_mult


static
inline void mayo_5_P1P1t_times_O(const uint32_t *_P1, const unsigned char *O, uint32_t *_acc){
inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){

const __m256i *P1 = (__m256i *) _P1;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -175,7 +175,7 @@ inline void mayo_5_P1P1t_times_O(const uint32_t *_P1, const unsigned char *O, ui


static
inline void mayo_5_Vt_times_L_avx2(const uint32_t *_L, const __m256i *V_multabs, uint32_t *_acc){
inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){

const __m256i *L = (__m256i *) _L;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -227,7 +227,7 @@ inline void mayo_5_Vt_times_L_avx2(const uint32_t *_L, const __m256i *V_multabs,


static
inline void mayo_5_P1_times_Vt_avx2(const uint32_t *_P1, __m256i *V_multabs, uint32_t *_acc){
inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P1 = (__m256i *) _P1;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -280,7 +280,7 @@ inline void mayo_5_P1_times_Vt_avx2(const uint32_t *_P1, __m256i *V_multabs, uin
}

static
inline void mayo_5_Vt_times_Pv_avx2(const uint32_t *_Pv, const __m256i *V_multabs, uint32_t *_acc){
inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){

const __m256i *Pv = (__m256i *) _Pv;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -333,7 +333,7 @@ inline void mayo_5_Vt_times_Pv_avx2(const uint32_t *_Pv, const __m256i *V_multab

// P2*S2 -> P2: v x o, S2: o x k
static
inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint32_t *_P1, const uint32_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P1 = (__m256i *) _P1;
const __m256i *P2 = (__m256i *) _P2;
Expand Down Expand Up @@ -410,7 +410,7 @@ inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint32_t *_P1, const

// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
static
inline void mayo_5_P3_times_S2_avx2(const uint32_t *_P3, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P3 = (__m256i *) _P3;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -464,12 +464,12 @@ inline void mayo_5_P3_times_S2_avx2(const uint32_t *_P3, __m256i *S2_multabs, ui


static
inline void mayo_5_S1t_times_PS1_avx2(const uint32_t *_PS1, __m256i *S1_multabs, uint32_t *_acc){
inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
}

static
inline void mayo_5_S2t_times_PS2_avx2(const uint32_t *_PS2, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
const __m256i *PS2 = (__m256i *) _PS2;
__m256i *acc = (__m256i *) _acc;
const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
Expand Down
36 changes: 18 additions & 18 deletions src/AVX2/shuffle_arithmetic_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

// P1*0 -> P1: v x v, O: v x o
static
inline void mayo_12_P1_times_O_avx2(const uint32_t *_P1, __m256i *O_multabs, uint32_t *_acc){
inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){

const __m256i *P1 = (__m256i *) _P1;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -47,7 +47,7 @@ inline void mayo_12_P1_times_O_avx2(const uint32_t *_P1, __m256i *O_multabs, uin


static
inline void mayo_12_Ot_times_P1O_P2_avx2(const uint32_t *_P1O_P2, __m256i *O_multabs, uint32_t *_acc){
inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){

const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -81,7 +81,7 @@ inline void mayo_12_Ot_times_P1O_P2_avx2(const uint32_t *_P1O_P2, __m256i *O_mul
}

static
inline void mayo_12_P1P1t_times_O(const uint32_t *_P1, const unsigned char *O, uint32_t *_acc){
inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){

const __m256i *P1 = (__m256i *) _P1;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -140,7 +140,7 @@ inline void mayo_12_P1P1t_times_O(const uint32_t *_P1, const unsigned char *O, u


static
inline void mayo_12_Vt_times_L_avx2(const uint32_t *_L, const __m256i *V_multabs, uint32_t *_acc){
inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){

const __m256i *L = (__m256i *) _L;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -180,7 +180,7 @@ inline void mayo_12_Vt_times_L_avx2(const uint32_t *_L, const __m256i *V_multabs


static
inline void mayo_12_Vt_times_Pv_avx2(const uint32_t *_Pv, const __m256i *V_multabs, uint32_t *_acc){
inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){

const __m256i *Pv = (__m256i *) _Pv;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -219,7 +219,7 @@ inline void mayo_12_Vt_times_Pv_avx2(const uint32_t *_Pv, const __m256i *V_multa
}

static
inline void mayo_12_P1_times_Vt_avx2(const uint32_t *_P1, __m256i *V_multabs, uint32_t *_acc){
inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P1 = (__m256i *) _P1;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -262,17 +262,17 @@ inline void mayo_12_P1_times_Vt_avx2(const uint32_t *_P1, __m256i *V_multabs, ui
// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
// same as mayo_12_P1_times_Vt_avx2
static
inline void mayo_12_P1_times_S1_avx2(const uint32_t *_P1, __m256i *S1_multabs, uint32_t *_acc){
inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
}

static
inline void mayo_12_S1t_times_PS1_avx2(const uint32_t *_PS1, __m256i *S1_multabs, uint32_t *_acc){
inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
}

static
inline void mayo_12_S2t_times_PS2_avx2(const uint32_t *_PS2, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
const __m256i *PS2 = (__m256i *) _PS2;
__m256i *acc = (__m256i *) _acc;
const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
Expand Down Expand Up @@ -312,7 +312,7 @@ inline void mayo_12_S2t_times_PS2_avx2(const uint32_t *_PS2, __m256i *S2_multabs

// P2*S2 -> P2: v x o, S2: o x k
static
inline void mayo_12_P2_times_S2_avx2(const uint32_t *_P2, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P2 = (__m256i *) _P2;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -355,7 +355,7 @@ inline void mayo_12_P2_times_S2_avx2(const uint32_t *_P2, __m256i *S2_multabs, u

// P2*S2 -> P2: v x o, S2: o x k
static
inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint32_t *_P1, const uint32_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P1 = (__m256i *) _P1;
const __m256i *P2 = (__m256i *) _P2;
Expand Down Expand Up @@ -414,7 +414,7 @@ inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint32_t *_P1, const

// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
static
inline void mayo_12_P3_times_S2_avx2(const uint32_t *_P3, __m256i *S2_multabs, uint32_t *_acc){
inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
size_t k,c;
const __m256i *P3 = (__m256i *) _P3;
__m256i *acc = (__m256i *) _acc;
Expand Down Expand Up @@ -456,19 +456,19 @@ inline void mayo_12_P3_times_S2_avx2(const uint32_t *_P3, __m256i *S2_multabs, u


static inline
void mayo12_m_upper(int m_legs, const uint32_t *in, uint32_t *out, int size) {
void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
(void) size;
int m_vecs_stored = 0;

for (int r = 0; r < O_MAX; ++r) {
const __m256i* _in = (const __m256i*) (in + m_legs * 4 * (r * size + r));
__m256i* _out = (__m256i*) (out + m_legs * 4 * m_vecs_stored);
const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
__m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
_out[0] = _in[0];
m_vecs_stored++;
for (int c = r + 1; c < O_MAX; ++c) {
const __m256i* _in2 = (const __m256i*) (in + m_legs * 4 * (r * size + c));
const __m256i* _in3 = (const __m256i*) (in + m_legs * 4 * (c * size + r));
_out = (__m256i*) (out + m_legs * 4 * m_vecs_stored);
const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
_out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
_out[0] = _in2[0] ^ _in3[0];
m_vecs_stored++;
}
Expand Down
Loading

0 comments on commit 442f21e

Please sign in to comment.