Skip to content

Commit

Permalink
crc64_with_avx512
Browse files Browse the repository at this point in the history
  • Loading branch information
lihuiba committed Jan 31, 2025
1 parent 1ed0b2e commit 4687f15
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 45 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
endif()

if (${ARCH} STREQUAL x86_64)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul -march=native")
elseif (${ARCH} STREQUAL aarch64)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=native -fsigned-char -fno-stack-protector -fomit-frame-pointer")
endif ()
Expand Down
126 changes: 109 additions & 17 deletions common/checksum/crc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,29 +477,117 @@ inline void* get_shf_table(size_t i) {
return (char*)pshufb_shf_table + i;
}


inline __attribute__((always_inline))
__m128i crc64ecma_hw_big_sse(const uint8_t*& data, size_t& nbytes, uint64_t crc) {
using SIMD = SSE;
using v128 = typename SIMD::v128;
v128 xmm[8];
auto& ptr = (const v128*&)data;
static_loop<0, 7, 1>(BODY(i){ xmm[i] = SIMD::loadu(ptr+i); });
xmm[0] ^= v128{(long)~crc, 0}; ptr += 8; nbytes -= 128;
do {
static_loop<0, 7, 1>(BODY(i) {
xmm[i] = SIMD::op(xmm[i], RK(3)) ^ SIMD::loadu(ptr+i);
});
ptr += 8; nbytes -= 128;
} while (nbytes >= 128);
static_loop<0, 6, 1>(BODY(i) {
auto I = (i == 6) ? 1 : (9 + i * 2);
xmm[7] ^= SIMD::op(xmm[i], RK(I));
});
return xmm[7];
}

#ifdef __AVX512F__
static const uint64_t rk512[] = {
0xf31fd9271e228b79, // rk_1
0x8260adf2381ad81c, // rk_2
0xdabe95afc7875f40, // rk1
0xe05dd497ca393ae4, // rk2
0xd7d86b2af73de740, // rk3
0x8757d71d4fcc1000, // rk4
0xdabe95afc7875f40,
0x0000000000000000,
0x9c3e466c172963d5,
0x92d8af2baf0e1e84,
0x947874de595052cb,
0x9e735cb59b4724da,
0xe4ce2cd55fea0037,
0x2fe3fd2920ce82ec,
0x0e31d519421a63a5,
0x2e30203212cac325,
0x081f6054a7842df4,
0x6ae3efbb9dd441f3,
0x69a35d91c3730254,
0xb5ea1af9c013aca4,
0x3be653a30fe1af51,
0x60095b008a9efa44, // rk20
0xdabe95afc7875f40, // rk_1b
0xe05dd497ca393ae4, // rk_2b
0x0000000000000000,
0x0000000000000000,
};
#define _RK(i) &rk512[(i)+1]

inline __attribute__((always_inline))
uint64_t crc64ecma_hw_portable(const uint8_t *data, size_t nbytes, uint64_t crc) {
__m128i crc64ecma_hw_big_avx512(const uint8_t*& data, size_t& nbytes, uint64_t crc) {
assert(nbytes >= 256);
using v512 = __m512i_u;
__attribute__((aligned(16)))
v512 crc0 = {(long)~crc};
auto& ptr = (const v512*&)data;
auto zmm0 = _mm512_loadu_si512(ptr++); zmm0 ^= crc0;
auto zmm4 = _mm512_loadu_si512(ptr++);
auto OP = [](v512 a, v512 b, v512 c) -> v512 {
auto x = _mm512_clmulepi64_epi128((a), (b), 0x01);
auto y = _mm512_clmulepi64_epi128((a), (b), 0x10);
return _mm512_ternarylogic_epi64(x, y, (c), 0x96);
};
#define OP_PTR(a, b, ptr) OP(a, b, _mm512_loadu_si512(ptr))
nbytes -= 128;
if (nbytes < 384) {
auto rk3 = _mm512_broadcast_i32x4(*(__m128i*)_RK(3));
do { // fold 128 bytes each iteration
zmm0 = OP_PTR(zmm0, rk3, ptr++);
zmm4 = OP_PTR(zmm4, rk3, ptr++);
nbytes -= 128;
} while (nbytes >= 128);
} else { // nbytes >= 384
auto rk_1_2 = _mm512_broadcast_i32x4(*(__m128i*)&rk512[0]);
auto zmm7 = _mm512_loadu_si512(ptr++);
auto zmm8 = _mm512_loadu_si512(ptr++);
nbytes -= 128;
do { // fold 256 bytes each iteration
zmm0 = OP_PTR(zmm0, rk_1_2, ptr++);
zmm4 = OP_PTR(zmm4, rk_1_2, ptr++);
zmm7 = OP_PTR(zmm7, rk_1_2, ptr++);
zmm8 = OP_PTR(zmm8, rk_1_2, ptr++);
nbytes -= 256;
} while (nbytes >= 256);
auto rk3 = _mm512_broadcast_i32x4(*(__m128i*)_RK(3));
zmm0 = OP(zmm0, rk3, zmm7);
zmm4 = OP(zmm4, rk3, zmm8);
}
auto t = _mm512_extracti64x2_epi64(zmm4, 0x03);
auto zmm7 = v512{t[0], t[1]};
auto zmm1 = OP(zmm0, *(v512*)_RK(9), zmm7);
zmm1 = OP(zmm4, *(v512*)_RK(17), zmm1);
auto zmm8 = _mm512_shuffle_i64x2(zmm1, zmm1, 0x4e);
auto ymm8 = ((__m256i&)zmm8) ^ ((__m256i&)zmm1);
return _mm256_extracti64x2_epi64(ymm8, 0) ^
_mm256_extracti64x2_epi64(ymm8, 1) ;
}
#endif
template<typename F> inline __attribute__((always_inline))
uint64_t crc64ecma_hw_portable(const uint8_t *data, size_t nbytes, uint64_t crc, F hw_big) {
if (unlikely(!nbytes || !data)) return crc;
using SIMD = SSE;
using v128 = typename SIMD::v128;
v128 xmm7 = {(long)~crc};
auto& ptr = (const v128*&)data;
if (nbytes >= 256) {
v128 xmm[8];
assert(nbytes >= 256);
static_loop<0, 7, 1>(BODY(i){ xmm[i] = SIMD::loadu(ptr+i); });
xmm[0] ^= xmm7; ptr += 8; nbytes -= 128;
do {
static_loop<0, 7, 1>(BODY(i) {
xmm[i] = SIMD::op(xmm[i], RK(3)) ^ SIMD::loadu(ptr+i);
});
ptr += 8; nbytes -= 128;
} while (nbytes >= 128);
static_loop<0, 6, 1>(BODY(i) {
auto I = (i == 6) ? 1 : (9 + i * 2);
xmm[7] ^= SIMD::op(xmm[i], RK(I));
});
xmm7 = xmm[7];
xmm7 = hw_big(data, nbytes, crc);
} else if (nbytes >= 16) {
xmm7 ^= SIMD::loadu(ptr++);
nbytes -= 16;
Expand Down Expand Up @@ -543,7 +631,11 @@ uint64_t crc64ecma_hw_portable(const uint8_t *data, size_t nbytes, uint64_t crc)
}

uint64_t crc64ecma_hw(const uint8_t *buf, size_t len, uint64_t crc) {
return crc64ecma_hw_portable(buf, len, crc);
#ifdef __AVX512F__
return crc64ecma_hw_portable(buf, len, crc, crc64ecma_hw_big_avx512);
#else
return crc64ecma_hw_portable(buf, len, crc, crc64ecma_hw_big_sse);
#endif
}

template<typename T>
Expand Down
66 changes: 39 additions & 27 deletions common/checksum/test/test_checksum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,9 @@ void do_test_crc(const char* name, CRC32C calcurlator) {
}

void do_test_crc_small(CRC64ECMA calc_sw, CRC64ECMA calc_hw, uint16_t begin, uint16_t end) {
alignas(16) unsigned char buf[64 * 1024 + 16];
unsigned char buf[64 * 1024 + 16];
for (uint16_t i = 0; i < begin; ++i) buf[i] = 'a' + i % 26;
for (uint16_t i = begin; i < end; ++i) {
buf[i] = 0;
auto crc_sw = calc_sw(buf, i, 0);
auto crc_hw = calc_hw(buf, i, 0);
if (crc_sw != crc_hw) printf("i=%d\n", i);
Expand Down Expand Up @@ -121,39 +121,49 @@ TEST(TestChecksum, crc32c_hw_small) {
do_test_crc_small(crc32c_sw, crc32c_hw_portable, 0, 4000);
}

void do_test_crc_big(const char* name, CRC32C crc32c) {
static unsigned char buf[512 * 1024 * 1024];
memset(buf, 0, sizeof(buf));
void do_perf_crc(const char* name, CRC32C crc32c, unsigned long size) {
const unsigned long SIZE = 1 * 1024 * 1024 * 1024;
__attribute__((aligned(16)))
static unsigned char buf[SIZE+1];
if (size > SIZE) size = SIZE;
memset(buf+1, 0, size);
auto start = std::chrono::system_clock::now();
const uint64_t MAX = 100;
for (auto i = MAX; i; --i) {
crc32c(buf, sizeof(buf), 0);
unsigned long rounds = SIZE / size * 10;
for (auto i = rounds; i; --i) {
crc32c(buf+1, size, 0); // test for memory un-alignment
}
int time_cost = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now() - start).count();
printf("%s time spent: %d us (%0.2f GB/s)\n", name, time_cost,
sizeof(buf) * MAX / 1024 / 1024 / 1024 / (double(time_cost) / 1000 / 1000));
unsigned long time_cost = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now() - start).count();
auto perf = size * rounds / (double(time_cost) / 1000 / 1000) / 1024 / 1024 / 1024;
printf("%s (%lu bytes * %lu rounds = %lu GB), time spent: %lu us (%0.2f GB/s)\n",
name, size, rounds, rounds * size / SIZE, time_cost, perf);
}

inline void do_test_crc_big(const char* name, CRC64ECMA crc64ecma) {
return do_test_crc_big(name, (CRC32C&)crc64ecma);
inline void do_perf_crc(const char* name, CRC64ECMA crc64ecma, unsigned long size) {
return do_perf_crc(name, (CRC32C&)crc64ecma, size);
}

TEST(TestChecksumBig, crc32c_hw_simple) {
do_test_crc_big("crc32c_hw_simple", crc32c_hw_simple);
const size_t _128KB = 128 * 1024;
const size_t _1GB = 1024 * 1024 * 1024;

TEST(Perf, crc32c_hw_simple) {
do_perf_crc("crc32c_hw_simple", crc32c_hw_simple, _128KB);
do_perf_crc("crc32c_hw_simple", crc32c_hw_simple, _1GB);
}

TEST(TestChecksumBig, crc32c_hw_portable) {
do_test_crc_big("crc32c_hw_portable", crc32c_hw_portable);
TEST(Perf, crc32c_hw_portable) {
do_perf_crc("crc32c_hw_portable", crc32c_hw_portable, _128KB);
do_perf_crc("crc32c_hw_portable", crc32c_hw_portable, _1GB);
}

TEST(TestChecksumBig, crc32c_hw_asm) {
do_test_crc_big(crc32c_hw_asm_name, crc32c_hw_asm);
TEST(Perf, crc32c_hw_asm) {
do_perf_crc(crc32c_hw_asm_name, crc32c_hw_asm, _128KB);
do_perf_crc(crc32c_hw_asm_name, crc32c_hw_asm, _1GB);
}
/*
TEST(TestChecksumBig, crc32c_sw) {
do_test_crc_big("crc32c_sw", crc32c_sw);

TEST(Perf, crc32c_sw) {
do_perf_crc("crc32c_sw", crc32c_sw, _128KB);
}
*/

void do_test64(const char* name, CRC64ECMA crc64ecma) {
auto start = std::chrono::system_clock::now();
for (int i = 0; i < 100; ++i)
Expand Down Expand Up @@ -188,12 +198,14 @@ TEST(TestChecksum, crc64ecma_hw) {
do_test64("crc64ecma_hw_portable", crc64ecma_hw);
}

TEST(TestChecksumBig, crc64ecma_hw) {
do_test_crc_big("crc64ecma_hw_portable", crc64ecma_hw);
TEST(Perf, crc64ecma_hw) {
do_perf_crc("crc64ecma_hw_portable", crc64ecma_hw, _128KB);
do_perf_crc("crc64ecma_hw_portable", crc64ecma_hw, _1GB);
}

TEST(TestChecksumBig, crc64ecma_hw_asm) {
do_test_crc_big(crc64ecma_hw_asm_name, crc64ecma_hw_asm);
TEST(Perf, crc64ecma_hw_asm) {
do_perf_crc(crc64ecma_hw_asm_name, crc64ecma_hw_asm, _128KB);
do_perf_crc(crc64ecma_hw_asm_name, crc64ecma_hw_asm, _1GB);
}

TEST(TestChecksum, crc64ecma_small) {
Expand Down

0 comments on commit 4687f15

Please sign in to comment.