diff --git a/CMakeLists.txt b/CMakeLists.txt index 59435225..b28c3101 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,7 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() if (${ARCH} STREQUAL x86_64) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul -march=native") elseif (${ARCH} STREQUAL aarch64) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=native -fsigned-char -fno-stack-protector -fomit-frame-pointer") endif () diff --git a/common/checksum/crc.cpp b/common/checksum/crc.cpp index 9f8a237e..d7c9c6a9 100644 --- a/common/checksum/crc.cpp +++ b/common/checksum/crc.cpp @@ -477,29 +477,117 @@ inline void* get_shf_table(size_t i) { return (char*)pshufb_shf_table + i; } + +inline __attribute__((always_inline)) +__m128i crc64ecma_hw_big_sse(const uint8_t*& data, size_t& nbytes, uint64_t crc) { + using SIMD = SSE; + using v128 = typename SIMD::v128; + v128 xmm[8]; + auto& ptr = (const v128*&)data; + static_loop<0, 7, 1>(BODY(i){ xmm[i] = SIMD::loadu(ptr+i); }); + xmm[0] ^= v128{(long)~crc, 0}; ptr += 8; nbytes -= 128; + do { + static_loop<0, 7, 1>(BODY(i) { + xmm[i] = SIMD::op(xmm[i], RK(3)) ^ SIMD::loadu(ptr+i); + }); + ptr += 8; nbytes -= 128; + } while (nbytes >= 128); + static_loop<0, 6, 1>(BODY(i) { + auto I = (i == 6) ? 1 : (9 + i * 2); + xmm[7] ^= SIMD::op(xmm[i], RK(I)); + }); + return xmm[7]; +} + +#ifdef __AVX512F__ +static const uint64_t rk512[] = { + 0xf31fd9271e228b79, // rk_1 + 0x8260adf2381ad81c, // rk_2 + 0xdabe95afc7875f40, // rk1 + 0xe05dd497ca393ae4, // rk2 + 0xd7d86b2af73de740, // rk3 + 0x8757d71d4fcc1000, // rk4 + 0xdabe95afc7875f40, + 0x0000000000000000, + 0x9c3e466c172963d5, + 0x92d8af2baf0e1e84, + 0x947874de595052cb, + 0x9e735cb59b4724da, + 0xe4ce2cd55fea0037, + 0x2fe3fd2920ce82ec, + 0x0e31d519421a63a5, + 0x2e30203212cac325, + 0x081f6054a7842df4, + 0x6ae3efbb9dd441f3, + 0x69a35d91c3730254, + 0xb5ea1af9c013aca4, + 0x3be653a30fe1af51, + 0x60095b008a9efa44, // rk20 + 0xdabe95afc7875f40, // rk_1b + 0xe05dd497ca393ae4, // rk_2b + 0x0000000000000000, + 0x0000000000000000, +}; +#define _RK(i) &rk512[(i)+1] + inline __attribute__((always_inline)) -uint64_t crc64ecma_hw_portable(const uint8_t *data, size_t nbytes, uint64_t crc) { +__m128i crc64ecma_hw_big_avx512(const uint8_t*& data, size_t& nbytes, uint64_t crc) { + assert(nbytes >= 256); + using v512 = __m512i_u; + __attribute__((aligned(16))) + v512 crc0 = {(long)~crc}; + auto& ptr = (const v512*&)data; + auto zmm0 = _mm512_loadu_si512(ptr++); zmm0 ^= crc0; + auto zmm4 = _mm512_loadu_si512(ptr++); + auto OP = [](v512 a, v512 b, v512 c) -> v512 { + auto x = _mm512_clmulepi64_epi128((a), (b), 0x01); + auto y = _mm512_clmulepi64_epi128((a), (b), 0x10); + return _mm512_ternarylogic_epi64(x, y, (c), 0x96); + }; + #define OP_PTR(a, b, ptr) OP(a, b, _mm512_loadu_si512(ptr)) + nbytes -= 128; + if (nbytes < 384) { + auto rk3 = _mm512_broadcast_i32x4(*(__m128i*)_RK(3)); + do { // fold 128 bytes each iteration + zmm0 = OP_PTR(zmm0, rk3, ptr++); + zmm4 = OP_PTR(zmm4, rk3, ptr++); + nbytes -= 128; + } while (nbytes >= 128); + } else { // nbytes >= 384 + auto rk_1_2 = _mm512_broadcast_i32x4(*(__m128i*)&rk512[0]); + auto zmm7 = _mm512_loadu_si512(ptr++); + auto zmm8 = _mm512_loadu_si512(ptr++); + nbytes -= 128; + do { // fold 256 bytes each iteration + zmm0 = OP_PTR(zmm0, rk_1_2, ptr++); + zmm4 = OP_PTR(zmm4, rk_1_2, ptr++); + zmm7 = OP_PTR(zmm7, rk_1_2, ptr++); + zmm8 = OP_PTR(zmm8, rk_1_2, ptr++); + nbytes -= 256; + } while (nbytes >= 256); + auto rk3 = _mm512_broadcast_i32x4(*(__m128i*)_RK(3)); + zmm0 = OP(zmm0, rk3, zmm7); + zmm4 = OP(zmm4, rk3, zmm8); + } + auto t = _mm512_extracti64x2_epi64(zmm4, 0x03); + auto zmm7 = v512{t[0], t[1]}; + auto zmm1 = OP(zmm0, *(v512*)_RK(9), zmm7); + zmm1 = OP(zmm4, *(v512*)_RK(17), zmm1); + auto zmm8 = _mm512_shuffle_i64x2(zmm1, zmm1, 0x4e); + auto ymm8 = ((__m256i&)zmm8) ^ ((__m256i&)zmm1); + return _mm256_extracti64x2_epi64(ymm8, 0) ^ + _mm256_extracti64x2_epi64(ymm8, 1) ; +} +#endif +template inline __attribute__((always_inline)) +uint64_t crc64ecma_hw_portable(const uint8_t *data, size_t nbytes, uint64_t crc, F hw_big) { if (unlikely(!nbytes || !data)) return crc; using SIMD = SSE; using v128 = typename SIMD::v128; v128 xmm7 = {(long)~crc}; auto& ptr = (const v128*&)data; if (nbytes >= 256) { - v128 xmm[8]; - assert(nbytes >= 256); - static_loop<0, 7, 1>(BODY(i){ xmm[i] = SIMD::loadu(ptr+i); }); - xmm[0] ^= xmm7; ptr += 8; nbytes -= 128; - do { - static_loop<0, 7, 1>(BODY(i) { - xmm[i] = SIMD::op(xmm[i], RK(3)) ^ SIMD::loadu(ptr+i); - }); - ptr += 8; nbytes -= 128; - } while (nbytes >= 128); - static_loop<0, 6, 1>(BODY(i) { - auto I = (i == 6) ? 1 : (9 + i * 2); - xmm[7] ^= SIMD::op(xmm[i], RK(I)); - }); - xmm7 = xmm[7]; + xmm7 = hw_big(data, nbytes, crc); } else if (nbytes >= 16) { xmm7 ^= SIMD::loadu(ptr++); nbytes -= 16; @@ -543,7 +631,11 @@ uint64_t crc64ecma_hw_portable(const uint8_t *data, size_t nbytes, uint64_t crc) } uint64_t crc64ecma_hw(const uint8_t *buf, size_t len, uint64_t crc) { - return crc64ecma_hw_portable(buf, len, crc); +#ifdef __AVX512F__ + return crc64ecma_hw_portable(buf, len, crc, crc64ecma_hw_big_avx512); +#else + return crc64ecma_hw_portable(buf, len, crc, crc64ecma_hw_big_sse); +#endif } template diff --git a/common/checksum/test/test_checksum.cpp b/common/checksum/test/test_checksum.cpp index 9f347381..8d740f9c 100644 --- a/common/checksum/test/test_checksum.cpp +++ b/common/checksum/test/test_checksum.cpp @@ -67,9 +67,9 @@ void do_test_crc(const char* name, CRC32C calcurlator) { } void do_test_crc_small(CRC64ECMA calc_sw, CRC64ECMA calc_hw, uint16_t begin, uint16_t end) { - alignas(16) unsigned char buf[64 * 1024 + 16]; + unsigned char buf[64 * 1024 + 16]; + for (uint16_t i = 0; i < begin; ++i) buf[i] = 'a' + i % 26; for (uint16_t i = begin; i < end; ++i) { - buf[i] = 0; auto crc_sw = calc_sw(buf, i, 0); auto crc_hw = calc_hw(buf, i, 0); if (crc_sw != crc_hw) printf("i=%d\n", i); @@ -121,39 +121,49 @@ TEST(TestChecksum, crc32c_hw_small) { do_test_crc_small(crc32c_sw, crc32c_hw_portable, 0, 4000); } -void do_test_crc_big(const char* name, CRC32C crc32c) { - static unsigned char buf[512 * 1024 * 1024]; - memset(buf, 0, sizeof(buf)); +void do_perf_crc(const char* name, CRC32C crc32c, unsigned long size) { + const unsigned long SIZE = 1 * 1024 * 1024 * 1024; + __attribute__((aligned(16))) + static unsigned char buf[SIZE+1]; + if (size > SIZE) size = SIZE; + memset(buf+1, 0, size); auto start = std::chrono::system_clock::now(); - const uint64_t MAX = 100; - for (auto i = MAX; i; --i) { - crc32c(buf, sizeof(buf), 0); + unsigned long rounds = SIZE / size * 10; + for (auto i = rounds; i; --i) { + crc32c(buf+1, size, 0); // test for memory un-alignment } - int time_cost = std::chrono::duration_cast(std::chrono::system_clock::now() - start).count(); - printf("%s time spent: %d us (%0.2f GB/s)\n", name, time_cost, - sizeof(buf) * MAX / 1024 / 1024 / 1024 / (double(time_cost) / 1000 / 1000)); + unsigned long time_cost = std::chrono::duration_cast(std::chrono::system_clock::now() - start).count(); + auto perf = size * rounds / (double(time_cost) / 1000 / 1000) / 1024 / 1024 / 1024; + printf("%s (%lu bytes * %lu rounds = %lu GB), time spent: %lu us (%0.2f GB/s)\n", + name, size, rounds, rounds * size / SIZE, time_cost, perf); } -inline void do_test_crc_big(const char* name, CRC64ECMA crc64ecma) { - return do_test_crc_big(name, (CRC32C&)crc64ecma); +inline void do_perf_crc(const char* name, CRC64ECMA crc64ecma, unsigned long size) { + return do_perf_crc(name, (CRC32C&)crc64ecma, size); } -TEST(TestChecksumBig, crc32c_hw_simple) { - do_test_crc_big("crc32c_hw_simple", crc32c_hw_simple); +const size_t _128KB = 128 * 1024; +const size_t _1GB = 1024 * 1024 * 1024; + +TEST(Perf, crc32c_hw_simple) { + do_perf_crc("crc32c_hw_simple", crc32c_hw_simple, _128KB); + do_perf_crc("crc32c_hw_simple", crc32c_hw_simple, _1GB); } -TEST(TestChecksumBig, crc32c_hw_portable) { - do_test_crc_big("crc32c_hw_portable", crc32c_hw_portable); +TEST(Perf, crc32c_hw_portable) { + do_perf_crc("crc32c_hw_portable", crc32c_hw_portable, _128KB); + do_perf_crc("crc32c_hw_portable", crc32c_hw_portable, _1GB); } -TEST(TestChecksumBig, crc32c_hw_asm) { - do_test_crc_big(crc32c_hw_asm_name, crc32c_hw_asm); +TEST(Perf, crc32c_hw_asm) { + do_perf_crc(crc32c_hw_asm_name, crc32c_hw_asm, _128KB); + do_perf_crc(crc32c_hw_asm_name, crc32c_hw_asm, _1GB); } -/* -TEST(TestChecksumBig, crc32c_sw) { - do_test_crc_big("crc32c_sw", crc32c_sw); + +TEST(Perf, crc32c_sw) { + do_perf_crc("crc32c_sw", crc32c_sw, _128KB); } -*/ + void do_test64(const char* name, CRC64ECMA crc64ecma) { auto start = std::chrono::system_clock::now(); for (int i = 0; i < 100; ++i) @@ -188,12 +198,14 @@ TEST(TestChecksum, crc64ecma_hw) { do_test64("crc64ecma_hw_portable", crc64ecma_hw); } -TEST(TestChecksumBig, crc64ecma_hw) { - do_test_crc_big("crc64ecma_hw_portable", crc64ecma_hw); +TEST(Perf, crc64ecma_hw) { + do_perf_crc("crc64ecma_hw_portable", crc64ecma_hw, _128KB); + do_perf_crc("crc64ecma_hw_portable", crc64ecma_hw, _1GB); } -TEST(TestChecksumBig, crc64ecma_hw_asm) { - do_test_crc_big(crc64ecma_hw_asm_name, crc64ecma_hw_asm); +TEST(Perf, crc64ecma_hw_asm) { + do_perf_crc(crc64ecma_hw_asm_name, crc64ecma_hw_asm, _128KB); + do_perf_crc(crc64ecma_hw_asm_name, crc64ecma_hw_asm, _1GB); } TEST(TestChecksum, crc64ecma_small) {