From 0d1d79c1c765370d52afe37f68b9457b45e50ff0 Mon Sep 17 00:00:00 2001 From: dhb <1084714805@qq.com> Date: Sat, 8 Jun 2024 21:57:42 +0800 Subject: [PATCH] fix compiling on avx512vnni Passed f15 STC, VLTC: Book: f15-base-8k TC: 10+0.1 Total/Win/Draw/Lose: 4224 / 1872 / 966 / 1386 PTNML: 204 / 298 / 811 / 406 / 393 WinRate: 55.75% ELO: 39.83[29.99, 49.81] LOS: 100.00 LLR: 3.31[-2.94, 2.94] TC: 180+1.8 Total/Win/Draw/Lose: 4314 / 1246 / 2190 / 878 PTNML: 47 / 390 / 994 / 600 / 126 WinRate: 54.27% ELO: 29.46[22.02, 36.98] LOS: 100.00 LLR: 3.06[-2.94, 2.94] Passed f20 STC, VLTC: Book: f20-base-8k TC: 10+0.1 Total/Win/Draw/Lose: 10298 / 5436 / 4 / 4858 PTNML: 1043 / 2 / 2770 / 2 / 1332 WinRate: 52.81% ELO: 19.18[12.28, 26.10] LOS: 100.00 LLR: 3.07[-2.94, 2.94] TC: 180+1.8 Total/Win/Draw/Lose: 8888 / 4709 / 21 / 4158 PTNML: 734 / 9 / 2681 / 12 / 1008 WinRate: 53.10% ELO: 21.23[13.95, 28.53] LOS: 100.00 LLR: 2.96[-2.94, 2.94] Passed s15 STC, VLTC: Book: s15-base-24k TC: 10+0.1 Total/Win/Draw/Lose: 3820 / 1666 / 953 / 1201 PTNML: 167 / 305 / 674 / 424 / 340 WinRate: 56.09% ELO: 42.18[32.10, 52.43] LOS: 100.00 LLR: 3.23[-2.94, 2.94] TC: 180+1.8 Total/Win/Draw/Lose: 3616 / 911 / 2153 / 552 PTNML: 37 / 282 / 881 / 501 / 107 WinRate: 54.96% ELO: 34.39[26.91, 41.98] LOS: 100.00 LLR: 3.15[-2.94, 2.94] Passed r15 STC, VLTC: Book: r15-base-40k TC: 10+0.1 Total/Win/Draw/Lose: 3686 / 1910 / 381 / 1395 PTNML: 231 / 147 / 801 / 204 / 460 WinRate: 56.99% ELO: 48.52[37.36, 59.89] LOS: 100.00 LLR: 3.17[-2.94, 2.94] TC: 180+1.8 Total/Win/Draw/Lose: 2734 / 1151 / 871 / 712 PTNML: 68 / 178 / 557 / 375 / 189 WinRate: 58.03% ELO: 55.96[44.53, 67.70] LOS: 100.00 LLR: 3.23[-2.94, 2.94] --- .github/workflows/rapfi.yml | 26 +++++++++++-------- Rapfi/CMakeLists.txt | 2 +- Rapfi/CMakeSettings.json | 51 +++++++++++++++++++++++++++++++++++++ Rapfi/eval/mix9litennue.cpp | 10 +++----- Rapfi/eval/simdops.h | 18 +++++++++++++ 5 files changed, 88 insertions(+), 19 deletions(-) diff --git a/.github/workflows/rapfi.yml b/.github/workflows/rapfi.yml index 835c553c..28eaec71 100644 --- a/.github/workflows/rapfi.yml +++ b/.github/workflows/rapfi.yml @@ -20,7 +20,7 @@ jobs: target: - { name: linux-gcc, - os: ubuntu-20.04, + os: ubuntu-22.04, c_compiler: gcc, cxx_compiler: g++, cmake_command: "", @@ -29,8 +29,8 @@ jobs: - { name: linux-clang, os: ubuntu-20.04, - c_compiler: clang, - cxx_compiler: clang++, + c_compiler: clang-18, + cxx_compiler: clang++-18, cmake_command: "", shell: "bash {0}" } @@ -55,9 +55,11 @@ jobs: shell: "msys2 {0}" } arch: - - { name: avx2, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=ON -DUSE_AVX=ON -DUSE_SSE=ON" } - - { name: avx, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=OFF -DUSE_AVX=ON -DUSE_SSE=ON" } - - { name: sse, cmake_command: "-DUSE_BMI2=OFF -DUSE_AVX2=OFF -DUSE_AVX=OFF -DUSE_SSE=ON" } + - { name: sse, cmake_command: "-DUSE_AVX2=OFF" } + - { name: avx2, cmake_command: "-DUSE_AVX2=ON" } + - { name: avxvnni, cmake_command: "-DUSE_AVX2=ON -DUSE_VNNI=ON" } + - { name: avx512, cmake_command: "-DUSE_AVX2=ON -DUSE_AVX512=ON" } + - { name: avx512vnni, cmake_command: "-DUSE_AVX2=ON -DUSE_AVX512=ON -DUSE_VNNI=ON" } defaults: run: @@ -71,18 +73,20 @@ jobs: fetch-depth: 0 submodules: true - - name: Download required linux packages - if: runner.os == 'Linux' + - name: Download latest clang compiler on linux + if: matrix.target.name == 'linux-clang' run: | - sudo apt update - sudo apt install -y libtbb-dev + wget https://apt.llvm.org/llvm.sh + chmod u+x llvm.sh + echo | sudo ./llvm.sh 18 + rm llvm.sh - name: Setup msys and install required packages if: runner.os == 'Windows' uses: msys2/setup-msys2@v2 with: msystem: ${{matrix.target.msys_sys}} - install: mingw-w64-${{matrix.target.msys_env}}-${{matrix.target.c_compiler}} mingw-w64-${{matrix.target.msys_env}}-tbb mingw-w64-${{matrix.target.msys_env}}-cmake make git + install: mingw-w64-${{matrix.target.msys_env}}-${{matrix.target.c_compiler}} mingw-w64-${{matrix.target.msys_env}}-cmake make git - name: Extract the bench hash from the commit history run: | diff --git a/Rapfi/CMakeLists.txt b/Rapfi/CMakeLists.txt index 2daa9e64..a2fef98c 100644 --- a/Rapfi/CMakeLists.txt +++ b/Rapfi/CMakeLists.txt @@ -332,7 +332,7 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "C if(USE_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512dq -mavx512bw") if(USE_VNNI) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vnni") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512vnni -mavx512vl") endif() elseif(USE_AVX2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mfma") diff --git a/Rapfi/CMakeSettings.json b/Rapfi/CMakeSettings.json index 9a5fe428..fd4f0ab3 100644 --- a/Rapfi/CMakeSettings.json +++ b/Rapfi/CMakeSettings.json @@ -121,6 +121,34 @@ } ] }, + { + "name": "x64-Release-AVX512VNNI", + "generator": "Ninja", + "configurationType": "Release", + "buildRoot": "${projectDir}\\build\\${name}", + "installRoot": "${projectDir}\\build\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "msvc_x64_x64" ], + "variables": [ + { + "name": "ENABLE_LTO", + "value": "True", + "type": "BOOL" + }, + { + "name": "USE_AVX512", + "value": "True", + "type": "BOOL" + }, + { + "name": "USE_VNNI", + "value": "True", + "type": "BOOL" + } + ] + }, { "name": "x64-Release-ST", "generator": "Ninja", @@ -219,6 +247,29 @@ } ] }, + { + "name": "x64-Clang-Release-AVX512VNNI", + "generator": "Ninja", + "configurationType": "Release", + "buildRoot": "${projectDir}\\build\\${name}", + "installRoot": "${projectDir}\\build\\install\\${name}", + "cmakeCommandArgs": "", + "buildCommandArgs": "", + "ctestCommandArgs": "", + "inheritEnvironments": [ "clang_cl_x64_x64" ], + "variables": [ + { + "name": "USE_AVX512", + "value": "True", + "type": "BOOL" + }, + { + "name": "USE_VNNI", + "value": "True", + "type": "BOOL" + } + ] + }, { "name": "x64-Clang-Release-ST", "generator": "Ninja", diff --git a/Rapfi/eval/mix9litennue.cpp b/Rapfi/eval/mix9litennue.cpp index 56fdf1fc..58235551 100644 --- a/Rapfi/eval/mix9litennue.cpp +++ b/Rapfi/eval/mix9litennue.cpp @@ -54,17 +54,13 @@ constexpr int MaxOuterChanges[23] = {5, 11, 33, 107, 293, 675, 1 static Evaluation::WeightRegistry Mix9LiteWeightRegistry; constexpr int Alignment = 16; -constexpr simd::InstructionType IT512 = getInstTypeOfWidth(simd::NativeInstType, 512); constexpr simd::InstructionType IT256 = getInstTypeOfWidth(simd::NativeInstType, 256); constexpr simd::InstructionType IT128 = getInstTypeOfWidth(simd::NativeInstType, 128); template -using Batch = std::conditional_t< - simd::detail::VecBatch::NumExtra == 0, - simd::detail::VecBatch, - std::conditional_t::NumExtra == 0, - simd::detail::VecBatch, - simd::detail::VecBatch>>; +using Batch = std::conditional_t::NumExtra == 0, + simd::detail::VecBatch, + simd::detail::VecBatch>; template using Convert = simd::detail::VecCvt; diff --git a/Rapfi/eval/simdops.h b/Rapfi/eval/simdops.h index c01fde50..b931db5c 100644 --- a/Rapfi/eval/simdops.h +++ b/Rapfi/eval/simdops.h @@ -823,7 +823,11 @@ namespace detail { static FORCE_INLINE void dot4_u7i8_accum(R &acc, R a, R b) { #if defined(USE_VNNI) + #if !defined(USE_AVX512) acc = _mm_dpbusd_avx_epi32(acc, a, b); + #else + acc = _mm_dpbusd_epi32(acc, a, b); + #endif #else R product0 = simde_mm_maddubs_epi16(a, b); product0 = simde_mm_madd_epi16(product0, simde_mm_set1_epi16(1)); @@ -840,8 +844,13 @@ namespace detail { R low7 = simde_mm_andnot_si128(highest_bit, a); #if defined(USE_VNNI) + #if !defined(USE_AVX512) msb = _mm_dpbusd_avx_epi32(_mm_setzero_si128(), msb, b); // 0 or 128 low7 = _mm_dpbusd_avx_epi32(_mm_setzero_si128(), low7, b); + #else + msb = _mm_dpbusd_epi32(_mm_setzero_si128(), msb, b); // 0 or 128 + low7 = _mm_dpbusd_epi32(_mm_setzero_si128(), low7, b); + #endif #else // Multiply a * b in two parts and accumulate neighbouring outputs into int16 values msb = simde_mm_maddubs_epi16(msb, b); // 0 or 128 @@ -877,7 +886,11 @@ namespace detail { static FORCE_INLINE void dot4_u7i8_accum(R &acc, R a, R b) { #if defined(USE_VNNI) + #if !defined(USE_AVX512) acc = _mm256_dpbusd_avx_epi32(acc, a, b); + #else + acc = _mm256_dpbusd_epi32(acc, a, b); + #endif #else R product0 = simde_mm256_maddubs_epi16(a, b); product0 = simde_mm256_madd_epi16(product0, simde_mm256_set1_epi16(1)); @@ -894,8 +907,13 @@ namespace detail { R low7 = simde_mm256_andnot_si256(highest_bit, a); #if defined(USE_VNNI) + #if !defined(USE_AVX512) msb = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), msb, b); // 0 or 128 low7 = _mm256_dpbusd_avx_epi32(_mm256_setzero_si256(), low7, b); + #else + msb = _mm256_dpbusd_epi32(_mm256_setzero_si256(), msb, b); // 0 or 128 + low7 = _mm256_dpbusd_epi32(_mm256_setzero_si256(), low7, b); + #endif #else // Multiply a * b in two parts and accumulate neighbouring outputs into int16 values msb = simde_mm256_maddubs_epi16(msb, b); // 0 or 128