From 7f7835af7faef46811371e24d31a7780ebab4904 Mon Sep 17 00:00:00 2001 From: Brian Smith Date: Fri, 1 Dec 2023 12:33:55 -0800 Subject: [PATCH] chacha: Move x86-64 CPU & length dispatching from assembly to C. The x86-64 and x86 code both have 4x and 1x SSSE3 code paths. Only the x86-64 version is modified here. (For 32-bit x86, it looks like it would be much harder to separate the 1x and 4x code paths into separate functions and there's little motivation to do so. So it is likely that 32-bit x86 would define |CHACHA20_ASM_SSSE3| without |CHACHA20_ASM_SSSE3_4X|.) Bug: 673 Change-Id: I48a3fb8a2c6eb05a33c5ef20efff7ed000e51d71 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/64467 Reviewed-by: Bob Beck Reviewed-by: David Benjamin Commit-Queue: David Benjamin --- crypto/chacha/asm/chacha-x86_64.pl | 98 ++++++++++------------------ crypto/chacha/chacha.c | 18 +++++ crypto/chacha/chacha_test.cc | 15 +++++ crypto/chacha/internal.h | 28 +++++++- crypto/fipsmodule/cpucap/cpu_intel.c | 6 +- crypto/fipsmodule/cpucap/internal.h | 29 +++++++- 6 files changed, 126 insertions(+), 68 deletions(-) diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index d212d5fdc51..4da5ec9cab6 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -80,8 +80,6 @@ $code.=<<___; .text -.extern OPENSSL_ia32cap_P - .section .rodata .align 64 .Lzero: @@ -230,24 +228,12 @@ sub ROUND { # critical path is 24 cycles per round ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. 
$code.=<<___; -.globl ChaCha20_ctr32 -.type ChaCha20_ctr32,\@function,5 +.globl ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,\@function,5 .align 64 -ChaCha20_ctr32: +ChaCha20_ctr32_nohw: .cfi_startproc _CET_ENDBR - cmp \$0,$len - je .Lno_data - mov OPENSSL_ia32cap_P+4(%rip),%r10 -___ -$code.=<<___ if ($avx>2); - bt \$48,%r10 # check for AVX512F - jc .LChaCha20_avx512 -___ -$code.=<<___; - test \$`1<<(41-32)`,%r10d - jnz .LChaCha20_ssse3 - push %rbx .cfi_push rbx push %rbp @@ -419,7 +405,7 @@ sub ROUND { # critical path is 24 cycles per round .Lno_data: ret .cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw ___ ######################################################################## @@ -454,19 +440,16 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round my $xframe = $win64 ? 32+8 : 8; $code.=<<___; -.type ChaCha20_ssse3,\@function,5 +.globl ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,\@function,5 .align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: +ChaCha20_ctr32_ssse3: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 ___ $code.=<<___; - cmp \$128,$len # we might throw away some data, - ja .LChaCha20_4x # but overall it won't be slower - -.Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); @@ -576,7 +559,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round .Lssse3_epilogue: ret .cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 ___ } @@ -714,29 +697,17 @@ sub SSSE3_lane_ROUND { my $xframe = $win64 ? 
0xa8 : 8; $code.=<<___; -.type ChaCha20_4x,\@function,5 +.globl ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,\@function,5 .align 32 -ChaCha20_4x: -.LChaCha20_4x: +ChaCha20_ctr32_ssse3_4x: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame pointer .cfi_def_cfa_register r9 mov %r10,%r11 ___ -$code.=<<___ if ($avx>1); - shr \$32,%r10 # OPENSSL_ia32cap_P+8 - test \$`1<<5`,%r10 # test AVX2 - jnz .LChaCha20_8x -___ $code.=<<___; - cmp \$192,$len - ja .Lproceed4x - - and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE - cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE - je .Ldo_sse3_after_all # to detect Atom - -.Lproceed4x: sub \$0x140+$xframe,%rsp ___ ################ stack layout @@ -1164,7 +1135,7 @@ sub SSSE3_lane_ROUND { .L4x_epilogue: ret .cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x ___ } @@ -1293,11 +1264,12 @@ sub AVX2_lane_ROUND { my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; -.type ChaCha20_8x,\@function,5 +.globl ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,\@function,5 .align 32 -ChaCha20_8x: -.LChaCha20_8x: +ChaCha20_ctr32_avx2: .cfi_startproc + _CET_ENDBR mov %rsp,%r9 # frame register .cfi_def_cfa_register r9 sub \$0x280+$xframe,%rsp @@ -1809,7 +1781,7 @@ sub AVX2_lane_ROUND { .L8x_epilogue: ret .cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 ___ } @@ -2719,22 +2691,22 @@ sub AVX512_lane_ROUND { .section .pdata .align 4 - .rva .LSEH_begin_ChaCha20_ctr32 - .rva .LSEH_end_ChaCha20_ctr32 - .rva .LSEH_info_ChaCha20_ctr32 + .rva .LSEH_begin_ChaCha20_ctr32_nohw + .rva .LSEH_end_ChaCha20_ctr32_nohw + .rva .LSEH_info_ChaCha20_ctr32_nohw - .rva .LSEH_begin_ChaCha20_ssse3 - .rva .LSEH_end_ChaCha20_ssse3 - .rva .LSEH_info_ChaCha20_ssse3 + .rva .LSEH_begin_ChaCha20_ctr32_ssse3 + .rva .LSEH_end_ChaCha20_ctr32_ssse3 + .rva .LSEH_info_ChaCha20_ctr32_ssse3 - .rva .LSEH_begin_ChaCha20_4x - .rva .LSEH_end_ChaCha20_4x - .rva .LSEH_info_ChaCha20_4x + .rva 
.LSEH_begin_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_end_ChaCha20_ctr32_ssse3_4x + .rva .LSEH_info_ChaCha20_ctr32_ssse3_4x ___ $code.=<<___ if ($avx>1); - .rva .LSEH_begin_ChaCha20_8x - .rva .LSEH_end_ChaCha20_8x - .rva .LSEH_info_ChaCha20_8x + .rva .LSEH_begin_ChaCha20_ctr32_avx2 + .rva .LSEH_end_ChaCha20_ctr32_avx2 + .rva .LSEH_info_ChaCha20_ctr32_avx2 ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_ChaCha20_avx512 @@ -2748,22 +2720,22 @@ sub AVX512_lane_ROUND { $code.=<<___; .section .xdata .align 8 -.LSEH_info_ChaCha20_ctr32: +.LSEH_info_ChaCha20_ctr32_nohw: .byte 9,0,0,0 .rva se_handler -.LSEH_info_ChaCha20_ssse3: +.LSEH_info_ChaCha20_ctr32_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lssse3_body,.Lssse3_epilogue -.LSEH_info_ChaCha20_4x: +.LSEH_info_ChaCha20_ctr32_ssse3_4x: .byte 9,0,0,0 .rva full_handler .rva .L4x_body,.L4x_epilogue ___ $code.=<<___ if ($avx>1); -.LSEH_info_ChaCha20_8x: +.LSEH_info_ChaCha20_ctr32_avx2: .byte 9,0,0,0 .rva full_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] diff --git a/crypto/chacha/chacha.c b/crypto/chacha/chacha.c index 8a6dbfb60c0..87727295279 100644 --- a/crypto/chacha/chacha.c +++ b/crypto/chacha/chacha.c @@ -91,6 +91,24 @@ static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, ChaCha20_ctr32_neon(out, in, in_len, key, counter); return; } +#endif +#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + ChaCha20_ctr32_avx2(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + ChaCha20_ctr32_ssse3(out, in, in_len, key, counter); + return; + } #endif if (in_len > 0) { ChaCha20_ctr32_nohw(out, in, in_len, key, counter); diff --git a/crypto/chacha/chacha_test.cc b/crypto/chacha/chacha_test.cc 
index 4e656aa2237..ebfd8f8aca6 100644 --- a/crypto/chacha/chacha_test.cc +++ b/crypto/chacha/chacha_test.cc @@ -359,6 +359,21 @@ static void check_abi(uint8_t *out, const uint8_t *in, size_t in_len, CHECK_ABI(ChaCha20_ctr32_neon, out, in, in_len, key, counter); } #endif +#if defined(CHACHA20_ASM_AVX2) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_avx2, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3_4x, out, in, in_len, key, counter); + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + CHECK_ABI(ChaCha20_ctr32_ssse3, out, in, in_len, key, counter); + } +#endif #if defined(CHACHA20_ASM_NOHW) if (in_len > 0) { CHECK_ABI(ChaCha20_ctr32_nohw, out, in, in_len, key, counter); diff --git a/crypto/chacha/internal.h b/crypto/chacha/internal.h index 4c339c6fcd9..ce0d08620b7 100644 --- a/crypto/chacha/internal.h +++ b/crypto/chacha/internal.h @@ -30,8 +30,7 @@ extern "C" { void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], const uint8_t nonce[16]); -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) #define CHACHA20_ASM @@ -46,6 +45,31 @@ OPENSSL_INLINE int ChaCha20_ctr32_neon_capable(size_t len) { } void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, size_t in_len, const uint32_t key[8], const uint32_t counter[4]); +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_AVX2 +OPENSSL_INLINE int ChaCha20_ctr32_avx2_capable(size_t len) { + return (len > 128) && CRYPTO_is_AVX2_capable(); +} +void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3_4X +OPENSSL_INLINE int 
ChaCha20_ctr32_ssse3_4x_capable(size_t len) { + int capable = (len > 128) && CRYPTO_is_SSSE3_capable(); + int faster = (len > 192) || !CRYPTO_cpu_perf_is_like_silvermont(); + return capable && faster; +} +void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3 +OPENSSL_INLINE int ChaCha20_ctr32_ssse3_capable(size_t len) { + return (len > 128) && CRYPTO_is_SSSE3_capable(); +} +void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]); #endif #if defined(CHACHA20_ASM) diff --git a/crypto/fipsmodule/cpucap/cpu_intel.c b/crypto/fipsmodule/cpucap/cpu_intel.c index 19b6540d023..bbf8543c862 100644 --- a/crypto/fipsmodule/cpucap/cpu_intel.c +++ b/crypto/fipsmodule/cpucap/cpu_intel.c @@ -240,7 +240,8 @@ void OPENSSL_cpuid_setup(void) { // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables // some Silvermont-specific codepaths which perform better. See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. + // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f and + // |CRYPTO_cpu_perf_is_like_silvermont|. if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { ecx &= ~(1u << 26); @@ -267,7 +268,8 @@ void OPENSSL_cpuid_setup(void) { // Clear AVX2 and AVX512* bits. // // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. + // doesn't clear those. See the comments in + // |CRYPTO_cpu_perf_is_like_silvermont|. 
extended_features[0] &= ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); } diff --git a/crypto/fipsmodule/cpucap/internal.h b/crypto/fipsmodule/cpucap/internal.h index 63d9cc6e689..b145129b825 100644 --- a/crypto/fipsmodule/cpucap/internal.h +++ b/crypto/fipsmodule/cpucap/internal.h @@ -34,7 +34,8 @@ void OPENSSL_cpuid_setup(void); // ECX for CPUID where EAX = 7 // // Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM -// bits in XCR0, so it is not necessary to check those. +// bits in XCR0, so it is not necessary to check those. (WARNING: See caveats +// in cpu_intel.c.) extern uint32_t OPENSSL_ia32cap_P[4]; #if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY) @@ -81,6 +82,9 @@ OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0; } +// We intentionally avoid defining a |CRYPTO_is_XSAVE_capable| function. See +// |CRYPTO_cpu_perf_is_like_silvermont|. + OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) { return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0; } @@ -131,6 +135,29 @@ OPENSSL_INLINE int CRYPTO_is_VBMI2_capable(void) { return (OPENSSL_ia32cap_get()[3] & (1 << 6)) != 0; } +// CRYPTO_cpu_perf_is_like_silvermont returns one if, based on a heuristic, the +// CPU has Silvermont-like performance characteristics. It is often faster to +// run different codepaths on these CPUs than the available instructions would +// otherwise select. See chacha-x86_64.pl. +// +// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by +// this. |OPENSSL_cpuid_setup| forces Knights Landing to also be matched by +// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it +// isn't matched by this. Various sources indicate AMD first implemented MOVBE +// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be +// matched by this. That seems to be the case for other x86(-64) CPUs. 
+OPENSSL_INLINE int CRYPTO_cpu_perf_is_like_silvermont(void) { + // WARNING: This MUST NOT be used to guard the execution of the XSAVE + // instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit + // that indicates whether we can safely execute XSAVE. This bit may be set + // even when XSAVE is disabled (by the operating system). See the comment in + // cpu_intel.c and check how the users of this bit use it. + // + // We do not use |__XSAVE__| for static detection because the hack in + // |OPENSSL_cpuid_setup| for Knights Landing CPUs needs to override it. + int hardware_supports_xsave = (OPENSSL_ia32cap_get()[1] & (1u << 26)) != 0; + return !hardware_supports_xsave && CRYPTO_is_MOVBE_capable(); +} #endif // OPENSSL_X86 || OPENSSL_X86_64