Remove alignment requirement on ghash-ssse3
https://boringssl-review.googlesource.com/c/boringssl/+/74268 broke the
SSSE3 GHASH fallback because it no longer maintained alignment of
Htable. We had been relying on the memcpy to copy Htable into something
aligned.

Maintaining the alignment requirement without the memcpy is kind of a
nuisance because it now leaks into EVP_AEAD_CTX. Since we don't have a
good way to make caller-allocatable structs aligned, it would mean
allocating 15 extra bytes and then finding the right position.
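
A minimal sketch of that rejected workaround, for concreteness. This is
hypothetical code, not anything in the tree; the function name and
layout are invented for illustration.

// Over-allocate by alignment-1 = 15 bytes, then round the pointer up to
// the next 16-byte boundary, so a 16-byte-aligned object always fits
// inside the caller-provided storage.
#include <cstddef>
#include <cstdint>
#include <cstdlib>

void *alloc_aligned16(size_t size) {
  void *raw = malloc(size + 15);
  if (raw == nullptr) {
    return nullptr;
  }
  uintptr_t p = reinterpret_cast<uintptr_t>(raw);
  // Rounding up moves at most 15 bytes past |raw|, so |size| bytes still
  // fit within the allocation.
  return reinterpret_cast<void *>((p + 15) & ~static_cast<uintptr_t>(15));
}

Note the catch: the caller must separately remember |raw| in order to
free it, which is exactly the sort of bookkeeping that would leak into
EVP_AEAD_CTX.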

Benchmarks show that the alignment makes no difference on an Intel(R)
Xeon(R) Gold 6154 CPU @ 3.00GHz. Of course, this is artificial because
that CPU would never run this code anyway.

I recall adding the alignment requirement because it gave a bit of a
perf boost on the old Mac Mini 2010 I was testing against, whose CPU
actually would run this code. I was able to dig it up, but
apparently I no longer have a keyboard that's compatible with it. (That
machine is also long EOL and cannot even run Chrome's minimum macOS
version, although its CPU may be representative of older Windows
hardware.)

Regardless, I don't think it makes sense to spend this complexity here.
(See the internal Chrome UMA Net.QuicSession.PreferAesGcm on Windows for
the percentage of Windows machines that would run this code. Though
they should also be using ChaCha20-Poly1305 anyway.)

Bug: 42290477
Change-Id: I4ef8c636bfc18200869f011ea50cc5d4988244ba
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/74327
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
davidben authored and Boringssl LUCI CQ committed Dec 15, 2024
1 parent 9e65c3e commit 676a802
Showing 10 changed files with 48 additions and 84 deletions.
46 changes: 9 additions & 37 deletions crypto/fipsmodule/cipher/e_aes.cc.inc
@@ -241,35 +241,9 @@ static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
return 1;
}

-#if defined(OPENSSL_32_BIT)
-#define EVP_AES_GCM_CTX_PADDING (4 + 8)
-#else
-#define EVP_AES_GCM_CTX_PADDING 8
-#endif
-
-static EVP_AES_GCM_CTX *aes_gcm_from_cipher_ctx(EVP_CIPHER_CTX *ctx) {
-  static_assert(
-      alignof(EVP_AES_GCM_CTX) <= 16,
-      "EVP_AES_GCM_CTX needs more alignment than this function provides");
-
-  // |malloc| guarantees up to 4-byte alignment on 32-bit and 8-byte alignment
-  // on 64-bit systems, so we need to adjust to reach 16-byte alignment.
-  assert(ctx->cipher->ctx_size ==
-         sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING);
-
-  char *ptr = reinterpret_cast<char *>(ctx->cipher_data);
-#if defined(OPENSSL_32_BIT)
-  assert((uintptr_t)ptr % 4 == 0);
-  ptr += (uintptr_t)ptr & 4;
-#endif
-  assert((uintptr_t)ptr % 8 == 0);
-  ptr += (uintptr_t)ptr & 8;
-  return (EVP_AES_GCM_CTX *)ptr;
-}
-
static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
const uint8_t *iv, int enc) {
-  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
+  EVP_AES_GCM_CTX *gctx = reinterpret_cast<EVP_AES_GCM_CTX*>(ctx->cipher_data);
if (!iv && !key) {
return 1;
}
@@ -312,7 +286,7 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
}

static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) {
-  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
+  EVP_AES_GCM_CTX *gctx = reinterpret_cast<EVP_AES_GCM_CTX*>(c->cipher_data);
OPENSSL_cleanse(&gctx->key, sizeof(gctx->key));
OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
if (gctx->iv != c->iv) {
@@ -321,7 +295,7 @@ static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) {
}

static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {
-  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c);
+  EVP_AES_GCM_CTX *gctx = reinterpret_cast<EVP_AES_GCM_CTX*>(c->cipher_data);
switch (type) {
case EVP_CTRL_INIT:
gctx->key_set = 0;
@@ -421,10 +395,8 @@ static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) {

case EVP_CTRL_COPY: {
EVP_CIPHER_CTX *out = reinterpret_cast<EVP_CIPHER_CTX *>(ptr);
-      EVP_AES_GCM_CTX *gctx_out = aes_gcm_from_cipher_ctx(out);
-      // |EVP_CIPHER_CTX_copy| copies this generically, but we must redo it in
-      // case |out->cipher_data| and |in->cipher_data| are differently aligned.
-      OPENSSL_memcpy(gctx_out, gctx, sizeof(EVP_AES_GCM_CTX));
+      EVP_AES_GCM_CTX *gctx_out =
+          reinterpret_cast<EVP_AES_GCM_CTX *>(out->cipher_data);
if (gctx->iv == c->iv) {
gctx_out->iv = out->iv;
} else {
@@ -444,7 +416,7 @@

static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t len) {
-  EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx);
+  EVP_AES_GCM_CTX *gctx = reinterpret_cast<EVP_AES_GCM_CTX*>(ctx->cipher_data);

// If not set up, return error
if (!gctx->key_set) {
@@ -552,7 +524,7 @@ DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_gcm) {
out->block_size = 1;
out->key_len = 16;
out->iv_len = AES_GCM_NONCE_LENGTH;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
@@ -620,7 +592,7 @@ DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_gcm) {
out->block_size = 1;
out->key_len = 24;
out->iv_len = AES_GCM_NONCE_LENGTH;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
@@ -688,7 +660,7 @@ DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_gcm) {
out->block_size = 1;
out->key_len = 32;
out->iv_len = AES_GCM_NONCE_LENGTH;
-  out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING;
+  out->ctx_size = sizeof(EVP_AES_GCM_CTX);
out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY |
EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT |
EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER;
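
For context, the deleted |aes_gcm_from_cipher_ctx| trick is worth a
note: an 8-byte-aligned address is congruent to 0 or 8 mod 16, so
adding bit 3 of the address (which has value 0 or 8) always lands on a
multiple of 16. The same arithmetic as a stand-alone sketch,
illustrative only, not library code:

#include <cassert>
#include <cstdint>

char *bump_to_16(char *ptr) {
  assert(reinterpret_cast<uintptr_t>(ptr) % 8 == 0);
  // e.g. 0x1008 -> 0x1010, while 0x1010 is left unchanged.
  ptr += reinterpret_cast<uintptr_t>(ptr) & 8;
  assert(reinterpret_cast<uintptr_t>(ptr) % 16 == 0);
  return ptr;
}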
2 changes: 1 addition & 1 deletion crypto/fipsmodule/modes/asm/ghash-ssse3-x86.pl
@@ -100,7 +100,7 @@ sub process_rows {

&mov("eax", $rows);
&set_label("loop_row_$call_counter");
-	&movdqa("xmm4", &QWP(0, $Htable));
+	&movdqu("xmm4", &QWP(0, $Htable));
&lea($Htable, &DWP(16, $Htable));

# Right-shift xmm2 and xmm3 by 8 bytes.
2 changes: 1 addition & 1 deletion crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
@@ -156,7 +156,7 @@ sub process_rows {
return <<____;
movq \$$rows, %rax
.Loop_row_$call_counter:
-	movdqa ($Htable), %xmm4
+	movdqu ($Htable), %xmm4
leaq 16($Htable), $Htable
# Right-shift %xmm2 and %xmm3 by 8 bytes.
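
The movdqa to movdqu swap in these two perl sources (and in the
generated files below) is the entire functional change: movdqa faults
when its memory operand is not 16-byte-aligned, while movdqu accepts
any address. A rough intrinsics analogue of the new load, illustrative
only since the real code is the generated assembly:

#include <emmintrin.h>  // SSE2

// Load one 16-byte row of Htable without assuming alignment, as the
// patched assembly now does with movdqu.
__m128i load_htable_row(const void *row) {
  // _mm_load_si128 (movdqa) would require |row| to be 16-byte-aligned
  // and fault otherwise; _mm_loadu_si128 (movdqu) tolerates anything.
  return _mm_loadu_si128(static_cast<const __m128i *>(row));
}

On recent CPUs, movdqu on data that happens to be aligned costs the
same as movdqa, matching the benchmark above; the commit message
concedes that the older CPUs which actually run this fallback may have
paid a small penalty.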
10 changes: 1 addition & 9 deletions crypto/fipsmodule/modes/internal.h
@@ -126,10 +126,6 @@ typedef void (*ghash_func)(uint8_t Xi[16], const u128 Htable[16],
const uint8_t *inp, size_t len);

typedef struct gcm128_key_st {
-  // |gcm_*_ssse3| require a 16-byte-aligned |Htable| when hashing data, but not
-  // initialization. |GCM128_KEY| is not itself aligned to simplify embedding in
-  // |EVP_AEAD_CTX|, but |Htable|'s offset must be a multiple of 16.
-  // TODO(crbug.com/boringssl/604): Revisit this.
u128 Htable[16];
gmult_func gmult;
ghash_func ghash;
@@ -223,8 +219,6 @@ void gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp,
size_t len);

-// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
-// 16-byte-aligned, but |gcm_init_ssse3| does not.
void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
void gcm_gmult_ssse3(uint8_t Xi[16], const u128 Htable[16]);
void gcm_ghash_ssse3(uint8_t Xi[16], const u128 Htable[16], const uint8_t *in,
@@ -382,9 +376,7 @@ size_t CRYPTO_cts128_encrypt_block(const uint8_t *in, uint8_t *out, size_t len,

struct polyval_ctx {
uint8_t S[16];
-  // |gcm_*_ssse3| require |Htable| to be 16-byte-aligned.
-  // TODO(crbug.com/boringssl/604): Revisit this.
-  alignas(16) u128 Htable[16];
+  u128 Htable[16];
gmult_func gmult;
ghash_func ghash;
};
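
The deleted comments encoded an invariant that was easy to break
silently: |GCM128_KEY| itself was unaligned, so the aligned loads only
worked because |Htable|'s offset within the struct was a multiple of
16. A compile-time check of that old invariant could have looked like
this (hypothetical sketch with stand-in types, not from the tree):

#include <cstddef>
#include <cstdint>

// Stand-in for BoringSSL's u128; the real definition is in internal.h.
struct u128 { uint64_t lo, hi; };

struct gcm128_key_sketch {
  u128 Htable[16];  // First member, so its offset is 0, a multiple of 16.
  // ... function pointers and key material follow in the real struct.
};

static_assert(offsetof(gcm128_key_sketch, Htable) % 16 == 0,
              "Htable must sit at a 16-byte offset for aligned loads");

With movdqu the requirement disappears, and the comments and alignas go
with it.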
12 changes: 6 additions & 6 deletions gen/bcm/ghash-ssse3-x86-apple.S
@@ -31,7 +31,7 @@ L000pic_point:
pxor %xmm3,%xmm3
movl $5,%eax
L001loop_row_1:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -62,7 +62,7 @@ L001loop_row_1:
pxor %xmm3,%xmm3
movl $5,%eax
L002loop_row_2:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -93,7 +93,7 @@ L002loop_row_2:
pxor %xmm3,%xmm3
movl $6,%eax
L003loop_row_3:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -169,7 +169,7 @@ L005loop_ghash:
pxor %xmm2,%xmm2
movl $5,%eax
L006loop_row_4:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -200,7 +200,7 @@ L006loop_row_4:
pxor %xmm3,%xmm3
movl $5,%eax
L007loop_row_5:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -231,7 +231,7 @@ L007loop_row_5:
pxor %xmm3,%xmm3
movl $6,%eax
L008loop_row_6:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
12 changes: 6 additions & 6 deletions gen/bcm/ghash-ssse3-x86-linux.S
@@ -32,7 +32,7 @@ gcm_gmult_ssse3:
pxor %xmm3,%xmm3
movl $5,%eax
.L001loop_row_1:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -63,7 +63,7 @@ gcm_gmult_ssse3:
pxor %xmm3,%xmm3
movl $5,%eax
.L002loop_row_2:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -94,7 +94,7 @@ gcm_gmult_ssse3:
pxor %xmm3,%xmm3
movl $6,%eax
.L003loop_row_3:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -172,7 +172,7 @@ gcm_ghash_ssse3:
pxor %xmm2,%xmm2
movl $5,%eax
.L006loop_row_4:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -203,7 +203,7 @@ gcm_ghash_ssse3:
pxor %xmm3,%xmm3
movl $5,%eax
.L007loop_row_5:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
@@ -234,7 +234,7 @@ gcm_ghash_ssse3:
pxor %xmm3,%xmm3
movl $6,%eax
.L008loop_row_6:
-	movdqa (%esi),%xmm4
+	movdqu (%esi),%xmm4
leal 16(%esi),%esi
movdqa %xmm2,%xmm6
.byte 102,15,58,15,243,1
12 changes: 6 additions & 6 deletions gen/bcm/ghash-ssse3-x86-win.asm
@@ -38,7 +38,7 @@ db 102,15,56,0,199
pxor xmm3,xmm3
mov eax,5
L$001loop_row_1:
-	movdqa xmm4,[esi]
+	movdqu xmm4,[esi]
lea esi,[16+esi]
movdqa xmm6,xmm2
db 102,15,58,15,243,1
@@ -69,7 +69,7 @@ db 102,15,56,0,233
pxor xmm3,xmm3
mov eax,5
L$002loop_row_2:
-	movdqa xmm4,[esi]
+	movdqu xmm4,[esi]
lea esi,[16+esi]
movdqa xmm6,xmm2
db 102,15,58,15,243,1
@@ -100,7 +100,7 @@ db 102,15,56,0,233
pxor xmm3,xmm3
mov eax,6
L$003loop_row_3:
-	movdqa xmm4,[esi]
+	movdqu xmm4,[esi]
lea esi,[16+esi]
movdqa xmm6,xmm2
db 102,15,58,15,243,1
@@ -175,7 +175,7 @@ db 102,15,56,0,207
pxor xmm2,xmm2
mov eax,5
L$006loop_row_4:
-	movdqa xmm4,[esi]
+	movdqu xmm4,[esi]
lea esi,[16+esi]
movdqa xmm6,xmm2
db 102,15,58,15,243,1
@@ -206,7 +206,7 @@ db 102,15,56,0,233
pxor xmm3,xmm3
mov eax,5
L$007loop_row_5:
-	movdqa xmm4,[esi]
+	movdqu xmm4,[esi]
lea esi,[16+esi]
movdqa xmm6,xmm2
db 102,15,58,15,243,1
@@ -237,7 +237,7 @@ db 102,15,56,0,233
pxor xmm3,xmm3
mov eax,6
L$008loop_row_6:
-	movdqa xmm4,[esi]
+	movdqu xmm4,[esi]
lea esi,[16+esi]
movdqa xmm6,xmm2
db 102,15,58,15,243,1
12 changes: 6 additions & 6 deletions gen/bcm/ghash-ssse3-x86_64-apple.S
@@ -38,7 +38,7 @@ _CET_ENDBR
pxor %xmm3,%xmm3
movq $5,%rax
L$oop_row_1:
-	movdqa (%rsi),%xmm4
+	movdqu (%rsi),%xmm4
leaq 16(%rsi),%rsi


@@ -86,7 +86,7 @@ L$oop_row_1:
pxor %xmm3,%xmm3
movq $5,%rax
L$oop_row_2:
-	movdqa (%rsi),%xmm4
+	movdqu (%rsi),%xmm4
leaq 16(%rsi),%rsi


@@ -134,7 +134,7 @@ L$oop_row_2:
pxor %xmm3,%xmm3
movq $6,%rax
L$oop_row_3:
-	movdqa (%rsi),%xmm4
+	movdqu (%rsi),%xmm4
leaq 16(%rsi),%rsi


@@ -241,7 +241,7 @@ L$oop_ghash:

movq $5,%rax
L$oop_row_4:
-	movdqa (%rsi),%xmm4
+	movdqu (%rsi),%xmm4
leaq 16(%rsi),%rsi


@@ -289,7 +289,7 @@ L$oop_row_4:
pxor %xmm3,%xmm3
movq $5,%rax
L$oop_row_5:
-	movdqa (%rsi),%xmm4
+	movdqu (%rsi),%xmm4
leaq 16(%rsi),%rsi


@@ -337,7 +337,7 @@ L$oop_row_5:
pxor %xmm3,%xmm3
movq $6,%rax
L$oop_row_6:
-	movdqa (%rsi),%xmm4
+	movdqu (%rsi),%xmm4
leaq 16(%rsi),%rsi

