author    Salvatore Bonaccorso <carnil@debian.org>    2021-03-19 16:48:59 +0100
committer Salvatore Bonaccorso <carnil@debian.org>    2021-03-19 16:49:34 +0100
commit    b04c1cdb3e7a7c4744c644394db8df3f702bdb65 (patch)
tree      38835070b06715e244fbc159896b2a9c9835f488
parent    724a0bce9f1ed34874a44beae19d287b8b0b2c7c (diff)
[x86] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
-rw-r--r--  debian/changelog                                                                        |   1
-rw-r--r--  debian/patches/bugfix/x86/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch    | 278
-rw-r--r--  debian/patches/series                                                                   |   1
3 files changed, 280 insertions, 0 deletions
diff --git a/debian/changelog b/debian/changelog
index 0366e7064231..7f29ad286e47 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -886,6 +886,7 @@ linux (5.10.24-1) UNRELEASED; urgency=medium
* bpf: Add sanity check for upper ptr_limit
* bpf, selftests: Fix up some test_verifier cases for unprivileged
* [x86] crypto: aesni - Use TEST %reg,%reg instead of CMP $0,%reg
+ * [x86] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
[ Wookey ]
* [arm64] drivers/perf: Enable ARM_CMN as module (Closes: #981186)
diff --git a/debian/patches/bugfix/x86/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch b/debian/patches/bugfix/x86/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch
new file mode 100644
index 000000000000..8310e7a82a5f
--- /dev/null
+++ b/debian/patches/bugfix/x86/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch
@@ -0,0 +1,278 @@
+From fcbbb7a6484bd5ae998197af4bb02dec2f495414 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 31 Dec 2020 17:41:54 +0100
+Subject: crypto: x86/aes-ni-xts - use direct calls to and 4-way stride
+
+From: Ard Biesheuvel <ardb@kernel.org>
+
+[ Upstream commit 86ad60a65f29dd862a11c22bb4b5be28d6c5cef1 ]
+
+The XTS asm helper arrangement is a bit odd: the 8-way stride helper
+consists of back-to-back calls to the 4-way core transforms, which
+are called indirectly, based on a boolean that indicates whether we
+are performing encryption or decryption.
+
+Given how costly indirect calls are on x86, let's switch to direct
+calls, and given how the 8-way stride doesn't really add anything
+substantial, use a 4-way stride instead, and make the asm core
+routine deal with any multiple of 4 blocks. Since 512 byte sectors
+or 4 KB blocks are the typical quantities XTS operates on, increase
+the stride exported to the glue helper to 512 bytes as well.
+
+As a result, the number of indirect calls is reduced from 3 per 64 bytes
+of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
+when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU)
+
+Fixes: 9697fa39efd3f ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
+Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/crypto/aesni-intel_asm.S | 115 ++++++++++++++++++-----------
+ arch/x86/crypto/aesni-intel_glue.c | 25 ++++---
+ 2 files changed, 84 insertions(+), 56 deletions(-)
+
+diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
+index d1436c37008b..57aef3f5a81e 100644
+--- a/arch/x86/crypto/aesni-intel_asm.S
++++ b/arch/x86/crypto/aesni-intel_asm.S
+@@ -2715,25 +2715,18 @@ SYM_FUNC_END(aesni_ctr_enc)
+ pxor CTR, IV;
+
+ /*
+- * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
+- * const u8 *src, bool enc, le128 *iv)
++ * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
++ * const u8 *src, unsigned int len, le128 *iv)
+ */
+-SYM_FUNC_START(aesni_xts_crypt8)
++SYM_FUNC_START(aesni_xts_encrypt)
+ FRAME_BEGIN
+- testb %cl, %cl
+- movl $0, %ecx
+- movl $240, %r10d
+- leaq _aesni_enc4, %r11
+- leaq _aesni_dec4, %rax
+- cmovel %r10d, %ecx
+- cmoveq %rax, %r11
+
+ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+ movups (IVP), IV
+
+ mov 480(KEYP), KLEN
+- addq %rcx, KEYP
+
++.Lxts_enc_loop4:
+ movdqa IV, STATE1
+ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+@@ -2757,71 +2750,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
+ pxor INC, STATE4
+ movdqu IV, 0x30(OUTP)
+
+- CALL_NOSPEC r11
++ call _aesni_enc4
+
+ movdqu 0x00(OUTP), INC
+ pxor INC, STATE1
+ movdqu STATE1, 0x00(OUTP)
+
+- _aesni_gf128mul_x_ble()
+- movdqa IV, STATE1
+- movdqu 0x40(INP), INC
+- pxor INC, STATE1
+- movdqu IV, 0x40(OUTP)
+-
+ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+ movdqu STATE2, 0x10(OUTP)
+
+- _aesni_gf128mul_x_ble()
+- movdqa IV, STATE2
+- movdqu 0x50(INP), INC
+- pxor INC, STATE2
+- movdqu IV, 0x50(OUTP)
+-
+ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+ movdqu STATE3, 0x20(OUTP)
+
+- _aesni_gf128mul_x_ble()
+- movdqa IV, STATE3
+- movdqu 0x60(INP), INC
+- pxor INC, STATE3
+- movdqu IV, 0x60(OUTP)
+-
+ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+ movdqu STATE4, 0x30(OUTP)
+
+ _aesni_gf128mul_x_ble()
+- movdqa IV, STATE4
+- movdqu 0x70(INP), INC
+- pxor INC, STATE4
+- movdqu IV, 0x70(OUTP)
+
+- _aesni_gf128mul_x_ble()
++ add $64, INP
++ add $64, OUTP
++ sub $64, LEN
++ ja .Lxts_enc_loop4
++
+ movups IV, (IVP)
+
+- CALL_NOSPEC r11
++ FRAME_END
++ ret
++SYM_FUNC_END(aesni_xts_encrypt)
++
++/*
++ * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
++ * const u8 *src, unsigned int len, le128 *iv)
++ */
++SYM_FUNC_START(aesni_xts_decrypt)
++ FRAME_BEGIN
++
++ movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
++ movups (IVP), IV
++
++ mov 480(KEYP), KLEN
++ add $240, KEYP
+
+- movdqu 0x40(OUTP), INC
++.Lxts_dec_loop4:
++ movdqa IV, STATE1
++ movdqu 0x00(INP), INC
+ pxor INC, STATE1
+- movdqu STATE1, 0x40(OUTP)
++ movdqu IV, 0x00(OUTP)
+
+- movdqu 0x50(OUTP), INC
++ _aesni_gf128mul_x_ble()
++ movdqa IV, STATE2
++ movdqu 0x10(INP), INC
++ pxor INC, STATE2
++ movdqu IV, 0x10(OUTP)
++
++ _aesni_gf128mul_x_ble()
++ movdqa IV, STATE3
++ movdqu 0x20(INP), INC
++ pxor INC, STATE3
++ movdqu IV, 0x20(OUTP)
++
++ _aesni_gf128mul_x_ble()
++ movdqa IV, STATE4
++ movdqu 0x30(INP), INC
++ pxor INC, STATE4
++ movdqu IV, 0x30(OUTP)
++
++ call _aesni_dec4
++
++ movdqu 0x00(OUTP), INC
++ pxor INC, STATE1
++ movdqu STATE1, 0x00(OUTP)
++
++ movdqu 0x10(OUTP), INC
+ pxor INC, STATE2
+- movdqu STATE2, 0x50(OUTP)
++ movdqu STATE2, 0x10(OUTP)
+
+- movdqu 0x60(OUTP), INC
++ movdqu 0x20(OUTP), INC
+ pxor INC, STATE3
+- movdqu STATE3, 0x60(OUTP)
++ movdqu STATE3, 0x20(OUTP)
+
+- movdqu 0x70(OUTP), INC
++ movdqu 0x30(OUTP), INC
+ pxor INC, STATE4
+- movdqu STATE4, 0x70(OUTP)
++ movdqu STATE4, 0x30(OUTP)
++
++ _aesni_gf128mul_x_ble()
++
++ add $64, INP
++ add $64, OUTP
++ sub $64, LEN
++ ja .Lxts_dec_loop4
++
++ movups IV, (IVP)
+
+ FRAME_END
+ ret
+-SYM_FUNC_END(aesni_xts_crypt8)
++SYM_FUNC_END(aesni_xts_decrypt)
+
+ #endif
+diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
+index f9a1d98e7534..be891fdf8d17 100644
+--- a/arch/x86/crypto/aesni-intel_glue.c
++++ b/arch/x86/crypto/aesni-intel_glue.c
+@@ -97,6 +97,12 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
+ #define AVX_GEN2_OPTSIZE 640
+ #define AVX_GEN4_OPTSIZE 4096
+
++asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
++ const u8 *in, unsigned int len, u8 *iv);
++
++asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
++ const u8 *in, unsigned int len, u8 *iv);
++
+ #ifdef CONFIG_X86_64
+
+ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+@@ -104,9 +110,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
+
+-asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
+- const u8 *in, bool enc, le128 *iv);
+-
+ /* asmlinkage void aesni_gcm_enc()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data. May be uninitialized.
+@@ -547,14 +550,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
+ }
+
+-static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
++static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ {
+- aesni_xts_crypt8(ctx, dst, src, true, iv);
++ aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+ }
+
+-static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
++static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
+ {
+- aesni_xts_crypt8(ctx, dst, src, false, iv);
++ aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
+ }
+
+ static const struct common_glue_ctx aesni_enc_xts = {
+@@ -562,8 +565,8 @@ static const struct common_glue_ctx aesni_enc_xts = {
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+- .num_blocks = 8,
+- .fn_u = { .xts = aesni_xts_enc8 }
++ .num_blocks = 32,
++ .fn_u = { .xts = aesni_xts_enc32 }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = aesni_xts_enc }
+@@ -575,8 +578,8 @@ static const struct common_glue_ctx aesni_dec_xts = {
+ .fpu_blocks_limit = 1,
+
+ .funcs = { {
+- .num_blocks = 8,
+- .fn_u = { .xts = aesni_xts_dec8 }
++ .num_blocks = 32,
++ .fn_u = { .xts = aesni_xts_dec32 }
+ }, {
+ .num_blocks = 1,
+ .fn_u = { .xts = aesni_xts_dec }
+--
+2.30.1
+
diff --git a/debian/patches/series b/debian/patches/series
index 7634c44bccb8..8a72da81f97f 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -76,6 +76,7 @@ bugfix/powerpc/powerpc-boot-fix-missing-crc32poly.h-when-building-with-kernel_xz
bugfix/arm64/arm64-acpi-Add-fixup-for-HPE-m400-quirks.patch
bugfix/x86/x86-32-disable-3dnow-in-generic-config.patch
bugfix/x86/crypto-aesni-use-test-reg-reg-instead-of-cmp-0-reg.patch
+bugfix/x86/crypto-x86-aes-ni-xts-use-direct-calls-to-and-4-way-.patch
# Arch features
features/arm64/arm64-dts-rockchip-Add-basic-support-for-Kobol-s-Hel.patch
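
Editor's note: the patch above replaces the retpolined CALL_NOSPEC dispatch in aesni_xts_crypt8 with two dedicated entry points (aesni_xts_encrypt / aesni_xts_decrypt) that loop internally over 4-block strides, while the glue layer now hands them 512 bytes (32 AES blocks) per call. As a reading aid only, here is a minimal C sketch of the control flow the new encrypt loop implements. It is not kernel code: enc4_core() and gf128mul_x_ble_c() are placeholder names invented for this sketch, and the 4-way AES core is stubbed out so that only the call structure remains.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

/*
 * Stand-in for the 4-way AES-NI core (_aesni_enc4).  The real routine runs
 * the AES rounds on all four states in parallel; here it is left as a no-op
 * so the sketch compiles -- only the call structure matters.
 */
static void enc4_core(const void *key, uint8_t state[4][AES_BLOCK_SIZE])
{
	(void)key;
	(void)state;
}

/* XTS tweak update: multiply the 128-bit little-endian tweak by x in GF(2^128). */
static void gf128mul_x_ble_c(uint8_t t[AES_BLOCK_SIZE])
{
	int carry = t[15] & 0x80;

	for (int i = 15; i > 0; i--)
		t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
	t[0] = (uint8_t)(t[0] << 1);
	if (carry)
		t[0] ^= 0x87;
}

/*
 * 4-way stride loop mirroring .Lxts_enc_loop4: every pass consumes 64 bytes
 * (4 blocks) and makes a single direct call to the 4-way core.  len is
 * assumed to be a non-zero multiple of 64; the glue layer passes 512 bytes.
 */
static void xts_encrypt_sketch(const void *key, uint8_t *dst,
			       const uint8_t *src, unsigned int len,
			       uint8_t iv[AES_BLOCK_SIZE])
{
	uint8_t state[4][AES_BLOCK_SIZE];

	do {
		for (int i = 0; i < 4; i++) {
			/* state = plaintext XOR tweak; park the tweak in dst */
			for (int j = 0; j < AES_BLOCK_SIZE; j++)
				state[i][j] = src[i * AES_BLOCK_SIZE + j] ^ iv[j];
			memcpy(dst + i * AES_BLOCK_SIZE, iv, AES_BLOCK_SIZE);
			gf128mul_x_ble_c(iv);	/* advance to the next tweak */
		}

		enc4_core(key, state);		/* direct call, no retpoline */

		/* ciphertext = core output XOR the tweak parked in dst */
		for (int i = 0; i < 4; i++)
			for (int j = 0; j < AES_BLOCK_SIZE; j++)
				dst[i * AES_BLOCK_SIZE + j] ^= state[i][j];

		src += 4 * AES_BLOCK_SIZE;
		dst += 4 * AES_BLOCK_SIZE;
		len -= 4 * AES_BLOCK_SIZE;
	} while (len);
}

int main(void)
{
	uint8_t src[512] = { 0 }, dst[512], iv[AES_BLOCK_SIZE] = { 1 };

	/* One 512-byte chunk, i.e. what the glue layer now passes per call. */
	xts_encrypt_sketch(NULL, dst, src, sizeof(src), iv);
	printf("processed %zu bytes in 4-block strides\n", sizeof(src));
	return 0;
}

With this structure the only cipher call per 64-byte stride is a plain direct call the CPU can predict, and the indirect dispatch through the glue function pointer happens once per 512-byte chunk rather than once per 8-block group, which is where the speedup quoted in the commit message comes from.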