summaryrefslogtreecommitdiff
path: root/arch/x86/crypto
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--arch/x86/crypto/Kconfig131
-rw-r--r--arch/x86/crypto/Makefile23
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c13
-rw-r--r--arch/x86/crypto/aes-ctr-avx-x86_64.S47
-rw-r--r--arch/x86/crypto/aes-xts-avx-x86_64.S206
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c174
-rw-r--r--arch/x86/crypto/aria_aesni_avx2_glue.c22
-rw-r--r--arch/x86/crypto/aria_aesni_avx_glue.c20
-rw-r--r--arch/x86/crypto/aria_gfni_avx512_glue.c22
-rw-r--r--arch/x86/crypto/blake2s-core.S256
-rw-r--r--arch/x86/crypto/blake2s-glue.c74
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c21
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c21
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c21
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c20
-rw-r--r--arch/x86/crypto/chacha-avx2-x86_64.S1021
-rw-r--r--arch/x86/crypto/chacha-avx512vl-x86_64.S836
-rw-r--r--arch/x86/crypto/chacha-ssse3-x86_64.S791
-rw-r--r--arch/x86/crypto/chacha_glue.c311
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S5
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c289
-rw-r--r--arch/x86/crypto/poly1305-x86_64-cryptogams.pl4248
-rw-r--r--arch/x86/crypto/poly1305_glue.c290
-rw-r--r--arch/x86/crypto/polyval-clmulni_glue.c72
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c21
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c21
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c21
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c89
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S499
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S774
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S513
-rw-r--r--arch/x86/crypto/sha256_ni_asm.S200
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c467
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c75
-rw-r--r--arch/x86/crypto/sm3_avx_glue.c54
-rw-r--r--arch/x86/crypto/sm4_aesni_avx2_glue.c31
-rw-r--r--arch/x86/crypto/sm4_aesni_avx_glue.c31
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c21
38 files changed, 435 insertions, 11316 deletions
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 3d948f10c94c..56cfdc79e2c6 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -4,7 +4,7 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)"
config CRYPTO_CURVE25519_X86
tristate
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_KPP
select CRYPTO_LIB_CURVE25519_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
@@ -17,13 +17,11 @@ config CRYPTO_CURVE25519_X86
config CRYPTO_AES_NI_INTEL
tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)"
- depends on X86
select CRYPTO_AEAD
select CRYPTO_LIB_AES
select CRYPTO_LIB_GF128MUL
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
help
Block cipher: AES cipher algorithms
AEAD cipher: AES with GCM
@@ -38,7 +36,7 @@ config CRYPTO_AES_NI_INTEL
config CRYPTO_BLOWFISH_X86_64
tristate "Ciphers: Blowfish, modes: ECB, CBC"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_BLOWFISH_COMMON
imply CRYPTO_CTR
@@ -50,7 +48,7 @@ config CRYPTO_BLOWFISH_X86_64
config CRYPTO_CAMELLIA_X86_64
tristate "Ciphers: Camellia with modes: ECB, CBC"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
imply CRYPTO_CTR
help
@@ -61,10 +59,9 @@ config CRYPTO_CAMELLIA_X86_64
config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_CAMELLIA_X86_64
- select CRYPTO_SIMD
imply CRYPTO_XTS
help
Length-preserving ciphers: Camellia with ECB and CBC modes
@@ -75,7 +72,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
help
Length-preserving ciphers: Camellia with ECB and CBC modes
@@ -86,11 +83,10 @@ config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
config CRYPTO_CAST5_AVX_X86_64
tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_CAST5
select CRYPTO_CAST_COMMON
- select CRYPTO_SIMD
imply CRYPTO_CTR
help
Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm
@@ -103,11 +99,10 @@ config CRYPTO_CAST5_AVX_X86_64
config CRYPTO_CAST6_AVX_X86_64
tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_CAST6
select CRYPTO_CAST_COMMON
- select CRYPTO_SIMD
imply CRYPTO_XTS
imply CRYPTO_CTR
help
@@ -121,7 +116,7 @@ config CRYPTO_CAST6_AVX_X86_64
config CRYPTO_DES3_EDE_X86_64
tristate "Ciphers: Triple DES EDE with modes: ECB, CBC"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_LIB_DES
imply CRYPTO_CTR
@@ -135,10 +130,9 @@ config CRYPTO_DES3_EDE_X86_64
config CRYPTO_SERPENT_SSE2_X86_64
tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SERPENT
- select CRYPTO_SIMD
imply CRYPTO_CTR
help
Length-preserving ciphers: Serpent cipher algorithm
@@ -151,10 +145,9 @@ config CRYPTO_SERPENT_SSE2_X86_64
config CRYPTO_SERPENT_SSE2_586
tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)"
- depends on X86 && !64BIT
+ depends on !64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SERPENT
- select CRYPTO_SIMD
imply CRYPTO_CTR
help
Length-preserving ciphers: Serpent cipher algorithm
@@ -167,10 +160,9 @@ config CRYPTO_SERPENT_SSE2_586
config CRYPTO_SERPENT_AVX_X86_64
tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_SERPENT
- select CRYPTO_SIMD
imply CRYPTO_XTS
imply CRYPTO_CTR
help
@@ -184,7 +176,7 @@ config CRYPTO_SERPENT_AVX_X86_64
config CRYPTO_SERPENT_AVX2_X86_64
tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SERPENT_AVX_X86_64
help
Length-preserving ciphers: Serpent cipher algorithm
@@ -197,9 +189,8 @@ config CRYPTO_SERPENT_AVX2_X86_64
config CRYPTO_SM4_AESNI_AVX_X86_64
tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
select CRYPTO_ALGAPI
select CRYPTO_SM4
help
@@ -218,9 +209,8 @@ config CRYPTO_SM4_AESNI_AVX_X86_64
config CRYPTO_SM4_AESNI_AVX2_X86_64
tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
select CRYPTO_ALGAPI
select CRYPTO_SM4
select CRYPTO_SM4_AESNI_AVX_X86_64
@@ -240,7 +230,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64
config CRYPTO_TWOFISH_586
tristate "Ciphers: Twofish (32-bit)"
- depends on (X86 || UML_X86) && !64BIT
+ depends on !64BIT
select CRYPTO_ALGAPI
select CRYPTO_TWOFISH_COMMON
imply CRYPTO_CTR
@@ -251,7 +241,7 @@ config CRYPTO_TWOFISH_586
config CRYPTO_TWOFISH_X86_64
tristate "Ciphers: Twofish"
- depends on (X86 || UML_X86) && 64BIT
+ depends on 64BIT
select CRYPTO_ALGAPI
select CRYPTO_TWOFISH_COMMON
imply CRYPTO_CTR
@@ -262,7 +252,7 @@ config CRYPTO_TWOFISH_X86_64
config CRYPTO_TWOFISH_X86_64_3WAY
tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_TWOFISH_COMMON
select CRYPTO_TWOFISH_X86_64
@@ -277,9 +267,8 @@ config CRYPTO_TWOFISH_X86_64_3WAY
config CRYPTO_TWOFISH_AVX_X86_64
tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
select CRYPTO_TWOFISH_COMMON
select CRYPTO_TWOFISH_X86_64
select CRYPTO_TWOFISH_X86_64_3WAY
@@ -295,9 +284,8 @@ config CRYPTO_TWOFISH_AVX_X86_64
config CRYPTO_ARIA_AESNI_AVX_X86_64
tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
select CRYPTO_ALGAPI
select CRYPTO_ARIA
help
@@ -313,9 +301,8 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64
config CRYPTO_ARIA_AESNI_AVX2_X86_64
tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
select CRYPTO_ALGAPI
select CRYPTO_ARIA
select CRYPTO_ARIA_AESNI_AVX_X86_64
@@ -332,9 +319,8 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64
config CRYPTO_ARIA_GFNI_AVX512_X86_64
tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)"
- depends on X86 && 64BIT && AS_AVX512 && AS_GFNI
+ depends on 64BIT && AS_GFNI
select CRYPTO_SKCIPHER
- select CRYPTO_SIMD
select CRYPTO_ALGAPI
select CRYPTO_ARIA
select CRYPTO_ARIA_AESNI_AVX_X86_64
@@ -349,27 +335,10 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64
Processes 64 blocks in parallel.
-config CRYPTO_CHACHA20_X86_64
- tristate
- depends on X86 && 64BIT
- select CRYPTO_SKCIPHER
- select CRYPTO_LIB_CHACHA_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_CHACHA
- default CRYPTO_LIB_CHACHA_INTERNAL
- help
- Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
- stream cipher algorithms
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX2 (Advanced Vector Extensions 2)
- - AVX-512VL (Advanced Vector Extensions-512VL)
-
config CRYPTO_AEGIS128_AESNI_SSE2
tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_AEAD
- select CRYPTO_SIMD
help
AEGIS-128 AEAD algorithm
@@ -379,7 +348,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2
config CRYPTO_NHPOLY1305_SSE2
tristate "Hash functions: NHPoly1305 (SSE2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_NHPOLY1305
help
NHPoly1305 hash function for Adiantum
@@ -389,7 +358,7 @@ config CRYPTO_NHPOLY1305_SSE2
config CRYPTO_NHPOLY1305_AVX2
tristate "Hash functions: NHPoly1305 (AVX2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_NHPOLY1305
help
NHPoly1305 hash function for Adiantum
@@ -397,21 +366,9 @@ config CRYPTO_NHPOLY1305_AVX2
Architecture: x86_64 using:
- AVX2 (Advanced Vector Extensions 2)
-config CRYPTO_BLAKE2S_X86
- bool "Hash functions: BLAKE2s (SSSE3/AVX-512)"
- depends on X86 && 64BIT
- select CRYPTO_LIB_BLAKE2S_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
- help
- BLAKE2s cryptographic hash function (RFC 7693)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX-512 (Advanced Vector Extensions-512)
-
config CRYPTO_POLYVAL_CLMUL_NI
tristate "Hash functions: POLYVAL (CLMUL-NI)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_POLYVAL
help
POLYVAL hash function for HCTR2
@@ -419,23 +376,9 @@ config CRYPTO_POLYVAL_CLMUL_NI
Architecture: x86_64 using:
- CLMUL-NI (carry-less multiplication new instructions)
-config CRYPTO_POLY1305_X86_64
- tristate
- depends on X86 && 64BIT
- select CRYPTO_HASH
- select CRYPTO_LIB_POLY1305_GENERIC
- select CRYPTO_ARCH_HAVE_LIB_POLY1305
- default CRYPTO_LIB_POLY1305_INTERNAL
- help
- Poly1305 authenticator algorithm (RFC7539)
-
- Architecture: x86_64 using:
- - SSE2 (Streaming SIMD Extensions 2)
- - AVX2 (Advanced Vector Extensions 2)
-
config CRYPTO_SHA1_SSSE3
tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SHA1
select CRYPTO_HASH
help
@@ -447,23 +390,9 @@ config CRYPTO_SHA1_SSSE3
- AVX2 (Advanced Vector Extensions 2)
- SHA-NI (SHA Extensions New Instructions)
-config CRYPTO_SHA256_SSSE3
- tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)"
- depends on X86 && 64BIT
- select CRYPTO_SHA256
- select CRYPTO_HASH
- help
- SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
- Architecture: x86_64 using:
- - SSSE3 (Supplemental SSE3)
- - AVX (Advanced Vector Extensions)
- - AVX2 (Advanced Vector Extensions 2)
- - SHA-NI (SHA Extensions New Instructions)
-
config CRYPTO_SHA512_SSSE3
tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_SHA512
select CRYPTO_HASH
help
@@ -476,9 +405,9 @@ config CRYPTO_SHA512_SSSE3
config CRYPTO_SM3_AVX_X86_64
tristate "Hash functions: SM3 (AVX)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_HASH
- select CRYPTO_SM3
+ select CRYPTO_LIB_SM3
help
SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3
@@ -489,7 +418,7 @@ config CRYPTO_SM3_AVX_X86_64
config CRYPTO_GHASH_CLMUL_NI_INTEL
tristate "Hash functions: GHASH (CLMUL-NI)"
- depends on X86 && 64BIT
+ depends on 64BIT
select CRYPTO_CRYPTD
help
GCM GHASH hash function (NIST SP800-38D)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5d19f41bde58..aa289a9e0153 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -42,10 +42,6 @@ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
-chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o
-chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
-
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \
@@ -56,29 +52,17 @@ aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
endif
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
-sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o
-
-obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
-sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
-sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
+sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
-libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
-
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o
polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o
-obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
-poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
-targets += poly1305-x86_64-cryptogams.S
-
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
@@ -104,10 +88,5 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o
obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o
aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o
-quiet_cmd_perlasm = PERLASM $@
- cmd_perlasm = $(PERL) $< > $@
-$(obj)/%.S: $(src)/%.pl FORCE
- $(call if_changed,perlasm)
-
# Disable GCOV in odd or sensitive code
GCOV_PROFILE_curve25519-x86_64.o := n
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 26786e15abac..f1b6d40154e3 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -8,7 +8,6 @@
*/
#include <crypto/internal/aead.h>
-#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/module.h>
@@ -233,21 +232,18 @@ static struct aead_alg crypto_aegis128_aesni_alg = {
.chunksize = AEGIS128_BLOCK_SIZE,
.base = {
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aegis_ctx) +
__alignof__(struct aegis_ctx),
.cra_priority = 400,
- .cra_name = "__aegis128",
- .cra_driver_name = "__aegis128-aesni",
+ .cra_name = "aegis128",
+ .cra_driver_name = "aegis128-aesni",
.cra_module = THIS_MODULE,
}
};
-static struct simd_aead_alg *simd_alg;
-
static int __init crypto_aegis128_aesni_module_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
@@ -255,13 +251,12 @@ static int __init crypto_aegis128_aesni_module_init(void)
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
- return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1,
- &simd_alg);
+ return crypto_register_aead(&crypto_aegis128_aesni_alg);
}
static void __exit crypto_aegis128_aesni_module_exit(void)
{
- simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg);
+ crypto_unregister_aead(&crypto_aegis128_aesni_alg);
}
module_init(crypto_aegis128_aesni_module_init);
diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S
index 1685d8b24b2c..bbbfd80f5a50 100644
--- a/arch/x86/crypto/aes-ctr-avx-x86_64.S
+++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S
@@ -48,8 +48,7 @@
// using the following sets of CPU features:
// - AES-NI && AVX
// - VAES && AVX2
-// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2
-// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2
+// - VAES && AVX512BW && AVX512VL && BMI2
//
// See the function definitions at the bottom of the file for more information.
@@ -76,7 +75,6 @@
.text
// Move a vector between memory and a register.
-// The register operand must be in the first 16 vector registers.
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
@@ -86,7 +84,6 @@
.endm
// Move a vector between registers.
-// The registers must be in the first 16 vector registers.
.macro _vmovdqa src, dst
.if VL < 64
vmovdqa \src, \dst
@@ -96,7 +93,7 @@
.endm
// Broadcast a 128-bit value from memory to all 128-bit lanes of a vector
-// register. The register operand must be in the first 16 vector registers.
+// register.
.macro _vbroadcast128 src, dst
.if VL == 16
vmovdqu \src, \dst
@@ -108,7 +105,6 @@
.endm
// XOR two vectors together.
-// Any register operands must be in the first 16 vector registers.
.macro _vpxor src1, src2, dst
.if VL < 64
vpxor \src1, \src2, \dst
@@ -199,8 +195,8 @@
// XOR each with the zero-th round key. Also update LE_CTR if !\final.
.macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0
.if \is_xctr
- .if USE_AVX10
- _vmovdqa LE_CTR, AESDATA\i0
+ .if USE_AVX512
+ vmovdqa64 LE_CTR, AESDATA\i0
vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0
.else
vpxor XCTR_IV, LE_CTR, AESDATA\i0
@@ -208,7 +204,7 @@
.endif
vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1
- .if USE_AVX10
+ .if USE_AVX512
vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1
.else
vpxor XCTR_IV, AESDATA\i1, AESDATA\i1
@@ -481,18 +477,12 @@
.Lxor_tail_partial_vec_0\@:
// XOR the remaining 1 <= LEN < VL bytes. It's easy if masked
// loads/stores are available; otherwise it's a bit harder...
-.if USE_AVX10
- .if VL <= 32
- mov $-1, %eax
- bzhi LEN, %eax, %eax
- kmovd %eax, %k1
- .else
+.if USE_AVX512
mov $-1, %rax
bzhi LEN64, %rax, %rax
kmovq %rax, %k1
- .endif
vmovdqu8 (SRC), AESDATA1{%k1}{z}
- _vpxor AESDATA1, AESDATA0, AESDATA0
+ vpxord AESDATA1, AESDATA0, AESDATA0
vmovdqu8 AESDATA0, (DST){%k1}
.else
.if VL == 32
@@ -554,7 +544,7 @@
// eliminates carries. |ctr| is the per-message block counter starting at 1.
.set VL, 16
-.set USE_AVX10, 0
+.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_aesni_avx)
@@ -564,7 +554,7 @@ SYM_FUNC_END(aes_xctr_crypt_aesni_avx)
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set VL, 32
-.set USE_AVX10, 0
+.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2)
_aes_ctr_crypt 0
SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2)
@@ -572,21 +562,12 @@ SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2)
_aes_ctr_crypt 1
SYM_FUNC_END(aes_xctr_crypt_vaes_avx2)
-.set VL, 32
-.set USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256)
- _aes_ctr_crypt 0
-SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256)
-SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256)
- _aes_ctr_crypt 1
-SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256)
-
.set VL, 64
-.set USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512)
+.set USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512)
_aes_ctr_crypt 0
-SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512)
-SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512)
+SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512)
_aes_ctr_crypt 1
-SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xctr_crypt_vaes_avx512)
#endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ
diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
index 93ba0ddbe009..db79cdf81588 100644
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -52,32 +52,25 @@
* different code, it uses a macro to generate several implementations that
* share similar source code but are targeted at different CPUs, listed below:
*
- * AES-NI + AVX
+ * AES-NI && AVX
* - 128-bit vectors (1 AES block per vector)
* - VEX-coded instructions
* - xmm0-xmm15
* - This is for older CPUs that lack VAES but do have AVX.
*
- * VAES + VPCLMULQDQ + AVX2
+ * VAES && VPCLMULQDQ && AVX2
* - 256-bit vectors (2 AES blocks per vector)
* - VEX-coded instructions
* - ymm0-ymm15
- * - This is for CPUs that have VAES but lack AVX512 or AVX10,
- * e.g. Intel's Alder Lake and AMD's Zen 3.
+ * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's
+ * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm
+ * registers (e.g. Intel's Ice Lake).
*
- * VAES + VPCLMULQDQ + AVX10/256 + BMI2
- * - 256-bit vectors (2 AES blocks per vector)
+ * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2
+ * - 512-bit vectors (4 AES blocks per vector)
* - EVEX-coded instructions
- * - ymm0-ymm31
- * - This is for CPUs that have AVX512 but where using zmm registers causes
- * downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
- * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
- * To avoid confusion with 512-bit, we just write AVX10/256.
- *
- * VAES + VPCLMULQDQ + AVX10/512 + BMI2
- * - Same as the previous one, but upgrades to 512-bit vectors
- * (4 AES blocks per vector) in zmm0-zmm31.
- * - This is for CPUs that have good AVX512 or AVX10/512 support.
+ * - zmm0-zmm31
+ * - This is for CPUs that have good AVX512 support.
*
* This file doesn't have an implementation for AES-NI alone (without AVX), as
* the lack of VEX would make all the assembly code different.
@@ -107,9 +100,20 @@
// exists when there's a carry out of the low 64 bits of the tweak.
.quad 0x87, 1
+ // These are the shift amounts that are needed when multiplying by [x^0,
+ // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64.
+ //
+ // The right shifts by 64 are expected to zeroize the destination.
+ // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the
+ // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would.
+.Lrshift_amounts:
+ .byte 64, 64, 63, 63, 62, 62, 61, 61
+.Llshift_amounts:
+ .byte 0, 0, 1, 1, 2, 2, 3, 3
+
// This table contains constants for vpshufb and vpblendvb, used to
// handle variable byte shifts and blending during ciphertext stealing
- // on CPUs that don't support AVX10-style masking.
+ // on CPUs that don't support AVX512-style masking.
.Lcts_permute_table:
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
@@ -138,7 +142,7 @@
.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
_define_Vi \i
.endr
-.if USE_AVX10
+.if USE_AVX512
.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
_define_Vi \i
.endr
@@ -193,7 +197,7 @@
// keys to the *end* of this register range. I.e., AES-128 uses
// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
// (All also use KEY0 for the XOR-only "round" at the beginning.)
-.if USE_AVX10
+.if USE_AVX512
.set KEY1_XMM, %xmm16
.set KEY1, V16
.set KEY2_XMM, %xmm17
@@ -227,7 +231,6 @@
.endm
// Move a vector between memory and a register.
-// The register operand must be in the first 16 vector registers.
.macro _vmovdqu src, dst
.if VL < 64
vmovdqu \src, \dst
@@ -238,9 +241,9 @@
// Broadcast a 128-bit value into a vector.
.macro _vbroadcast128 src, dst
-.if VL == 16 && !USE_AVX10
+.if VL == 16
vmovdqu \src, \dst
-.elseif VL == 32 && !USE_AVX10
+.elseif VL == 32
vbroadcasti128 \src, \dst
.else
vbroadcasti32x4 \src, \dst
@@ -248,7 +251,6 @@
.endm
// XOR two vectors together.
-// Any register operands must be in the first 16 vector registers.
.macro _vpxor src1, src2, dst
.if VL < 64
vpxor \src1, \src2, \dst
@@ -259,7 +261,7 @@
// XOR three vectors together.
.macro _xor3 src1, src2, src3_and_dst
-.if USE_AVX10
+.if USE_AVX512
// vpternlogd with immediate 0x96 is a three-argument XOR.
vpternlogd $0x96, \src1, \src2, \src3_and_dst
.else
@@ -274,7 +276,7 @@
vpshufd $0x13, \src, \tmp
vpaddq \src, \src, \dst
vpsrad $31, \tmp, \tmp
-.if USE_AVX10
+.if USE_AVX512
vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst
.else
vpand GF_POLY_XMM, \tmp, \tmp
@@ -303,52 +305,75 @@
// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
.macro _compute_first_set_of_tweaks
- vmovdqu (TWEAK), TWEAK0_XMM
- _vbroadcast128 .Lgf_poly(%rip), GF_POLY
.if VL == 16
- // With VL=16, multiplying by x serially is fastest.
+ vmovdqu (TWEAK), TWEAK0_XMM
+ vmovdqu .Lgf_poly(%rip), GF_POLY
_next_tweak TWEAK0, %xmm0, TWEAK1
_next_tweak TWEAK1, %xmm0, TWEAK2
_next_tweak TWEAK2, %xmm0, TWEAK3
-.else
-.if VL == 32
- // Compute the second block of TWEAK0.
+.elseif VL == 32
+ vmovdqu (TWEAK), TWEAK0_XMM
+ vbroadcasti128 .Lgf_poly(%rip), GF_POLY
+
+ // Compute the first vector of tweaks.
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
vinserti128 $1, %xmm1, TWEAK0, TWEAK0
-.elseif VL == 64
- // Compute the remaining blocks of TWEAK0.
- _next_tweak TWEAK0_XMM, %xmm0, %xmm1
- _next_tweak %xmm1, %xmm0, %xmm2
- _next_tweak %xmm2, %xmm0, %xmm3
- vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
- vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
- vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0
-.endif
- // Compute TWEAK[1-3] from TWEAK0.
- vpsrlq $64 - 1*VL/16, TWEAK0, V0
- vpsrlq $64 - 2*VL/16, TWEAK0, V2
- vpsrlq $64 - 3*VL/16, TWEAK0, V4
+
+ // Compute the next three vectors of tweaks:
+ // TWEAK1 = TWEAK0 * [x^2, x^2]
+ // TWEAK2 = TWEAK0 * [x^4, x^4]
+ // TWEAK3 = TWEAK0 * [x^6, x^6]
+ vpsrlq $64 - 2, TWEAK0, V0
+ vpsrlq $64 - 4, TWEAK0, V2
+ vpsrlq $64 - 6, TWEAK0, V4
vpclmulqdq $0x01, GF_POLY, V0, V1
vpclmulqdq $0x01, GF_POLY, V2, V3
vpclmulqdq $0x01, GF_POLY, V4, V5
vpslldq $8, V0, V0
vpslldq $8, V2, V2
vpslldq $8, V4, V4
- vpsllq $1*VL/16, TWEAK0, TWEAK1
- vpsllq $2*VL/16, TWEAK0, TWEAK2
- vpsllq $3*VL/16, TWEAK0, TWEAK3
-.if USE_AVX10
- vpternlogd $0x96, V0, V1, TWEAK1
- vpternlogd $0x96, V2, V3, TWEAK2
- vpternlogd $0x96, V4, V5, TWEAK3
-.else
+ vpsllq $2, TWEAK0, TWEAK1
+ vpsllq $4, TWEAK0, TWEAK2
+ vpsllq $6, TWEAK0, TWEAK3
vpxor V0, TWEAK1, TWEAK1
vpxor V2, TWEAK2, TWEAK2
vpxor V4, TWEAK3, TWEAK3
vpxor V1, TWEAK1, TWEAK1
vpxor V3, TWEAK2, TWEAK2
vpxor V5, TWEAK3, TWEAK3
-.endif
+.else
+ vbroadcasti32x4 (TWEAK), TWEAK0
+ vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY
+
+ // Compute the first vector of tweaks:
+ // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3]
+ vpmovzxbq .Lrshift_amounts(%rip), V4
+ vpsrlvq V4, TWEAK0, V0
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpmovzxbq .Llshift_amounts(%rip), V4
+ vpslldq $8, V0, V0
+ vpsllvq V4, TWEAK0, TWEAK0
+ vpternlogd $0x96, V0, V1, TWEAK0
+
+ // Compute the next three vectors of tweaks:
+ // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4]
+ // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8]
+ // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12]
+ // x^8 only needs byte-aligned shifts, so optimize accordingly.
+ vpsrlq $64 - 4, TWEAK0, V0
+ vpsrldq $(64 - 8) / 8, TWEAK0, V2
+ vpsrlq $64 - 12, TWEAK0, V4
+ vpclmulqdq $0x01, GF_POLY, V0, V1
+ vpclmulqdq $0x01, GF_POLY, V2, V3
+ vpclmulqdq $0x01, GF_POLY, V4, V5
+ vpslldq $8, V0, V0
+ vpslldq $8, V4, V4
+ vpsllq $4, TWEAK0, TWEAK1
+ vpslldq $8 / 8, TWEAK0, TWEAK2
+ vpsllq $12, TWEAK0, TWEAK3
+ vpternlogd $0x96, V0, V1, TWEAK1
+ vpxord V3, TWEAK2, TWEAK2
+ vpternlogd $0x96, V4, V5, TWEAK3
.endif
.endm
@@ -474,26 +499,26 @@
lea OFFS-16(KEY, KEYLEN64, 4), KEY
// If all 32 SIMD registers are available, cache all the round keys.
-.if USE_AVX10
+.if USE_AVX512
cmp $24, KEYLEN
jl .Laes128\@
je .Laes192\@
- _vbroadcast128 -6*16(KEY), KEY1
- _vbroadcast128 -5*16(KEY), KEY2
+ vbroadcasti32x4 -6*16(KEY), KEY1
+ vbroadcasti32x4 -5*16(KEY), KEY2
.Laes192\@:
- _vbroadcast128 -4*16(KEY), KEY3
- _vbroadcast128 -3*16(KEY), KEY4
+ vbroadcasti32x4 -4*16(KEY), KEY3
+ vbroadcasti32x4 -3*16(KEY), KEY4
.Laes128\@:
- _vbroadcast128 -2*16(KEY), KEY5
- _vbroadcast128 -1*16(KEY), KEY6
- _vbroadcast128 0*16(KEY), KEY7
- _vbroadcast128 1*16(KEY), KEY8
- _vbroadcast128 2*16(KEY), KEY9
- _vbroadcast128 3*16(KEY), KEY10
- _vbroadcast128 4*16(KEY), KEY11
- _vbroadcast128 5*16(KEY), KEY12
- _vbroadcast128 6*16(KEY), KEY13
- _vbroadcast128 7*16(KEY), KEY14
+ vbroadcasti32x4 -2*16(KEY), KEY5
+ vbroadcasti32x4 -1*16(KEY), KEY6
+ vbroadcasti32x4 0*16(KEY), KEY7
+ vbroadcasti32x4 1*16(KEY), KEY8
+ vbroadcasti32x4 2*16(KEY), KEY9
+ vbroadcasti32x4 3*16(KEY), KEY10
+ vbroadcasti32x4 4*16(KEY), KEY11
+ vbroadcasti32x4 5*16(KEY), KEY12
+ vbroadcasti32x4 6*16(KEY), KEY13
+ vbroadcasti32x4 7*16(KEY), KEY14
.endif
.endm
@@ -521,7 +546,7 @@
// using the same key for all block(s). The round key is loaded from the
// appropriate register or memory location for round \i. May clobber \tmp.
.macro _vaes_1x enc, i, xmm_suffix, data, tmp
-.if USE_AVX10
+.if USE_AVX512
_vaes \enc, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
@@ -538,7 +563,7 @@
// appropriate register or memory location for round \i. In addition, does two
// steps of the computation of the next set of tweaks. May clobber V4 and V5.
.macro _vaes_4x enc, i
-.if USE_AVX10
+.if USE_AVX512
_tweak_step (2*(\i-5))
_vaes \enc, KEY\i, V0
_vaes \enc, KEY\i, V1
@@ -574,7 +599,7 @@
.irp i, 5,6,7,8,9,10,11,12,13
_vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp
.endr
-.if USE_AVX10
+.if USE_AVX512
vpxord KEY14\xmm_suffix, \tweak, \tmp
.else
.ifnb \xmm_suffix
@@ -617,11 +642,11 @@
// This is the main loop, en/decrypting 4*VL bytes per iteration.
// XOR each source block with its tweak and the zero-th round key.
-.if USE_AVX10
- _vmovdqu 0*VL(SRC), V0
- _vmovdqu 1*VL(SRC), V1
- _vmovdqu 2*VL(SRC), V2
- _vmovdqu 3*VL(SRC), V3
+.if USE_AVX512
+ vmovdqu8 0*VL(SRC), V0
+ vmovdqu8 1*VL(SRC), V1
+ vmovdqu8 2*VL(SRC), V2
+ vmovdqu8 3*VL(SRC), V3
vpternlogd $0x96, TWEAK0, KEY0, V0
vpternlogd $0x96, TWEAK1, KEY0, V1
vpternlogd $0x96, TWEAK2, KEY0, V2
@@ -654,7 +679,7 @@
// Reduce latency by doing the XOR before the vaesenclast, utilizing the
// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
// (and likewise for vaesdeclast).
-.if USE_AVX10
+.if USE_AVX512
_tweak_step 18
_tweak_step 19
vpxord TWEAK0, KEY14, V4
@@ -762,7 +787,7 @@
_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
.endif
-.if USE_AVX10
+.if USE_AVX512
// Create a mask that has the first LEN bits set.
mov $-1, %r9d
bzhi LEN, %r9d, %r9d
@@ -811,7 +836,7 @@
// u8 iv[AES_BLOCK_SIZE]);
//
// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes
-// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10.
+// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512.
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
.set TWEAK_KEY, %rdi
.set IV, %rsi
@@ -853,7 +878,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
// multiple of 16, then this function updates |tweak| to contain the next tweak.
.set VL, 16
-.set USE_AVX10, 0
+.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
@@ -863,7 +888,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set VL, 32
-.set USE_AVX10, 0
+.set USE_AVX512, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
_aes_xts_crypt 1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
@@ -871,21 +896,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
_aes_xts_crypt 0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
-.set VL, 32
-.set USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
- _aes_xts_crypt 1
-SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
-SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
- _aes_xts_crypt 0
-SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
-
.set VL, 64
-.set USE_AVX10, 1
-SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+.set USE_AVX512, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512)
_aes_xts_crypt 1
-SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
-SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512)
_aes_xts_crypt 0
-SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index bc655d794a95..061b1ced93c5 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -566,10 +566,9 @@ static struct crypto_alg aesni_cipher_alg = {
static struct skcipher_alg aesni_skciphers[] = {
{
.base = {
- .cra_name = "__ecb(aes)",
- .cra_driver_name = "__ecb-aes-aesni",
+ .cra_name = "ecb(aes)",
+ .cra_driver_name = "ecb-aes-aesni",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
@@ -581,10 +580,9 @@ static struct skcipher_alg aesni_skciphers[] = {
.decrypt = ecb_decrypt,
}, {
.base = {
- .cra_name = "__cbc(aes)",
- .cra_driver_name = "__cbc-aes-aesni",
+ .cra_name = "cbc(aes)",
+ .cra_driver_name = "cbc-aes-aesni",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
@@ -597,10 +595,9 @@ static struct skcipher_alg aesni_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__cts(cbc(aes))",
- .cra_driver_name = "__cts-cbc-aes-aesni",
+ .cra_name = "cts(cbc(aes))",
+ .cra_driver_name = "cts-cbc-aes-aesni",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
@@ -615,10 +612,9 @@ static struct skcipher_alg aesni_skciphers[] = {
#ifdef CONFIG_X86_64
}, {
.base = {
- .cra_name = "__ctr(aes)",
- .cra_driver_name = "__ctr-aes-aesni",
+ .cra_name = "ctr(aes)",
+ .cra_driver_name = "ctr-aes-aesni",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
@@ -633,10 +629,9 @@ static struct skcipher_alg aesni_skciphers[] = {
#endif
}, {
.base = {
- .cra_name = "__xts(aes)",
- .cra_driver_name = "__xts-aes-aesni",
+ .cra_name = "xts(aes)",
+ .cra_driver_name = "xts-aes-aesni",
.cra_priority = 401,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = XTS_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
@@ -651,9 +646,6 @@ static struct skcipher_alg aesni_skciphers[] = {
}
};
-static
-struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
-
#ifdef CONFIG_X86_64
asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
u8 iv[AES_BLOCK_SIZE]);
@@ -792,10 +784,9 @@ static int xctr_crypt_##suffix(struct skcipher_request *req) \
} \
\
static struct skcipher_alg skcipher_algs_##suffix[] = {{ \
- .base.cra_name = "__xts(aes)", \
- .base.cra_driver_name = "__xts-aes-" driver_name_suffix, \
+ .base.cra_name = "xts(aes)", \
+ .base.cra_driver_name = "xts-aes-" driver_name_suffix, \
.base.cra_priority = priority, \
- .base.cra_flags = CRYPTO_ALG_INTERNAL, \
.base.cra_blocksize = AES_BLOCK_SIZE, \
.base.cra_ctxsize = XTS_AES_CTX_SIZE, \
.base.cra_module = THIS_MODULE, \
@@ -807,10 +798,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \
.encrypt = xts_encrypt_##suffix, \
.decrypt = xts_decrypt_##suffix, \
}, { \
- .base.cra_name = "__ctr(aes)", \
- .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \
+ .base.cra_name = "ctr(aes)", \
+ .base.cra_driver_name = "ctr-aes-" driver_name_suffix, \
.base.cra_priority = priority, \
- .base.cra_flags = CRYPTO_ALG_INTERNAL, \
.base.cra_blocksize = 1, \
.base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \
.base.cra_module = THIS_MODULE, \
@@ -822,10 +812,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \
.encrypt = ctr_crypt_##suffix, \
.decrypt = ctr_crypt_##suffix, \
}, { \
- .base.cra_name = "__xctr(aes)", \
- .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \
+ .base.cra_name = "xctr(aes)", \
+ .base.cra_driver_name = "xctr-aes-" driver_name_suffix, \
.base.cra_priority = priority, \
- .base.cra_flags = CRYPTO_ALG_INTERNAL, \
.base.cra_blocksize = 1, \
.base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \
.base.cra_module = THIS_MODULE, \
@@ -836,16 +825,12 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \
.setkey = aesni_skcipher_setkey, \
.encrypt = xctr_crypt_##suffix, \
.decrypt = xctr_crypt_##suffix, \
-}}; \
- \
-static struct simd_skcipher_alg * \
-simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)]
+}}
DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500);
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600);
-DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700);
-DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800);
+DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800);
#endif
/* The common part of the x86_64 AES-GCM key struct */
@@ -1499,10 +1484,9 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \
.chunksize = AES_BLOCK_SIZE, \
.maxauthsize = 16, \
.base = { \
- .cra_name = "__gcm(aes)", \
- .cra_driver_name = "__" generic_driver_name, \
+ .cra_name = "gcm(aes)", \
+ .cra_driver_name = generic_driver_name, \
.cra_priority = (priority), \
- .cra_flags = CRYPTO_ALG_INTERNAL, \
.cra_blocksize = 1, \
.cra_ctxsize = (ctxsize), \
.cra_module = THIS_MODULE, \
@@ -1516,17 +1500,14 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \
.chunksize = AES_BLOCK_SIZE, \
.maxauthsize = 16, \
.base = { \
- .cra_name = "__rfc4106(gcm(aes))", \
- .cra_driver_name = "__" rfc_driver_name, \
+ .cra_name = "rfc4106(gcm(aes))", \
+ .cra_driver_name = rfc_driver_name, \
.cra_priority = (priority), \
- .cra_flags = CRYPTO_ALG_INTERNAL, \
.cra_blocksize = 1, \
.cra_ctxsize = (ctxsize), \
.cra_module = THIS_MODULE, \
}, \
-} }; \
- \
-static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \
+} }
/* aes_gcm_algs_aesni */
DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
@@ -1556,14 +1537,12 @@ static int __init register_avx_algs(void)
if (!boot_cpu_has(X86_FEATURE_AVX))
return 0;
- err = simd_register_skciphers_compat(skcipher_algs_aesni_avx,
- ARRAY_SIZE(skcipher_algs_aesni_avx),
- simd_skcipher_algs_aesni_avx);
+ err = crypto_register_skciphers(skcipher_algs_aesni_avx,
+ ARRAY_SIZE(skcipher_algs_aesni_avx));
if (err)
return err;
- err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx,
- ARRAY_SIZE(aes_gcm_algs_aesni_avx),
- aes_gcm_simdalgs_aesni_avx);
+ err = crypto_register_aeads(aes_gcm_algs_aesni_avx,
+ ARRAY_SIZE(aes_gcm_algs_aesni_avx));
if (err)
return err;
/*
@@ -1579,9 +1558,8 @@ static int __init register_avx_algs(void)
!boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
return 0;
- err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2,
- ARRAY_SIZE(skcipher_algs_vaes_avx2),
- simd_skcipher_algs_vaes_avx2);
+ err = crypto_register_skciphers(skcipher_algs_vaes_avx2,
+ ARRAY_SIZE(skcipher_algs_vaes_avx2));
if (err)
return err;
@@ -1592,76 +1570,52 @@ static int __init register_avx_algs(void)
XFEATURE_MASK_AVX512, NULL))
return 0;
- err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256,
- ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
- simd_skcipher_algs_vaes_avx10_256);
- if (err)
- return err;
- err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
- aes_gcm_simdalgs_vaes_avx10_256);
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256));
if (err)
return err;
if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
int i;
- for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++)
- skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1;
+ for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
+ skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
}
- err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512,
- ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
- simd_skcipher_algs_vaes_avx10_512);
+ err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
+ ARRAY_SIZE(skcipher_algs_vaes_avx512));
if (err)
return err;
- err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
- aes_gcm_simdalgs_vaes_avx10_512);
+ err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
+ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
if (err)
return err;
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
return 0;
}
+#define unregister_skciphers(A) \
+ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \
+ crypto_unregister_skciphers((A), ARRAY_SIZE(A))
+#define unregister_aeads(A) \
+ if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \
+ crypto_unregister_aeads((A), ARRAY_SIZE(A))
+
static void unregister_avx_algs(void)
{
- if (simd_skcipher_algs_aesni_avx[0])
- simd_unregister_skciphers(skcipher_algs_aesni_avx,
- ARRAY_SIZE(skcipher_algs_aesni_avx),
- simd_skcipher_algs_aesni_avx);
- if (aes_gcm_simdalgs_aesni_avx[0])
- simd_unregister_aeads(aes_gcm_algs_aesni_avx,
- ARRAY_SIZE(aes_gcm_algs_aesni_avx),
- aes_gcm_simdalgs_aesni_avx);
+ unregister_skciphers(skcipher_algs_aesni_avx);
+ unregister_aeads(aes_gcm_algs_aesni_avx);
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
- if (simd_skcipher_algs_vaes_avx2[0])
- simd_unregister_skciphers(skcipher_algs_vaes_avx2,
- ARRAY_SIZE(skcipher_algs_vaes_avx2),
- simd_skcipher_algs_vaes_avx2);
- if (simd_skcipher_algs_vaes_avx10_256[0])
- simd_unregister_skciphers(skcipher_algs_vaes_avx10_256,
- ARRAY_SIZE(skcipher_algs_vaes_avx10_256),
- simd_skcipher_algs_vaes_avx10_256);
- if (aes_gcm_simdalgs_vaes_avx10_256[0])
- simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
- aes_gcm_simdalgs_vaes_avx10_256);
- if (simd_skcipher_algs_vaes_avx10_512[0])
- simd_unregister_skciphers(skcipher_algs_vaes_avx10_512,
- ARRAY_SIZE(skcipher_algs_vaes_avx10_512),
- simd_skcipher_algs_vaes_avx10_512);
- if (aes_gcm_simdalgs_vaes_avx10_512[0])
- simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
- aes_gcm_simdalgs_vaes_avx10_512);
+ unregister_skciphers(skcipher_algs_vaes_avx2);
+ unregister_skciphers(skcipher_algs_vaes_avx512);
+ unregister_aeads(aes_gcm_algs_vaes_avx10_256);
+ unregister_aeads(aes_gcm_algs_vaes_avx10_512);
#endif
}
#else /* CONFIG_X86_64 */
static struct aead_alg aes_gcm_algs_aesni[0];
-static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0];
static int __init register_avx_algs(void)
{
@@ -1690,15 +1644,13 @@ static int __init aesni_init(void)
if (err)
return err;
- err = simd_register_skciphers_compat(aesni_skciphers,
- ARRAY_SIZE(aesni_skciphers),
- aesni_simd_skciphers);
+ err = crypto_register_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
if (err)
goto unregister_cipher;
- err = simd_register_aeads_compat(aes_gcm_algs_aesni,
- ARRAY_SIZE(aes_gcm_algs_aesni),
- aes_gcm_simdalgs_aesni);
+ err = crypto_register_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni));
if (err)
goto unregister_skciphers;
@@ -1710,12 +1662,11 @@ static int __init aesni_init(void)
unregister_avx:
unregister_avx_algs();
- simd_unregister_aeads(aes_gcm_algs_aesni,
- ARRAY_SIZE(aes_gcm_algs_aesni),
- aes_gcm_simdalgs_aesni);
+ crypto_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni));
unregister_skciphers:
- simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
- aesni_simd_skciphers);
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
unregister_cipher:
crypto_unregister_alg(&aesni_cipher_alg);
return err;
@@ -1723,11 +1674,10 @@ unregister_cipher:
static void __exit aesni_exit(void)
{
- simd_unregister_aeads(aes_gcm_algs_aesni,
- ARRAY_SIZE(aes_gcm_algs_aesni),
- aes_gcm_simdalgs_aesni);
- simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
- aesni_simd_skciphers);
+ crypto_unregister_aeads(aes_gcm_algs_aesni,
+ ARRAY_SIZE(aes_gcm_algs_aesni));
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
crypto_unregister_alg(&aesni_cipher_alg);
unregister_avx_algs();
}
diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c
index 87a11804fc77..b4bddcd58457 100644
--- a/arch/x86/crypto/aria_aesni_avx2_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx2_glue.c
@@ -6,7 +6,6 @@
*/
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
@@ -165,10 +164,9 @@ static int aria_avx2_init_tfm(struct crypto_skcipher *tfm)
static struct skcipher_alg aria_algs[] = {
{
- .base.cra_name = "__ecb(aria)",
- .base.cra_driver_name = "__ecb-aria-avx2",
+ .base.cra_name = "ecb(aria)",
+ .base.cra_driver_name = "ecb-aria-avx2",
.base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = ARIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aria_ctx),
.base.cra_module = THIS_MODULE,
@@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = {
.encrypt = aria_avx2_ecb_encrypt,
.decrypt = aria_avx2_ecb_decrypt,
}, {
- .base.cra_name = "__ctr(aria)",
- .base.cra_driver_name = "__ctr-aria-avx2",
+ .base.cra_name = "ctr(aria)",
+ .base.cra_driver_name = "ctr-aria-avx2",
.base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL |
- CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct aria_ctx),
.base.cra_module = THIS_MODULE,
@@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = {
}
};
-static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
-
static int __init aria_avx2_init(void)
{
const char *feature_name;
@@ -233,15 +228,12 @@ static int __init aria_avx2_init(void)
aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way;
}
- return simd_register_skciphers_compat(aria_algs,
- ARRAY_SIZE(aria_algs),
- aria_simd_algs);
+ return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
}
static void __exit aria_avx2_exit(void)
{
- simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
- aria_simd_algs);
+ crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
}
module_init(aria_avx2_init);
diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c
index 4e1516b76669..ab9b38d05332 100644
--- a/arch/x86/crypto/aria_aesni_avx_glue.c
+++ b/arch/x86/crypto/aria_aesni_avx_glue.c
@@ -6,7 +6,6 @@
*/
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
@@ -152,10 +151,9 @@ static int aria_avx_init_tfm(struct crypto_skcipher *tfm)
static struct skcipher_alg aria_algs[] = {
{
- .base.cra_name = "__ecb(aria)",
- .base.cra_driver_name = "__ecb-aria-avx",
+ .base.cra_name = "ecb(aria)",
+ .base.cra_driver_name = "ecb-aria-avx",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = ARIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aria_ctx),
.base.cra_module = THIS_MODULE,
@@ -165,10 +163,9 @@ static struct skcipher_alg aria_algs[] = {
.encrypt = aria_avx_ecb_encrypt,
.decrypt = aria_avx_ecb_decrypt,
}, {
- .base.cra_name = "__ctr(aria)",
- .base.cra_driver_name = "__ctr-aria-avx",
+ .base.cra_name = "ctr(aria)",
+ .base.cra_driver_name = "ctr-aria-avx",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct aria_ctx),
.base.cra_module = THIS_MODULE,
@@ -184,8 +181,6 @@ static struct skcipher_alg aria_algs[] = {
}
};
-static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
-
static int __init aria_avx_init(void)
{
const char *feature_name;
@@ -213,15 +208,12 @@ static int __init aria_avx_init(void)
aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way;
}
- return simd_register_skciphers_compat(aria_algs,
- ARRAY_SIZE(aria_algs),
- aria_simd_algs);
+ return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
}
static void __exit aria_avx_exit(void)
{
- simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
- aria_simd_algs);
+ crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
}
module_init(aria_avx_init);
diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c
index f4a2208d2638..363cbf4399cc 100644
--- a/arch/x86/crypto/aria_gfni_avx512_glue.c
+++ b/arch/x86/crypto/aria_gfni_avx512_glue.c
@@ -6,7 +6,6 @@
*/
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <crypto/aria.h>
#include <linux/crypto.h>
#include <linux/err.h>
@@ -165,10 +164,9 @@ static int aria_avx512_init_tfm(struct crypto_skcipher *tfm)
static struct skcipher_alg aria_algs[] = {
{
- .base.cra_name = "__ecb(aria)",
- .base.cra_driver_name = "__ecb-aria-avx512",
+ .base.cra_name = "ecb(aria)",
+ .base.cra_driver_name = "ecb-aria-avx512",
.base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = ARIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct aria_ctx),
.base.cra_module = THIS_MODULE,
@@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = {
.encrypt = aria_avx512_ecb_encrypt,
.decrypt = aria_avx512_ecb_decrypt,
}, {
- .base.cra_name = "__ctr(aria)",
- .base.cra_driver_name = "__ctr-aria-avx512",
+ .base.cra_name = "ctr(aria)",
+ .base.cra_driver_name = "ctr-aria-avx512",
.base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL |
- CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
+ .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct aria_ctx),
.base.cra_module = THIS_MODULE,
@@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = {
}
};
-static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)];
-
static int __init aria_avx512_init(void)
{
const char *feature_name;
@@ -229,15 +224,12 @@ static int __init aria_avx512_init(void)
aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way;
aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way;
- return simd_register_skciphers_compat(aria_algs,
- ARRAY_SIZE(aria_algs),
- aria_simd_algs);
+ return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
}
static void __exit aria_avx512_exit(void)
{
- simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs),
- aria_simd_algs);
+ crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs));
}
module_init(aria_avx512_init);
diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S
deleted file mode 100644
index b50b35ff1fdb..000000000000
--- a/arch/x86/crypto/blake2s-core.S
+++ /dev/null
@@ -1,256 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 OR MIT */
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
-.align 32
-IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
- .octa 0x5BE0CD191F83D9AB9B05688C510E527F
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
-.section .rodata.cst16.ROR328, "aM", @progbits, 16
-.align 16
-ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
-.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
-.align 64
-SIGMA:
-.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
-.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
-.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
-.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
-.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
-.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
-.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
-.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
-.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
-#ifdef CONFIG_AS_AVX512
-.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
-.align 64
-SIGMA2:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
-.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
-.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
-.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
-.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
-.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
-.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
-.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
-.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
-.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
-#endif /* CONFIG_AS_AVX512 */
-
-.text
-SYM_FUNC_START(blake2s_compress_ssse3)
- testq %rdx,%rdx
- je .Lendofloop
- movdqu (%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqa ROT16(%rip),%xmm12
- movdqa ROR328(%rip),%xmm13
- movdqu 0x20(%rdi),%xmm14
- movq %rcx,%xmm15
- leaq SIGMA+0xa0(%rip),%r8
- jmp .Lbeginofloop
- .align 32
-.Lbeginofloop:
- movdqa %xmm0,%xmm10
- movdqa %xmm1,%xmm11
- paddq %xmm15,%xmm14
- movdqa IV(%rip),%xmm2
- movdqa %xmm14,%xmm3
- pxor IV+0x10(%rip),%xmm3
- leaq SIGMA(%rip),%rcx
-.Lroundloop:
- movzbl (%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0x1(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x2(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x3(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- punpckldq %xmm5,%xmm4
- punpckldq %xmm7,%xmm6
- punpcklqdq %xmm6,%xmm4
- paddd %xmm4,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0x4(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x5(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x6(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0x7(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- punpckldq %xmm6,%xmm5
- punpckldq %xmm4,%xmm7
- punpcklqdq %xmm7,%xmm5
- paddd %xmm5,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x93,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x39,%xmm2,%xmm2
- movzbl 0x8(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x9(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xa(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xb(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- punpckldq %xmm7,%xmm6
- punpckldq %xmm5,%xmm4
- punpcklqdq %xmm4,%xmm6
- paddd %xmm6,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm12,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
- por %xmm8,%xmm1
- movzbl 0xc(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xd(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xe(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0xf(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- punpckldq %xmm4,%xmm7
- punpckldq %xmm6,%xmm5
- punpcklqdq %xmm5,%xmm7
- paddd %xmm7,%xmm0
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm13,%xmm3
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
- por %xmm8,%xmm1
- pshufd $0x39,%xmm0,%xmm0
- pshufd $0x4e,%xmm3,%xmm3
- pshufd $0x93,%xmm2,%xmm2
- addq $0x10,%rcx
- cmpq %r8,%rcx
- jnz .Lroundloop
- pxor %xmm2,%xmm0
- pxor %xmm3,%xmm1
- pxor %xmm10,%xmm0
- pxor %xmm11,%xmm1
- addq $0x40,%rsi
- decq %rdx
- jnz .Lbeginofloop
- movdqu %xmm0,(%rdi)
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm14,0x20(%rdi)
-.Lendofloop:
- RET
-SYM_FUNC_END(blake2s_compress_ssse3)
-
-#ifdef CONFIG_AS_AVX512
-SYM_FUNC_START(blake2s_compress_avx512)
- vmovdqu (%rdi),%xmm0
- vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm4
- vmovq %rcx,%xmm5
- vmovdqa IV(%rip),%xmm14
- vmovdqa IV+16(%rip),%xmm15
- jmp .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
- vmovdqa %xmm0,%xmm10
- vmovdqa %xmm1,%xmm11
- vpaddq %xmm5,%xmm4,%xmm4
- vmovdqa %xmm14,%xmm2
- vpxor %xmm15,%xmm4,%xmm3
- vmovdqu (%rsi),%ymm6
- vmovdqu 0x20(%rsi),%ymm7
- addq $0x40,%rsi
- leaq SIGMA2(%rip),%rax
- movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
- addq $0x40,%rax
- vmovdqa -0x40(%rax),%ymm8
- vmovdqa -0x20(%rax),%ymm9
- vpermi2d %ymm7,%ymm6,%ymm8
- vpermi2d %ymm7,%ymm6,%ymm9
- vmovdqa %ymm8,%ymm6
- vmovdqa %ymm9,%ymm7
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm8,%xmm8
- vpaddd %xmm8,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x93,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x39,%xmm2,%xmm2
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm9,%xmm9
- vpaddd %xmm9,%xmm0,%xmm0
- vpaddd %xmm1,%xmm0,%xmm0
- vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
- vpaddd %xmm3,%xmm2,%xmm2
- vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
- vpshufd $0x39,%xmm0,%xmm0
- vpshufd $0x4e,%xmm3,%xmm3
- vpshufd $0x93,%xmm2,%xmm2
- decb %cl
- jne .Lblake2s_compress_avx512_roundloop
- vpxor %xmm10,%xmm0,%xmm0
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm2,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- decq %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm0,(%rdi)
- vmovdqu %xmm1,0x10(%rdi)
- vmovdqu %xmm4,0x20(%rdi)
- vzeroupper
- RET
-SYM_FUNC_END(blake2s_compress_avx512)
-#endif /* CONFIG_AS_AVX512 */
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
deleted file mode 100644
index 0313f9673f56..000000000000
--- a/arch/x86/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,74 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/internal/blake2s.h>
-
-#include <linux/types.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/sizes.h>
-
-#include <asm/cpufeature.h>
-#include <asm/fpu/api.h>
-#include <asm/processor.h>
-#include <asm/simd.h>
-
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
-{
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
-
- if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
- return;
- }
-
- do {
- const size_t blocks = min_t(size_t, nblocks,
- SZ_4K / BLAKE2S_BLOCK_SIZE);
-
- kernel_fpu_begin();
- if (IS_ENABLED(CONFIG_AS_AVX512) &&
- static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
- else
- blake2s_compress_ssse3(state, block, blocks, inc);
- kernel_fpu_end();
-
- nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
- } while (nblocks);
-}
-EXPORT_SYMBOL(blake2s_compress);
-
-static int __init blake2s_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- static_branch_enable(&blake2s_use_ssse3);
-
- if (IS_ENABLED(CONFIG_AS_AVX512) &&
- boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX512F) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
- XFEATURE_MASK_AVX512, NULL))
- static_branch_enable(&blake2s_use_avx512);
-
- return 0;
-}
-
-subsys_initcall(blake2s_mod_init);
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index e7e4d64e9577..2d2f4e16537c 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -6,7 +6,6 @@
*/
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg camellia_algs[] = {
{
- .base.cra_name = "__ecb(camellia)",
- .base.cra_driver_name = "__ecb-camellia-aesni-avx2",
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-aesni-avx2",
.base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct camellia_ctx),
.base.cra_module = THIS_MODULE,
@@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(camellia)",
- .base.cra_driver_name = "__cbc-camellia-aesni-avx2",
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-aesni-avx2",
.base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct camellia_ctx),
.base.cra_module = THIS_MODULE,
@@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = {
},
};
-static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
-
static int __init camellia_aesni_init(void)
{
const char *feature_name;
@@ -118,15 +113,13 @@ static int __init camellia_aesni_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(camellia_algs,
- ARRAY_SIZE(camellia_algs),
- camellia_simd_algs);
+ return crypto_register_skciphers(camellia_algs,
+ ARRAY_SIZE(camellia_algs));
}
static void __exit camellia_aesni_fini(void)
{
- simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
- camellia_simd_algs);
+ crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs));
}
module_init(camellia_aesni_init);
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index c7ccf63e741e..a7d162388142 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -6,7 +6,6 @@
*/
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg camellia_algs[] = {
{
- .base.cra_name = "__ecb(camellia)",
- .base.cra_driver_name = "__ecb-camellia-aesni",
+ .base.cra_name = "ecb(camellia)",
+ .base.cra_driver_name = "ecb-camellia-aesni",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct camellia_ctx),
.base.cra_module = THIS_MODULE,
@@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(camellia)",
- .base.cra_driver_name = "__cbc-camellia-aesni",
+ .base.cra_name = "cbc(camellia)",
+ .base.cra_driver_name = "cbc-camellia-aesni",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct camellia_ctx),
.base.cra_module = THIS_MODULE,
@@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = {
}
};
-static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
-
static int __init camellia_aesni_init(void)
{
const char *feature_name;
@@ -117,15 +112,13 @@ static int __init camellia_aesni_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(camellia_algs,
- ARRAY_SIZE(camellia_algs),
- camellia_simd_algs);
+ return crypto_register_skciphers(camellia_algs,
+ ARRAY_SIZE(camellia_algs));
}
static void __exit camellia_aesni_fini(void)
{
- simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
- camellia_simd_algs);
+ crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs));
}
module_init(camellia_aesni_init);
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 3976a87f92ad..3aca04d43b34 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -8,7 +8,6 @@
#include <crypto/algapi.h>
#include <crypto/cast5.h>
-#include <crypto/internal/simd.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/module.h>
@@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg cast5_algs[] = {
{
- .base.cra_name = "__ecb(cast5)",
- .base.cra_driver_name = "__ecb-cast5-avx",
+ .base.cra_name = "ecb(cast5)",
+ .base.cra_driver_name = "ecb-cast5-avx",
.base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAST5_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct cast5_ctx),
.base.cra_module = THIS_MODULE,
@@ -77,10 +75,9 @@ static struct skcipher_alg cast5_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(cast5)",
- .base.cra_driver_name = "__cbc-cast5-avx",
+ .base.cra_name = "cbc(cast5)",
+ .base.cra_driver_name = "cbc-cast5-avx",
.base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAST5_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct cast5_ctx),
.base.cra_module = THIS_MODULE,
@@ -93,8 +90,6 @@ static struct skcipher_alg cast5_algs[] = {
}
};
-static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)];
-
static int __init cast5_init(void)
{
const char *feature_name;
@@ -105,15 +100,13 @@ static int __init cast5_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(cast5_algs,
- ARRAY_SIZE(cast5_algs),
- cast5_simd_algs);
+ return crypto_register_skciphers(cast5_algs,
+ ARRAY_SIZE(cast5_algs));
}
static void __exit cast5_exit(void)
{
- simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs),
- cast5_simd_algs);
+ crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs));
}
module_init(cast5_init);
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 7e2aea372349..c4dd28c30303 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -14,7 +14,6 @@
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/cast6.h>
-#include <crypto/internal/simd.h>
#include "ecb_cbc_helpers.h"
@@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg cast6_algs[] = {
{
- .base.cra_name = "__ecb(cast6)",
- .base.cra_driver_name = "__ecb-cast6-avx",
+ .base.cra_name = "ecb(cast6)",
+ .base.cra_driver_name = "ecb-cast6-avx",
.base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAST6_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct cast6_ctx),
.base.cra_module = THIS_MODULE,
@@ -77,10 +75,9 @@ static struct skcipher_alg cast6_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(cast6)",
- .base.cra_driver_name = "__cbc-cast6-avx",
+ .base.cra_name = "cbc(cast6)",
+ .base.cra_driver_name = "cbc-cast6-avx",
.base.cra_priority = 200,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = CAST6_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct cast6_ctx),
.base.cra_module = THIS_MODULE,
@@ -93,8 +90,6 @@ static struct skcipher_alg cast6_algs[] = {
},
};
-static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)];
-
static int __init cast6_init(void)
{
const char *feature_name;
@@ -105,15 +100,12 @@ static int __init cast6_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(cast6_algs,
- ARRAY_SIZE(cast6_algs),
- cast6_simd_algs);
+ return crypto_register_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs));
}
static void __exit cast6_exit(void)
{
- simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs),
- cast6_simd_algs);
+ crypto_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs));
}
module_init(cast6_init);
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
deleted file mode 100644
index f3d8fc018249..000000000000
--- a/arch/x86/crypto/chacha-avx2-x86_64.S
+++ /dev/null
@@ -1,1021 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
- .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
- .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
- vmovdqa ROT8(%rip),%ymm4
- vmovdqa ROT16(%rip),%ymm5
-
- mov %rcx,%rax
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm5,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm6
- vpslld $12,%ymm6,%ymm6
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm6,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm4,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm7
- vpslld $7,%ymm7,%ymm7
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rax
- jl .Lxorpart2
- vpxor 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rax
- jl .Lxorpart2
- vpxor 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rax
- jl .Lxorpart2
- vpxor 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rax
- jl .Lxorpart2
- vpxor 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rax
- jl .Lxorpart2
- vpxor 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rax
- jl .Lxorpart2
- vpxor 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rax
- jl .Lxorpart2
- vpxor 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rax
- jl .Lxorpart2
- vpxor 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone2
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm7,%xmm7
- vmovdqa %xmm7,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx2)
-
-SYM_FUNC_START(chacha_4block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
- vmovdqa ROT8(%rip),%ymm8
- vmovdqa ROT16(%rip),%ymm9
-
- mov %rcx,%rax
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm9,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm9,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $12,%ymm10,%ymm10
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxor %ymm0,%ymm3,%ymm3
- vpshufb %ymm8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxor %ymm4,%ymm7,%ymm7
- vpshufb %ymm8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxor %ymm2,%ymm1,%ymm1
- vmovdqa %ymm1,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm10,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxor %ymm6,%ymm5,%ymm5
- vmovdqa %ymm5,%ymm10
- vpslld $7,%ymm10,%ymm10
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm10,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rax
- jl .Lxorpart4
- vpxor 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rax
- jl .Lxorpart4
- vpxor 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rax
- jl .Lxorpart4
- vpxor 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rax
- jl .Lxorpart4
- vpxor 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rax
- jl .Lxorpart4
- vpxor 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rax
- jl .Lxorpart4
- vpxor 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rax
- jl .Lxorpart4
- vpxor 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rax
- jl .Lxorpart4
- vpxor 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rax
- jl .Lxorpart4
- vpxor 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rax
- jl .Lxorpart4
- vpxor 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rax
- jl .Lxorpart4
- vpxor 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rax
- jl .Lxorpart4
- vpxor 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rax
- jl .Lxorpart4
- vpxor 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rax
- jl .Lxorpart4
- vpxor 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rax
- jl .Lxorpart4
- vpxor 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rax
- jl .Lxorpart4
- vpxor 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%xmm10,%xmm10
- vmovdqa %xmm10,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx2)
-
-SYM_FUNC_START(chacha_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. As we need some
- # scratch registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32-, 64- and then 128-bit
- # words, which allows us to do XOR in AVX registers. 8/16-bit word
- # rotation is done with the slightly better performing byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- vzeroupper
- # 4 * 32 byte stack, 32-byte aligned
- lea 8(%rsp),%r10
- and $~31, %rsp
- sub $0x80, %rsp
- mov %rcx,%rax
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
- # x0..3 on stack
- vmovdqa %ymm0,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm3,0x60(%rsp)
-
- vmovdqa CTRINC(%rip),%ymm1
- vmovdqa ROT8(%rip),%ymm2
- vmovdqa ROT16(%rip),%ymm3
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpaddd 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpbroadcastd 0x04(%rdi),%ymm0
- vpaddd 0x20(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpbroadcastd 0x08(%rdi),%ymm0
- vpaddd 0x40(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpbroadcastd 0x0c(%rdi),%ymm0
- vpaddd 0x60(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpbroadcastd 0x10(%rdi),%ymm0
- vpaddd %ymm0,%ymm4,%ymm4
- vpbroadcastd 0x14(%rdi),%ymm0
- vpaddd %ymm0,%ymm5,%ymm5
- vpbroadcastd 0x18(%rdi),%ymm0
- vpaddd %ymm0,%ymm6,%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm0
- vpaddd %ymm0,%ymm7,%ymm7
- vpbroadcastd 0x20(%rdi),%ymm0
- vpaddd %ymm0,%ymm8,%ymm8
- vpbroadcastd 0x24(%rdi),%ymm0
- vpaddd %ymm0,%ymm9,%ymm9
- vpbroadcastd 0x28(%rdi),%ymm0
- vpaddd %ymm0,%ymm10,%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm0
- vpaddd %ymm0,%ymm11,%ymm11
- vpbroadcastd 0x30(%rdi),%ymm0
- vpaddd %ymm0,%ymm12,%ymm12
- vpbroadcastd 0x34(%rdi),%ymm0
- vpaddd %ymm0,%ymm13,%ymm13
- vpbroadcastd 0x38(%rdi),%ymm0
- vpaddd %ymm0,%ymm14,%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm0
- vpaddd %ymm0,%ymm15,%ymm15
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- # interleave 32-bit words in state n, n+1
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x20(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm1,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpckldq %ymm5,%ymm0,%ymm4
- vpunpckhdq %ymm5,%ymm0,%ymm5
- vmovdqa %ymm6,%ymm0
- vpunpckldq %ymm7,%ymm0,%ymm6
- vpunpckhdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpckldq %ymm9,%ymm0,%ymm8
- vpunpckhdq %ymm9,%ymm0,%ymm9
- vmovdqa %ymm10,%ymm0
- vpunpckldq %ymm11,%ymm0,%ymm10
- vpunpckhdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpckldq %ymm13,%ymm0,%ymm12
- vpunpckhdq %ymm13,%ymm0,%ymm13
- vmovdqa %ymm14,%ymm0
- vpunpckldq %ymm15,%ymm0,%ymm14
- vpunpckhdq %ymm15,%ymm0,%ymm15
-
- # interleave 64-bit words in state n, n+2
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x40(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpcklqdq %ymm6,%ymm0,%ymm4
- vpunpckhqdq %ymm6,%ymm0,%ymm6
- vmovdqa %ymm5,%ymm0
- vpunpcklqdq %ymm7,%ymm0,%ymm5
- vpunpckhqdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpcklqdq %ymm10,%ymm0,%ymm8
- vpunpckhqdq %ymm10,%ymm0,%ymm10
- vmovdqa %ymm9,%ymm0
- vpunpcklqdq %ymm11,%ymm0,%ymm9
- vpunpckhqdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpcklqdq %ymm14,%ymm0,%ymm12
- vpunpckhqdq %ymm14,%ymm0,%ymm14
- vmovdqa %ymm13,%ymm0
- vpunpcklqdq %ymm15,%ymm0,%ymm13
- vpunpckhqdq %ymm15,%ymm0,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa 0x00(%rsp),%ymm1
- vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
- cmp $0x0020,%rax
- jl .Lxorpart8
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rax
- jl .Lxorpart8
- vpxor 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vmovdqa 0x40(%rsp),%ymm1
- vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
- cmp $0x0060,%rax
- jl .Lxorpart8
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rax
- jl .Lxorpart8
- vpxor 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vmovdqa 0x20(%rsp),%ymm1
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rax
- jl .Lxorpart8
- vpxor 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rax
- jl .Lxorpart8
- vpxor 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vmovdqa 0x60(%rsp),%ymm1
- vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
- cmp $0x00e0,%rax
- jl .Lxorpart8
- vpxor 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rax
- jl .Lxorpart8
- vpxor 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa %ymm4,%ymm0
- cmp $0x0120,%rax
- jl .Lxorpart8
- vpxor 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0100(%rsi)
-
- vmovdqa %ymm12,%ymm0
- cmp $0x0140,%rax
- jl .Lxorpart8
- vpxor 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0120(%rsi)
-
- vmovdqa %ymm6,%ymm0
- cmp $0x0160,%rax
- jl .Lxorpart8
- vpxor 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0140(%rsi)
-
- vmovdqa %ymm14,%ymm0
- cmp $0x0180,%rax
- jl .Lxorpart8
- vpxor 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0160(%rsi)
-
- vmovdqa %ymm5,%ymm0
- cmp $0x01a0,%rax
- jl .Lxorpart8
- vpxor 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0180(%rsi)
-
- vmovdqa %ymm13,%ymm0
- cmp $0x01c0,%rax
- jl .Lxorpart8
- vpxor 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01a0(%rsi)
-
- vmovdqa %ymm7,%ymm0
- cmp $0x01e0,%rax
- jl .Lxorpart8
- vpxor 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01c0(%rsi)
-
- vmovdqa %ymm15,%ymm0
- cmp $0x0200,%rax
- jl .Lxorpart8
- vpxor 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x1f,%r9
- jz .Ldone8
- and $~0x1f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- vpxor 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
deleted file mode 100644
index 259383e1ad44..000000000000
--- a/arch/x86/crypto/chacha-avx512vl-x86_64.S
+++ /dev/null
@@ -1,836 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
- *
- * Copyright (C) 2018 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
-.align 32
-CTR2BL: .octa 0x00000000000000000000000000000000
- .octa 0x00000000000000000000000000000001
-
-.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
-.align 32
-CTR4BL: .octa 0x00000000000000000000000000000002
- .octa 0x00000000000000000000000000000003
-
-.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
-.align 32
-CTR8BL: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.text
-
-SYM_FUNC_START(chacha_2block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 2 data blocks output, o
- # %rdx: up to 2 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts two ChaCha blocks by loading the state
- # matrix twice across four AVX registers. It performs matrix operations
- # on four words in each matrix in parallel, but requires shuffling to
- # rearrange the words after each round.
-
- vzeroupper
-
- # x0..3[0-2] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
-
- vmovdqa %ymm0,%ymm8
- vmovdqa %ymm1,%ymm9
- vmovdqa %ymm2,%ymm10
- vmovdqa %ymm3,%ymm11
-
-.Ldoubleround:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- # o0 = i0 ^ (x0 + s0)
- vpaddd %ymm8,%ymm0,%ymm7
- cmp $0x10,%rcx
- jl .Lxorpart2
- vpxord 0x00(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x00(%rsi)
- vextracti128 $1,%ymm7,%xmm0
- # o1 = i1 ^ (x1 + s1)
- vpaddd %ymm9,%ymm1,%ymm7
- cmp $0x20,%rcx
- jl .Lxorpart2
- vpxord 0x10(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x10(%rsi)
- vextracti128 $1,%ymm7,%xmm1
- # o2 = i2 ^ (x2 + s2)
- vpaddd %ymm10,%ymm2,%ymm7
- cmp $0x30,%rcx
- jl .Lxorpart2
- vpxord 0x20(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x20(%rsi)
- vextracti128 $1,%ymm7,%xmm2
- # o3 = i3 ^ (x3 + s3)
- vpaddd %ymm11,%ymm3,%ymm7
- cmp $0x40,%rcx
- jl .Lxorpart2
- vpxord 0x30(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x30(%rsi)
- vextracti128 $1,%ymm7,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm7
- cmp $0x50,%rcx
- jl .Lxorpart2
- vpxord 0x40(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm7
- cmp $0x60,%rcx
- jl .Lxorpart2
- vpxord 0x50(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm7
- cmp $0x70,%rcx
- jl .Lxorpart2
- vpxord 0x60(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm7
- cmp $0x80,%rcx
- jl .Lxorpart2
- vpxord 0x70(%rdx),%xmm7,%xmm6
- vmovdqu %xmm6,0x70(%rsi)
-
-.Ldone2:
- vzeroupper
- RET
-
-.Lxorpart2:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone2
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm7,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone2
-
-SYM_FUNC_END(chacha_2block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_4block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four ChaCha blocks by loading the state
- # matrix four times across eight AVX registers. It performs matrix
- # operations on four words in two matrices in parallel, sequentially
- # to the operations on the four words of the other two matrices. The
- # required word shuffling has a rather high latency, we can do the
- # arithmetic on two matrix-pairs without much slowdown.
-
- vzeroupper
-
- # x0..3[0-4] = s0..3
- vbroadcasti128 0x00(%rdi),%ymm0
- vbroadcasti128 0x10(%rdi),%ymm1
- vbroadcasti128 0x20(%rdi),%ymm2
- vbroadcasti128 0x30(%rdi),%ymm3
-
- vmovdqa %ymm0,%ymm4
- vmovdqa %ymm1,%ymm5
- vmovdqa %ymm2,%ymm6
- vmovdqa %ymm3,%ymm7
-
- vpaddd CTR2BL(%rip),%ymm3,%ymm3
- vpaddd CTR4BL(%rip),%ymm7,%ymm7
-
- vmovdqa %ymm0,%ymm11
- vmovdqa %ymm1,%ymm12
- vmovdqa %ymm2,%ymm13
- vmovdqa %ymm3,%ymm14
- vmovdqa %ymm7,%ymm15
-
-.Ldoubleround4:
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm1,%ymm1
- vpshufd $0x39,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm3,%ymm3
- vpshufd $0x93,%ymm7,%ymm7
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $16,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $16,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $12,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- vpaddd %ymm1,%ymm0,%ymm0
- vpxord %ymm0,%ymm3,%ymm3
- vprold $8,%ymm3,%ymm3
-
- vpaddd %ymm5,%ymm4,%ymm4
- vpxord %ymm4,%ymm7,%ymm7
- vprold $8,%ymm7,%ymm7
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- vpaddd %ymm3,%ymm2,%ymm2
- vpxord %ymm2,%ymm1,%ymm1
- vprold $7,%ymm1,%ymm1
-
- vpaddd %ymm7,%ymm6,%ymm6
- vpxord %ymm6,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- vpshufd $0x93,%ymm1,%ymm1
- vpshufd $0x93,%ymm5,%ymm5
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- vpshufd $0x4e,%ymm2,%ymm2
- vpshufd $0x4e,%ymm6,%ymm6
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- vpshufd $0x39,%ymm3,%ymm3
- vpshufd $0x39,%ymm7,%ymm7
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # o0 = i0 ^ (x0 + s0), first block
- vpaddd %ymm11,%ymm0,%ymm10
- cmp $0x10,%rcx
- jl .Lxorpart4
- vpxord 0x00(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x00(%rsi)
- vextracti128 $1,%ymm10,%xmm0
- # o1 = i1 ^ (x1 + s1), first block
- vpaddd %ymm12,%ymm1,%ymm10
- cmp $0x20,%rcx
- jl .Lxorpart4
- vpxord 0x10(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x10(%rsi)
- vextracti128 $1,%ymm10,%xmm1
- # o2 = i2 ^ (x2 + s2), first block
- vpaddd %ymm13,%ymm2,%ymm10
- cmp $0x30,%rcx
- jl .Lxorpart4
- vpxord 0x20(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x20(%rsi)
- vextracti128 $1,%ymm10,%xmm2
- # o3 = i3 ^ (x3 + s3), first block
- vpaddd %ymm14,%ymm3,%ymm10
- cmp $0x40,%rcx
- jl .Lxorpart4
- vpxord 0x30(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x30(%rsi)
- vextracti128 $1,%ymm10,%xmm3
-
- # xor and write second block
- vmovdqa %xmm0,%xmm10
- cmp $0x50,%rcx
- jl .Lxorpart4
- vpxord 0x40(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x40(%rsi)
-
- vmovdqa %xmm1,%xmm10
- cmp $0x60,%rcx
- jl .Lxorpart4
- vpxord 0x50(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x50(%rsi)
-
- vmovdqa %xmm2,%xmm10
- cmp $0x70,%rcx
- jl .Lxorpart4
- vpxord 0x60(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x60(%rsi)
-
- vmovdqa %xmm3,%xmm10
- cmp $0x80,%rcx
- jl .Lxorpart4
- vpxord 0x70(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x70(%rsi)
-
- # o0 = i0 ^ (x0 + s0), third block
- vpaddd %ymm11,%ymm4,%ymm10
- cmp $0x90,%rcx
- jl .Lxorpart4
- vpxord 0x80(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x80(%rsi)
- vextracti128 $1,%ymm10,%xmm4
- # o1 = i1 ^ (x1 + s1), third block
- vpaddd %ymm12,%ymm5,%ymm10
- cmp $0xa0,%rcx
- jl .Lxorpart4
- vpxord 0x90(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0x90(%rsi)
- vextracti128 $1,%ymm10,%xmm5
- # o2 = i2 ^ (x2 + s2), third block
- vpaddd %ymm13,%ymm6,%ymm10
- cmp $0xb0,%rcx
- jl .Lxorpart4
- vpxord 0xa0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xa0(%rsi)
- vextracti128 $1,%ymm10,%xmm6
- # o3 = i3 ^ (x3 + s3), third block
- vpaddd %ymm15,%ymm7,%ymm10
- cmp $0xc0,%rcx
- jl .Lxorpart4
- vpxord 0xb0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xb0(%rsi)
- vextracti128 $1,%ymm10,%xmm7
-
- # xor and write fourth block
- vmovdqa %xmm4,%xmm10
- cmp $0xd0,%rcx
- jl .Lxorpart4
- vpxord 0xc0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xc0(%rsi)
-
- vmovdqa %xmm5,%xmm10
- cmp $0xe0,%rcx
- jl .Lxorpart4
- vpxord 0xd0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xd0(%rsi)
-
- vmovdqa %xmm6,%xmm10
- cmp $0xf0,%rcx
- jl .Lxorpart4
- vpxord 0xe0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xe0(%rsi)
-
- vmovdqa %xmm7,%xmm10
- cmp $0x100,%rcx
- jl .Lxorpart4
- vpxord 0xf0(%rdx),%xmm10,%xmm9
- vmovdqu %xmm9,0xf0(%rsi)
-
-.Ldone4:
- vzeroupper
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0xf,%rcx
- jz .Ldone4
- mov %rax,%r9
- and $~0xf,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
- vpxord %xmm10,%xmm1,%xmm1
- vmovdqu8 %xmm1,(%rsi,%r9){%k1}
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_avx512vl)
-
-SYM_FUNC_START(chacha_8block_xor_avx512vl)
- # %rdi: Input state matrix, s
- # %rsi: up to 8 data blocks output, o
- # %rdx: up to 8 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts eight consecutive ChaCha blocks by loading
- # the state matrix in AVX registers eight times. Compared to AVX2, this
- # mostly benefits from the new rotate instructions in VL and the
- # additional registers.
-
- vzeroupper
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
-
- # x12 += counter values 0-3
- vpaddd CTR8BL(%rip),%ymm12,%ymm12
-
- vmovdqa64 %ymm0,%ymm16
- vmovdqa64 %ymm1,%ymm17
- vmovdqa64 %ymm2,%ymm18
- vmovdqa64 %ymm3,%ymm19
- vmovdqa64 %ymm4,%ymm20
- vmovdqa64 %ymm5,%ymm21
- vmovdqa64 %ymm6,%ymm22
- vmovdqa64 %ymm7,%ymm23
- vmovdqa64 %ymm8,%ymm24
- vmovdqa64 %ymm9,%ymm25
- vmovdqa64 %ymm10,%ymm26
- vmovdqa64 %ymm11,%ymm27
- vmovdqa64 %ymm12,%ymm28
- vmovdqa64 %ymm13,%ymm29
- vmovdqa64 %ymm14,%ymm30
- vmovdqa64 %ymm15,%ymm31
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd %ymm0,%ymm4,%ymm0
- vpxord %ymm0,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd %ymm1,%ymm5,%ymm1
- vpxord %ymm1,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd %ymm2,%ymm6,%ymm2
- vpxord %ymm2,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd %ymm3,%ymm7,%ymm3
- vpxord %ymm3,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxord %ymm8,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxord %ymm9,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxord %ymm10,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxord %ymm11,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $16,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $16,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $16,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $16,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $12,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $12,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $12,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $12,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd %ymm0,%ymm5,%ymm0
- vpxord %ymm0,%ymm15,%ymm15
- vprold $8,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd %ymm1,%ymm6,%ymm1
- vpxord %ymm1,%ymm12,%ymm12
- vprold $8,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd %ymm2,%ymm7,%ymm2
- vpxord %ymm2,%ymm13,%ymm13
- vprold $8,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd %ymm3,%ymm4,%ymm3
- vpxord %ymm3,%ymm14,%ymm14
- vprold $8,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxord %ymm10,%ymm5,%ymm5
- vprold $7,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxord %ymm11,%ymm6,%ymm6
- vprold $7,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxord %ymm8,%ymm7,%ymm7
- vprold $7,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxord %ymm9,%ymm4,%ymm4
- vprold $7,%ymm4,%ymm4
-
- sub $2,%r8d
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpaddd %ymm16,%ymm0,%ymm0
- vpaddd %ymm17,%ymm1,%ymm1
- vpaddd %ymm18,%ymm2,%ymm2
- vpaddd %ymm19,%ymm3,%ymm3
- vpaddd %ymm20,%ymm4,%ymm4
- vpaddd %ymm21,%ymm5,%ymm5
- vpaddd %ymm22,%ymm6,%ymm6
- vpaddd %ymm23,%ymm7,%ymm7
- vpaddd %ymm24,%ymm8,%ymm8
- vpaddd %ymm25,%ymm9,%ymm9
- vpaddd %ymm26,%ymm10,%ymm10
- vpaddd %ymm27,%ymm11,%ymm11
- vpaddd %ymm28,%ymm12,%ymm12
- vpaddd %ymm29,%ymm13,%ymm13
- vpaddd %ymm30,%ymm14,%ymm14
- vpaddd %ymm31,%ymm15,%ymm15
-
- # interleave 32-bit words in state n, n+1
- vpunpckldq %ymm1,%ymm0,%ymm16
- vpunpckhdq %ymm1,%ymm0,%ymm17
- vpunpckldq %ymm3,%ymm2,%ymm18
- vpunpckhdq %ymm3,%ymm2,%ymm19
- vpunpckldq %ymm5,%ymm4,%ymm20
- vpunpckhdq %ymm5,%ymm4,%ymm21
- vpunpckldq %ymm7,%ymm6,%ymm22
- vpunpckhdq %ymm7,%ymm6,%ymm23
- vpunpckldq %ymm9,%ymm8,%ymm24
- vpunpckhdq %ymm9,%ymm8,%ymm25
- vpunpckldq %ymm11,%ymm10,%ymm26
- vpunpckhdq %ymm11,%ymm10,%ymm27
- vpunpckldq %ymm13,%ymm12,%ymm28
- vpunpckhdq %ymm13,%ymm12,%ymm29
- vpunpckldq %ymm15,%ymm14,%ymm30
- vpunpckhdq %ymm15,%ymm14,%ymm31
-
- # interleave 64-bit words in state n, n+2
- vpunpcklqdq %ymm18,%ymm16,%ymm0
- vpunpcklqdq %ymm19,%ymm17,%ymm1
- vpunpckhqdq %ymm18,%ymm16,%ymm2
- vpunpckhqdq %ymm19,%ymm17,%ymm3
- vpunpcklqdq %ymm22,%ymm20,%ymm4
- vpunpcklqdq %ymm23,%ymm21,%ymm5
- vpunpckhqdq %ymm22,%ymm20,%ymm6
- vpunpckhqdq %ymm23,%ymm21,%ymm7
- vpunpcklqdq %ymm26,%ymm24,%ymm8
- vpunpcklqdq %ymm27,%ymm25,%ymm9
- vpunpckhqdq %ymm26,%ymm24,%ymm10
- vpunpckhqdq %ymm27,%ymm25,%ymm11
- vpunpcklqdq %ymm30,%ymm28,%ymm12
- vpunpcklqdq %ymm31,%ymm29,%ymm13
- vpunpckhqdq %ymm30,%ymm28,%ymm14
- vpunpckhqdq %ymm31,%ymm29,%ymm15
-
- # interleave 128-bit words in state n, n+4
- # xor/write first four blocks
- vmovdqa64 %ymm0,%ymm16
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
- cmp $0x0020,%rcx
- jl .Lxorpart8
- vpxord 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0000(%rsi)
- vmovdqa64 %ymm16,%ymm0
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
-
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- cmp $0x0040,%rcx
- jl .Lxorpart8
- vpxord 0x0020(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0020(%rsi)
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
-
- vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
- cmp $0x0060,%rcx
- jl .Lxorpart8
- vpxord 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0040(%rsi)
- vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
-
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- cmp $0x0080,%rcx
- jl .Lxorpart8
- vpxord 0x0060(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0060(%rsi)
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
-
- vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
- cmp $0x00a0,%rcx
- jl .Lxorpart8
- vpxord 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0080(%rsi)
- vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
-
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- cmp $0x00c0,%rcx
- jl .Lxorpart8
- vpxord 0x00a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00a0(%rsi)
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
-
- vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
- cmp $0x00e0,%rcx
- jl .Lxorpart8
- vpxord 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00c0(%rsi)
- vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
-
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- cmp $0x0100,%rcx
- jl .Lxorpart8
- vpxord 0x00e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x00e0(%rsi)
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
-
- # xor remaining blocks, write to output
- vmovdqa64 %ymm4,%ymm0
- cmp $0x0120,%rcx
- jl .Lxorpart8
- vpxord 0x0100(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0100(%rsi)
-
- vmovdqa64 %ymm12,%ymm0
- cmp $0x0140,%rcx
- jl .Lxorpart8
- vpxord 0x0120(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0120(%rsi)
-
- vmovdqa64 %ymm6,%ymm0
- cmp $0x0160,%rcx
- jl .Lxorpart8
- vpxord 0x0140(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0140(%rsi)
-
- vmovdqa64 %ymm14,%ymm0
- cmp $0x0180,%rcx
- jl .Lxorpart8
- vpxord 0x0160(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0160(%rsi)
-
- vmovdqa64 %ymm5,%ymm0
- cmp $0x01a0,%rcx
- jl .Lxorpart8
- vpxord 0x0180(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x0180(%rsi)
-
- vmovdqa64 %ymm13,%ymm0
- cmp $0x01c0,%rcx
- jl .Lxorpart8
- vpxord 0x01a0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01a0(%rsi)
-
- vmovdqa64 %ymm7,%ymm0
- cmp $0x01e0,%rcx
- jl .Lxorpart8
- vpxord 0x01c0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01c0(%rsi)
-
- vmovdqa64 %ymm15,%ymm0
- cmp $0x0200,%rcx
- jl .Lxorpart8
- vpxord 0x01e0(%rdx),%ymm0,%ymm0
- vmovdqu64 %ymm0,0x01e0(%rsi)
-
-.Ldone8:
- vzeroupper
- RET
-
-.Lxorpart8:
- # xor remaining bytes from partial register into output
- mov %rcx,%rax
- and $0x1f,%rcx
- jz .Ldone8
- mov %rax,%r9
- and $~0x1f,%r9
-
- mov $1,%rax
- shld %cl,%rax,%rax
- sub $1,%rax
- kmovq %rax,%k1
-
- vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
- vpxord %ymm0,%ymm1,%ymm1
- vmovdqu8 %ymm1,(%rsi,%r9){%k1}
-
- jmp .Ldone8
-
-SYM_FUNC_END(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
deleted file mode 100644
index 7111949cd5b9..000000000000
--- a/arch/x86/crypto/chacha-ssse3-x86_64.S
+++ /dev/null
@@ -1,791 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-#include <asm/frame.h>
-
-.section .rodata.cst16.ROT8, "aM", @progbits, 16
-.align 16
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
-.align 16
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
-.section .rodata.cst16.CTRINC, "aM", @progbits, 16
-.align 16
-CTRINC: .octa 0x00000003000000020000000100000000
-
-.text
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
- * function performs matrix operations on four words in parallel, but requires
- * shuffling to rearrange the words after each round. 8/16-bit word rotation is
- * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
- * rotation uses traditional shift+OR.
- *
- * The round count is given in %r8d.
- *
- * Clobbers: %r8d, %xmm4-%xmm7
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
- movdqa ROT8(%rip),%xmm4
- movdqa ROT16(%rip),%xmm5
-
-.Ldoubleround:
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm3,%xmm3
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm5,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm6
- pslld $12,%xmm6
- psrld $20,%xmm1
- por %xmm6,%xmm1
-
- # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- pshufb %xmm4,%xmm3
-
- # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm7
- pslld $7,%xmm7
- psrld $25,%xmm1
- por %xmm7,%xmm1
-
- # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
- pshufd $0x93,%xmm1,%xmm1
- # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
- pshufd $0x4e,%xmm2,%xmm2
- # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
- pshufd $0x39,%xmm3,%xmm3
-
- sub $2,%r8d
- jnz .Ldoubleround
-
- RET
-SYM_FUNC_END(chacha_permute)
-
-SYM_FUNC_START(chacha_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 1 data block output, o
- # %rdx: up to 1 data block input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
- FRAME_BEGIN
-
- # x0..3 = s0..3
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
- movdqa %xmm0,%xmm8
- movdqa %xmm1,%xmm9
- movdqa %xmm2,%xmm10
- movdqa %xmm3,%xmm11
-
- mov %rcx,%rax
- call chacha_permute
-
- # o0 = i0 ^ (x0 + s0)
- paddd %xmm8,%xmm0
- cmp $0x10,%rax
- jl .Lxorpart
- movdqu 0x00(%rdx),%xmm4
- pxor %xmm4,%xmm0
- movdqu %xmm0,0x00(%rsi)
- # o1 = i1 ^ (x1 + s1)
- paddd %xmm9,%xmm1
- movdqa %xmm1,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart
- movdqu 0x10(%rdx),%xmm0
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
- # o2 = i2 ^ (x2 + s2)
- paddd %xmm10,%xmm2
- movdqa %xmm2,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart
- movdqu 0x20(%rdx),%xmm0
- pxor %xmm2,%xmm0
- movdqu %xmm0,0x20(%rsi)
- # o3 = i3 ^ (x3 + s3)
- paddd %xmm11,%xmm3
- movdqa %xmm3,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart
- movdqu 0x30(%rdx),%xmm0
- pxor %xmm3,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
-.Ldone:
- FRAME_END
- RET
-
-.Lxorpart:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea 8(%rsp),%r10
- sub $0x10,%rsp
- and $~31,%rsp
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- lea -8(%r10),%rsp
- jmp .Ldone
-
-SYM_FUNC_END(chacha_block_xor_ssse3)
-
-SYM_FUNC_START(hchacha_block_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: output (8 32-bit words)
- # %edx: nrounds
- FRAME_BEGIN
-
- movdqu 0x00(%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqu 0x20(%rdi),%xmm2
- movdqu 0x30(%rdi),%xmm3
-
- mov %edx,%r8d
- call chacha_permute
-
- movdqu %xmm0,0x00(%rsi)
- movdqu %xmm3,0x10(%rsi)
-
- FRAME_END
- RET
-SYM_FUNC_END(hchacha_block_ssse3)
-
-SYM_FUNC_START(chacha_4block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: up to 4 data blocks output, o
- # %rdx: up to 4 data blocks input, i
- # %rcx: input/output length in bytes
- # %r8d: nrounds
-
- # This function encrypts four consecutive ChaCha blocks by loading the
- # the state matrix in SSE registers four times. As we need some scratch
- # registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32- and then 64-bit words,
- # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
- # done with the slightly better performing SSSE3 byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- lea 8(%rsp),%r10
- sub $0x80,%rsp
- and $~63,%rsp
- mov %rcx,%rax
-
- # x0..15[0-3] = s0..3[0..3]
- movq 0x00(%rdi),%xmm1
- pshufd $0x00,%xmm1,%xmm0
- pshufd $0x55,%xmm1,%xmm1
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- movq 0x10(%rdi),%xmm5
- pshufd $0x00,%xmm5,%xmm4
- pshufd $0x55,%xmm5,%xmm5
- movq 0x18(%rdi),%xmm7
- pshufd $0x00,%xmm7,%xmm6
- pshufd $0x55,%xmm7,%xmm7
- movq 0x20(%rdi),%xmm9
- pshufd $0x00,%xmm9,%xmm8
- pshufd $0x55,%xmm9,%xmm9
- movq 0x28(%rdi),%xmm11
- pshufd $0x00,%xmm11,%xmm10
- pshufd $0x55,%xmm11,%xmm11
- movq 0x30(%rdi),%xmm13
- pshufd $0x00,%xmm13,%xmm12
- pshufd $0x55,%xmm13,%xmm13
- movq 0x38(%rdi),%xmm15
- pshufd $0x00,%xmm15,%xmm14
- pshufd $0x55,%xmm15,%xmm15
- # x0..3 on stack
- movdqa %xmm0,0x00(%rsp)
- movdqa %xmm1,0x10(%rsp)
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm3,0x30(%rsp)
-
- movdqa CTRINC(%rip),%xmm1
- movdqa ROT8(%rip),%xmm2
- movdqa ROT16(%rip),%xmm3
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
-.Ldoubleround4:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- paddd %xmm12,%xmm8
- pxor %xmm8,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- paddd %xmm13,%xmm9
- pxor %xmm9,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- paddd %xmm14,%xmm10
- pxor %xmm10,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- paddd %xmm15,%xmm11
- pxor %xmm11,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm3,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm3,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm3,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm3,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $12,%xmm0
- psrld $20,%xmm4
- por %xmm0,%xmm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- movdqa 0x00(%rsp),%xmm0
- paddd %xmm5,%xmm0
- movdqa %xmm0,0x00(%rsp)
- pxor %xmm0,%xmm15
- pshufb %xmm2,%xmm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- movdqa 0x10(%rsp),%xmm0
- paddd %xmm6,%xmm0
- movdqa %xmm0,0x10(%rsp)
- pxor %xmm0,%xmm12
- pshufb %xmm2,%xmm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- movdqa 0x20(%rsp),%xmm0
- paddd %xmm7,%xmm0
- movdqa %xmm0,0x20(%rsp)
- pxor %xmm0,%xmm13
- pshufb %xmm2,%xmm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- movdqa 0x30(%rsp),%xmm0
- paddd %xmm4,%xmm0
- movdqa %xmm0,0x30(%rsp)
- pxor %xmm0,%xmm14
- pshufb %xmm2,%xmm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- paddd %xmm15,%xmm10
- pxor %xmm10,%xmm5
- movdqa %xmm5,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm5
- por %xmm0,%xmm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- paddd %xmm12,%xmm11
- pxor %xmm11,%xmm6
- movdqa %xmm6,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm6
- por %xmm0,%xmm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- paddd %xmm13,%xmm8
- pxor %xmm8,%xmm7
- movdqa %xmm7,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm7
- por %xmm0,%xmm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- paddd %xmm14,%xmm9
- pxor %xmm9,%xmm4
- movdqa %xmm4,%xmm0
- pslld $7,%xmm0
- psrld $25,%xmm4
- por %xmm0,%xmm4
-
- sub $2,%r8d
- jnz .Ldoubleround4
-
- # x0[0-3] += s0[0]
- # x1[0-3] += s0[1]
- movq 0x00(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x00(%rsp),%xmm2
- movdqa %xmm2,0x00(%rsp)
- paddd 0x10(%rsp),%xmm3
- movdqa %xmm3,0x10(%rsp)
- # x2[0-3] += s0[2]
- # x3[0-3] += s0[3]
- movq 0x08(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd 0x20(%rsp),%xmm2
- movdqa %xmm2,0x20(%rsp)
- paddd 0x30(%rsp),%xmm3
- movdqa %xmm3,0x30(%rsp)
-
- # x4[0-3] += s1[0]
- # x5[0-3] += s1[1]
- movq 0x10(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- # x6[0-3] += s1[2]
- # x7[0-3] += s1[3]
- movq 0x18(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm6
- paddd %xmm3,%xmm7
-
- # x8[0-3] += s2[0]
- # x9[0-3] += s2[1]
- movq 0x20(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm8
- paddd %xmm3,%xmm9
- # x10[0-3] += s2[2]
- # x11[0-3] += s2[3]
- movq 0x28(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm10
- paddd %xmm3,%xmm11
-
- # x12[0-3] += s3[0]
- # x13[0-3] += s3[1]
- movq 0x30(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm12
- paddd %xmm3,%xmm13
- # x14[0-3] += s3[2]
- # x15[0-3] += s3[3]
- movq 0x38(%rdi),%xmm3
- pshufd $0x00,%xmm3,%xmm2
- pshufd $0x55,%xmm3,%xmm3
- paddd %xmm2,%xmm14
- paddd %xmm3,%xmm15
-
- # x12 += counter values 0-3
- paddd %xmm1,%xmm12
-
- # interleave 32-bit words in state n, n+1
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x10(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x10(%rsp)
- movdqa 0x20(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpckldq %xmm1,%xmm2
- punpckhdq %xmm1,%xmm0
- movdqa %xmm2,0x20(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpckldq %xmm5,%xmm4
- punpckhdq %xmm5,%xmm0
- movdqa %xmm0,%xmm5
- movdqa %xmm6,%xmm0
- punpckldq %xmm7,%xmm6
- punpckhdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm9,%xmm0
- movdqa %xmm0,%xmm9
- movdqa %xmm10,%xmm0
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpckldq %xmm13,%xmm12
- punpckhdq %xmm13,%xmm0
- movdqa %xmm0,%xmm13
- movdqa %xmm14,%xmm0
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # interleave 64-bit words in state n, n+2
- movdqa 0x00(%rsp),%xmm0
- movdqa 0x20(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x00(%rsp)
- movdqa %xmm0,0x20(%rsp)
- movdqa 0x10(%rsp),%xmm0
- movdqa 0x30(%rsp),%xmm1
- movdqa %xmm0,%xmm2
- punpcklqdq %xmm1,%xmm2
- punpckhqdq %xmm1,%xmm0
- movdqa %xmm2,0x10(%rsp)
- movdqa %xmm0,0x30(%rsp)
- movdqa %xmm4,%xmm0
- punpcklqdq %xmm6,%xmm4
- punpckhqdq %xmm6,%xmm0
- movdqa %xmm0,%xmm6
- movdqa %xmm5,%xmm0
- punpcklqdq %xmm7,%xmm5
- punpckhqdq %xmm7,%xmm0
- movdqa %xmm0,%xmm7
- movdqa %xmm8,%xmm0
- punpcklqdq %xmm10,%xmm8
- punpckhqdq %xmm10,%xmm0
- movdqa %xmm0,%xmm10
- movdqa %xmm9,%xmm0
- punpcklqdq %xmm11,%xmm9
- punpckhqdq %xmm11,%xmm0
- movdqa %xmm0,%xmm11
- movdqa %xmm12,%xmm0
- punpcklqdq %xmm14,%xmm12
- punpckhqdq %xmm14,%xmm0
- movdqa %xmm0,%xmm14
- movdqa %xmm13,%xmm0
- punpcklqdq %xmm15,%xmm13
- punpckhqdq %xmm15,%xmm0
- movdqa %xmm0,%xmm15
-
- # xor with corresponding input, write to output
- movdqa 0x00(%rsp),%xmm0
- cmp $0x10,%rax
- jl .Lxorpart4
- movdqu 0x00(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x00(%rsi)
-
- movdqu %xmm4,%xmm0
- cmp $0x20,%rax
- jl .Lxorpart4
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x10(%rsi)
-
- movdqu %xmm8,%xmm0
- cmp $0x30,%rax
- jl .Lxorpart4
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x20(%rsi)
-
- movdqu %xmm12,%xmm0
- cmp $0x40,%rax
- jl .Lxorpart4
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x30(%rsi)
-
- movdqa 0x20(%rsp),%xmm0
- cmp $0x50,%rax
- jl .Lxorpart4
- movdqu 0x40(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x40(%rsi)
-
- movdqu %xmm6,%xmm0
- cmp $0x60,%rax
- jl .Lxorpart4
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x50(%rsi)
-
- movdqu %xmm10,%xmm0
- cmp $0x70,%rax
- jl .Lxorpart4
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x60(%rsi)
-
- movdqu %xmm14,%xmm0
- cmp $0x80,%rax
- jl .Lxorpart4
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x70(%rsi)
-
- movdqa 0x10(%rsp),%xmm0
- cmp $0x90,%rax
- jl .Lxorpart4
- movdqu 0x80(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
-
- movdqu %xmm5,%xmm0
- cmp $0xa0,%rax
- jl .Lxorpart4
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0x90(%rsi)
-
- movdqu %xmm9,%xmm0
- cmp $0xb0,%rax
- jl .Lxorpart4
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xa0(%rsi)
-
- movdqu %xmm13,%xmm0
- cmp $0xc0,%rax
- jl .Lxorpart4
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xb0(%rsi)
-
- movdqa 0x30(%rsp),%xmm0
- cmp $0xd0,%rax
- jl .Lxorpart4
- movdqu 0xc0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xc0(%rsi)
-
- movdqu %xmm7,%xmm0
- cmp $0xe0,%rax
- jl .Lxorpart4
- movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xd0(%rsi)
-
- movdqu %xmm11,%xmm0
- cmp $0xf0,%rax
- jl .Lxorpart4
- movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xe0(%rsi)
-
- movdqu %xmm15,%xmm0
- cmp $0x100,%rax
- jl .Lxorpart4
- movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm0
- movdqu %xmm0,0xf0(%rsi)
-
-.Ldone4:
- lea -8(%r10),%rsp
- RET
-
-.Lxorpart4:
- # xor remaining bytes from partial register into output
- mov %rax,%r9
- and $0x0f,%r9
- jz .Ldone4
- and $~0x0f,%rax
-
- mov %rsi,%r11
-
- lea (%rdx,%rax),%rsi
- mov %rsp,%rdi
- mov %r9,%rcx
- rep movsb
-
- pxor 0x00(%rsp),%xmm0
- movdqa %xmm0,0x00(%rsp)
-
- mov %rsp,%rsi
- lea (%r11,%rax),%rdi
- mov %r9,%rcx
- rep movsb
-
- jmp .Ldone4
-
-SYM_FUNC_END(chacha_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
deleted file mode 100644
index 8bb74a272879..000000000000
--- a/arch/x86/crypto/chacha_glue.c
+++ /dev/null
@@ -1,311 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
- unsigned int len, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
-
-static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
-{
- len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
- return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
-}
-
-static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- if (IS_ENABLED(CONFIG_AS_AVX512) &&
- static_branch_likely(&chacha_use_avx512vl)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_2block_xor_avx512vl(state, dst, src, bytes,
- nrounds);
- state[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- if (static_branch_likely(&chacha_use_avx2)) {
- while (bytes >= CHACHA_BLOCK_SIZE * 8) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 8;
- src += CHACHA_BLOCK_SIZE * 8;
- dst += CHACHA_BLOCK_SIZE * 8;
- state[12] += 8;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 4) {
- chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
- state[12] += chacha_advance(bytes, 8);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE * 2) {
- chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
- state[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
- state[12] += chacha_advance(bytes, 2);
- return;
- }
- }
-
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- if (bytes > CHACHA_BLOCK_SIZE) {
- chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
- state[12] += chacha_advance(bytes, 4);
- return;
- }
- if (bytes) {
- chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
- state[12]++;
- }
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
- hchacha_block_generic(state, stream, nrounds);
- } else {
- kernel_fpu_begin();
- hchacha_block_ssse3(state, stream, nrounds);
- kernel_fpu_end();
- }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
- int nrounds)
-{
- if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
- bytes <= CHACHA_BLOCK_SIZE)
- return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
- do {
- unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
- kernel_fpu_begin();
- chacha_dosimd(state, dst, src, todo, nrounds);
- kernel_fpu_end();
-
- bytes -= todo;
- src += todo;
- dst += todo;
- } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-static int chacha_simd_stream_xor(struct skcipher_request *req,
- const struct chacha_ctx *ctx, const u8 *iv)
-{
- u32 state[CHACHA_STATE_WORDS] __aligned(8);
- struct skcipher_walk walk;
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- chacha_init(state, ctx->key, iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
-
- if (!static_branch_likely(&chacha_use_simd) ||
- !crypto_simd_usable()) {
- chacha_crypt_generic(state, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes,
- ctx->nrounds);
- } else {
- kernel_fpu_begin();
- chacha_dosimd(state, walk.dst.virt.addr,
- walk.src.virt.addr, nbytes,
- ctx->nrounds);
- kernel_fpu_end();
- }
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- }
-
- return err;
-}
-
-static int chacha_simd(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- return chacha_simd_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_simd(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 state[CHACHA_STATE_WORDS] __aligned(8);
- struct chacha_ctx subctx;
- u8 real_iv[16];
-
- chacha_init(state, ctx->key, req->iv);
-
- if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
- kernel_fpu_begin();
- hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
- kernel_fpu_end();
- } else {
- hchacha_block_generic(state, subctx.key, ctx->nrounds);
- }
- subctx.nrounds = ctx->nrounds;
-
- memcpy(&real_iv[0], req->iv + 24, 8);
- memcpy(&real_iv[8], req->iv + 16, 8);
- return chacha_simd_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
- {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-simd",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = CHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha20_setkey,
- .encrypt = chacha_simd,
- .decrypt = chacha_simd,
- }, {
- .base.cra_name = "xchacha20",
- .base.cra_driver_name = "xchacha20-simd",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha20_setkey,
- .encrypt = xchacha_simd,
- .decrypt = xchacha_simd,
- }, {
- .base.cra_name = "xchacha12",
- .base.cra_driver_name = "xchacha12-simd",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .setkey = chacha12_setkey,
- .encrypt = xchacha_simd,
- .decrypt = xchacha_simd,
- },
-};
-
-static int __init chacha_simd_mod_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return 0;
-
- static_branch_enable(&chacha_use_simd);
-
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- static_branch_enable(&chacha_use_avx2);
-
- if (IS_ENABLED(CONFIG_AS_AVX512) &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
- static_branch_enable(&chacha_use_avx512vl);
- }
- return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ?
- crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-simd");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 99cb983ded9e..c4fbaa82ed7a 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -103,8 +103,8 @@ SYM_FUNC_START(clmul_ghash_mul)
SYM_FUNC_END(clmul_ghash_mul)
/*
- * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- * const le128 *shash);
+ * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
+ * const le128 *shash);
*/
SYM_FUNC_START(clmul_ghash_update)
FRAME_BEGIN
@@ -127,6 +127,7 @@ SYM_FUNC_START(clmul_ghash_update)
pshufb BSWAP, DATA
movups DATA, (%rdi)
.Lupdate_just_ret:
+ mov %rdx, %rax
FRAME_END
RET
SYM_FUNC_END(clmul_ghash_update)
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index c759ec808bf1..aea5d4d06be7 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -7,41 +7,27 @@
* Author: Huang Ying <ying.huang@intel.com>
*/
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/crypto.h>
-#include <crypto/algapi.h>
-#include <crypto/cryptd.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/utils.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
#include <linux/unaligned.h>
-#define GHASH_BLOCK_SIZE 16
-#define GHASH_DIGEST_SIZE 16
+asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash);
-void clmul_ghash_mul(char *dst, const le128 *shash);
+asmlinkage int clmul_ghash_update(char *dst, const char *src,
+ unsigned int srclen, const le128 *shash);
-void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
- const le128 *shash);
-
-struct ghash_async_ctx {
- struct cryptd_ahash *cryptd_tfm;
-};
-
-struct ghash_ctx {
+struct x86_ghash_ctx {
le128 shash;
};
-struct ghash_desc_ctx {
- u8 buffer[GHASH_BLOCK_SIZE];
- u32 bytes;
-};
-
static int ghash_init(struct shash_desc *desc)
{
struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
@@ -54,7 +40,7 @@ static int ghash_init(struct shash_desc *desc)
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *key, unsigned int keylen)
{
- struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
+ struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm);
u64 a, b;
if (keylen != GHASH_BLOCK_SIZE)
@@ -95,64 +81,38 @@ static int ghash_setkey(struct crypto_shash *tfm,
static int ghash_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
+ struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
u8 *dst = dctx->buffer;
+ int remain;
kernel_fpu_begin();
- if (dctx->bytes) {
- int n = min(srclen, dctx->bytes);
- u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
-
- dctx->bytes -= n;
- srclen -= n;
-
- while (n--)
- *pos++ ^= *src++;
-
- if (!dctx->bytes)
- clmul_ghash_mul(dst, &ctx->shash);
- }
-
- clmul_ghash_update(dst, src, srclen, &ctx->shash);
+ remain = clmul_ghash_update(dst, src, srclen, &ctx->shash);
kernel_fpu_end();
-
- if (srclen & 0xf) {
- src += srclen - (srclen & 0xf);
- srclen &= 0xf;
- dctx->bytes = GHASH_BLOCK_SIZE - srclen;
- while (srclen--)
- *dst++ ^= *src++;
- }
-
- return 0;
+ return remain;
}
-static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx,
+ const u8 *src, unsigned int len)
{
u8 *dst = dctx->buffer;
- if (dctx->bytes) {
- u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
-
- while (dctx->bytes--)
- *tmp++ ^= 0;
-
- kernel_fpu_begin();
+ kernel_fpu_begin();
+ if (len) {
+ crypto_xor(dst, src, len);
clmul_ghash_mul(dst, &ctx->shash);
- kernel_fpu_end();
}
-
- dctx->bytes = 0;
+ kernel_fpu_end();
}
-static int ghash_final(struct shash_desc *desc, u8 *dst)
+static int ghash_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int len, u8 *dst)
{
+ struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
- struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
u8 *buf = dctx->buffer;
- ghash_flush(ctx, dctx);
+ ghash_flush(ctx, dctx, src, len);
memcpy(dst, buf, GHASH_BLOCK_SIZE);
return 0;
@@ -162,186 +122,20 @@ static struct shash_alg ghash_alg = {
.digestsize = GHASH_DIGEST_SIZE,
.init = ghash_init,
.update = ghash_update,
- .final = ghash_final,
+ .finup = ghash_finup,
.setkey = ghash_setkey,
.descsize = sizeof(struct ghash_desc_ctx),
.base = {
- .cra_name = "__ghash",
- .cra_driver_name = "__ghash-pclmulqdqni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_name = "ghash",
+ .cra_driver_name = "ghash-pclmulqdqni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = GHASH_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct ghash_ctx),
+ .cra_ctxsize = sizeof(struct x86_ghash_ctx),
.cra_module = THIS_MODULE,
},
};
-static int ghash_async_init(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
- struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
- struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
- desc->tfm = child;
- return crypto_shash_init(desc);
-}
-
-static void ghash_init_cryptd_req(struct ahash_request *req)
-{
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
- struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
- ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
- ahash_request_set_callback(cryptd_req, req->base.flags,
- req->base.complete, req->base.data);
- ahash_request_set_crypt(cryptd_req, req->src, req->result,
- req->nbytes);
-}
-
-static int ghash_async_update(struct ahash_request *req)
-{
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
- struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
- if (!crypto_simd_usable() ||
- (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
- ghash_init_cryptd_req(req);
- return crypto_ahash_update(cryptd_req);
- } else {
- struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
- return shash_ahash_update(req, desc);
- }
-}
-
-static int ghash_async_final(struct ahash_request *req)
-{
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
- struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
- if (!crypto_simd_usable() ||
- (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
- ghash_init_cryptd_req(req);
- return crypto_ahash_final(cryptd_req);
- } else {
- struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
- return crypto_shash_final(desc, req->result);
- }
-}
-
-static int ghash_async_import(struct ahash_request *req, const void *in)
-{
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
- struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-
- ghash_async_init(req);
- memcpy(dctx, in, sizeof(*dctx));
- return 0;
-
-}
-
-static int ghash_async_export(struct ahash_request *req, void *out)
-{
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
- struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-
- memcpy(out, dctx, sizeof(*dctx));
- return 0;
-
-}
-
-static int ghash_async_digest(struct ahash_request *req)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
- struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
- struct ahash_request *cryptd_req = ahash_request_ctx(req);
- struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
-
- if (!crypto_simd_usable() ||
- (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) {
- ghash_init_cryptd_req(req);
- return crypto_ahash_digest(cryptd_req);
- } else {
- struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
- struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
-
- desc->tfm = child;
- return shash_ahash_digest(req, desc);
- }
-}
-
-static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
- struct crypto_ahash *child = &ctx->cryptd_tfm->base;
-
- crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
- & CRYPTO_TFM_REQ_MASK);
- return crypto_ahash_setkey(child, key, keylen);
-}
-
-static int ghash_async_init_tfm(struct crypto_tfm *tfm)
-{
- struct cryptd_ahash *cryptd_tfm;
- struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
-
- cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
- CRYPTO_ALG_INTERNAL,
- CRYPTO_ALG_INTERNAL);
- if (IS_ERR(cryptd_tfm))
- return PTR_ERR(cryptd_tfm);
- ctx->cryptd_tfm = cryptd_tfm;
- crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
- sizeof(struct ahash_request) +
- crypto_ahash_reqsize(&cryptd_tfm->base));
-
- return 0;
-}
-
-static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
-{
- struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
-
- cryptd_free_ahash(ctx->cryptd_tfm);
-}
-
-static struct ahash_alg ghash_async_alg = {
- .init = ghash_async_init,
- .update = ghash_async_update,
- .final = ghash_async_final,
- .setkey = ghash_async_setkey,
- .digest = ghash_async_digest,
- .export = ghash_async_export,
- .import = ghash_async_import,
- .halg = {
- .digestsize = GHASH_DIGEST_SIZE,
- .statesize = sizeof(struct ghash_desc_ctx),
- .base = {
- .cra_name = "ghash",
- .cra_driver_name = "ghash-clmulni",
- .cra_priority = 400,
- .cra_ctxsize = sizeof(struct ghash_async_ctx),
- .cra_flags = CRYPTO_ALG_ASYNC,
- .cra_blocksize = GHASH_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- .cra_init = ghash_async_init_tfm,
- .cra_exit = ghash_async_exit_tfm,
- },
- },
-};
-
static const struct x86_cpu_id pcmul_cpu_id[] = {
X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */
{}
@@ -350,29 +144,14 @@ MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id);
static int __init ghash_pclmulqdqni_mod_init(void)
{
- int err;
-
if (!x86_match_cpu(pcmul_cpu_id))
return -ENODEV;
- err = crypto_register_shash(&ghash_alg);
- if (err)
- goto err_out;
- err = crypto_register_ahash(&ghash_async_alg);
- if (err)
- goto err_shash;
-
- return 0;
-
-err_shash:
- crypto_unregister_shash(&ghash_alg);
-err_out:
- return err;
+ return crypto_register_shash(&ghash_alg);
}
static void __exit ghash_pclmulqdqni_mod_exit(void)
{
- crypto_unregister_ahash(&ghash_async_alg);
crypto_unregister_shash(&ghash_alg);
}
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
deleted file mode 100644
index b9abcd79c1f4..000000000000
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ /dev/null
@@ -1,4248 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
-#
-# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
-# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
-# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
-#
-# This code is taken from the OpenSSL project but the author, Andy Polyakov,
-# has relicensed it under the licenses specified in the SPDX header above.
-# The original headers, including the original license headers, are
-# included below for completeness.
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Initial release.
-#
-# December 2016
-#
-# Add AVX512F+VL+BW code path.
-#
-# November 2017
-#
-# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
-# executed even on Knights Landing. Trigger for modification was
-# observation that AVX512 code paths can negatively affect overall
-# Skylake-X system performance. Since we are likely to suppress
-# AVX512F capability flag [at least on Skylake-X], conversion serves
-# as kind of "investment protection". Note that next *lake processor,
-# Cannonlake, has AVX512IFMA code path to execute...
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
-# P4 4.46/+120% -
-# Core 2 2.41/+90% -
-# Westmere 1.88/+120% -
-# Sandy Bridge 1.39/+140% 1.10
-# Haswell 1.14/+175% 1.11 0.65
-# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
-# Silvermont 2.83/+95% -
-# Knights L 3.60/? 1.65 1.10 0.41(***)
-# Goldmont 1.70/+180% -
-# VIA Nano 1.82/+150% -
-# Sledgehammer 1.38/+160% -
-# Bulldozer 2.30/+130% 0.97
-# Ryzen 1.15/+200% 1.08 1.18
-#
-# (*) improvement coefficients relative to clang are more modest and
-# are ~50% on most processors, in both cases we are comparing to
-# __int128 code;
-# (**) SSE2 implementation was attempted, but among non-AVX processors
-# it was faster than integer-only code only on older Intel P4 and
-# Core processors, 50-30%, less newer processor is, but slower on
-# contemporary ones, for example almost 2x slower on Atom, and as
-# former are naturally disappearing, SSE2 is deemed unnecessary;
-# (***) strangely enough performance seems to vary from core to core,
-# listed result is best case;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-$kernel=0; $kernel=1 if (!$flavour && !$output);
-
-if (!$kernel) {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
- }
-
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
- $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
- $avx += 1 if ($1==2.11 && $2>=8);
- }
-
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- }
-
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
- $avx = ($2>=3.0) + ($2>3.0);
- }
-} else {
- $avx = 4; # The kernel uses ifdefs for this.
-}
-
-sub declare_function() {
- my ($name, $align, $nargs) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_START($name)\n";
- $code .= ".L$name:\n";
- } else {
- $code .= ".globl $name\n";
- $code .= ".type $name,\@function,$nargs\n";
- $code .= ".align $align\n";
- $code .= "$name:\n";
- }
-}
-
-sub end_function() {
- my ($name) = @_;
- if($kernel) {
- $code .= "SYM_FUNC_END($name)\n";
- } else {
- $code .= ".size $name,.-$name\n";
- }
-}
-
-$code.=<<___ if $kernel;
-#include <linux/linkage.h>
-___
-
-if ($avx) {
-$code.=<<___ if $kernel;
-.section .rodata
-___
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
-
-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-___
-}
-$code.=<<___ if (!$kernel);
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len); # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
-my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
-
-sub poly1305_iteration {
-# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output: $h0-$h2 *= $r0-$r1
-$code.=<<___;
- mulq $h0 # h0*r1
- mov %rax,$d2
- mov $r0,%rax
- mov %rdx,$d3
-
- mulq $h0 # h0*r0
- mov %rax,$h0 # future $h0
- mov $r0,%rax
- mov %rdx,$d1
-
- mulq $h1 # h1*r0
- add %rax,$d2
- mov $s1,%rax
- adc %rdx,$d3
-
- mulq $h1 # h1*s1
- mov $h2,$h1 # borrow $h1
- add %rax,$h0
- adc %rdx,$d1
-
- imulq $s1,$h1 # h2*s1
- add $h1,$d2
- mov $d1,$h1
- adc \$0,$d3
-
- imulq $r0,$h2 # h2*r0
- add $d2,$h1
- mov \$-4,%rax # mask value
- adc $h2,$d3
-
- and $d3,%rax # last reduction step
- mov $d3,$h2
- shr \$2,$d3
- and \$3,$h2
- add $d3,%rax
- add %rax,$h0
- adc \$0,$h1
- adc \$0,$h2
-___
-}
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^64
-# unsigned __int64 r[2]; # key value base 2^64
-
-$code.=<<___;
-.text
-___
-$code.=<<___ if (!$kernel);
-.extern OPENSSL_ia32cap_P
-
-.globl poly1305_init_x86_64
-.hidden poly1305_init_x86_64
-.globl poly1305_blocks_x86_64
-.hidden poly1305_blocks_x86_64
-.globl poly1305_emit_x86_64
-.hidden poly1305_emit_x86_64
-___
-&declare_function("poly1305_init_x86_64", 32, 3);
-$code.=<<___;
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
- test $inp,$inp
- je .Lno_key
-___
-$code.=<<___ if (!$kernel);
- lea poly1305_blocks_x86_64(%rip),%r10
- lea poly1305_emit_x86_64(%rip),%r11
-___
-$code.=<<___ if (!$kernel && $avx);
- mov OPENSSL_ia32cap_P+4(%rip),%r9
- lea poly1305_blocks_avx(%rip),%rax
- lea poly1305_emit_avx(%rip),%rcx
- bt \$`60-32`,%r9 # AVX?
- cmovc %rax,%r10
- cmovc %rcx,%r11
-___
-$code.=<<___ if (!$kernel && $avx>1);
- lea poly1305_blocks_avx2(%rip),%rax
- bt \$`5+32`,%r9 # AVX2?
- cmovc %rax,%r10
-___
-$code.=<<___ if (!$kernel && $avx>3);
- mov \$`(1<<31|1<<21|1<<16)`,%rax
- shr \$32,%r9
- and %rax,%r9
- cmp %rax,%r9
- je .Linit_base2_44
-___
-$code.=<<___;
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- and 8($inp),%rcx
- mov %rax,24($ctx)
- mov %rcx,32($ctx)
-___
-$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
-.Lno_key:
- RET
-___
-&end_function("poly1305_init_x86_64");
-
-&declare_function("poly1305_blocks_x86_64", 32, 4);
-$code.=<<___;
-.cfi_startproc
-.Lblocks:
- shr \$4,$len
- jz .Lno_data # too short
-
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
- push $ctx
-.cfi_push $ctx
-.Lblocks_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2
-
- mov $s1,$r1
- shr \$2,$s1
- mov $r1,%rax
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
- jmp .Loop
-
-.align 32
-.Loop:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-___
-
- &poly1305_iteration();
-
-$code.=<<___;
- mov $r1,%rax
- dec %r15 # len-=16
- jnz .Loop
-
- mov 0(%rsp),$ctx
-.cfi_restore $ctx
-
- mov $h0,0($ctx) # store hash value
- mov $h1,8($ctx)
- mov $h2,16($ctx)
-
- mov 8(%rsp),%r15
-.cfi_restore %r15
- mov 16(%rsp),%r14
-.cfi_restore %r14
- mov 24(%rsp),%r13
-.cfi_restore %r13
- mov 32(%rsp),%r12
-.cfi_restore %r12
- mov 40(%rsp),%rbx
-.cfi_restore %rbx
- lea 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lno_data:
-.Lblocks_epilogue:
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_x86_64");
-
-&declare_function("poly1305_emit_x86_64", 32, 3);
-$code.=<<___;
-.Lemit:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_x86_64");
-if ($avx) {
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int64 r[2]; # key value base 2^64
-# unsigned __int64 pad;
-# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
- map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type __poly1305_block,\@abi-omnipotent
-.align 32
-__poly1305_block:
- push $ctx
-___
- &poly1305_iteration();
-$code.=<<___;
- pop $ctx
- RET
-.size __poly1305_block,.-__poly1305_block
-
-.type __poly1305_init_avx,\@abi-omnipotent
-.align 32
-__poly1305_init_avx:
- push %rbp
- mov %rsp,%rbp
- mov $r0,$h0
- mov $r1,$h1
- xor $h2,$h2
-
- lea 48+64($ctx),$ctx # size optimization
-
- mov $r1,%rax
- call __poly1305_block # r^2
-
- mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
- mov \$0x3ffffff,%edx
- mov $h0,$d1
- and $h0#d,%eax
- mov $r0,$d2
- and $r0#d,%edx
- mov %eax,`16*0+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*0+4-64`($ctx)
- shr \$26,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*1+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*1+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*2+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*2+4-64`($ctx)
- shr \$26,$d2
-
- mov $h1,%rax
- mov $r1,%rdx
- shl \$12,%rax
- shl \$12,%rdx
- or $d1,%rax
- or $d2,%rdx
- and \$0x3ffffff,%eax
- and \$0x3ffffff,%edx
- mov %eax,`16*3+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*3+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*4+0-64`($ctx)
- mov $h1,$d1
- mov %edx,`16*4+4-64`($ctx)
- mov $r1,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- shr \$14,$d2
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*5+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*5+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*6+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*6+4-64`($ctx)
- shr \$26,$d2
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+0-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d2#d,`16*7+4-64`($ctx)
- lea ($d2,$d2,4),$d2 # *5
- mov $d1#d,`16*8+0-64`($ctx)
- mov $d2#d,`16*8+4-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^3
-
- mov \$0x3ffffff,%eax # save r^3 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+12-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+12-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+12-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+12-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+12-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^4
-
- mov \$0x3ffffff,%eax # save r^4 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+8-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+8-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+8-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+8-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+8-64`($ctx)
-
- lea -48-64($ctx),$ctx # size [de-]optimization
- pop %rbp
- RET
-.size __poly1305_init_avx,.-__poly1305_init_avx
-___
-
-&declare_function("poly1305_blocks_avx", 32, 4);
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- and \$-16,$len
- jz .Lno_data_avx
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx
-
- test \$31,$len
- jz .Leven_avx
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-
- call __poly1305_block
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- sub \$16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$31,$len
- jz .Linit_avx
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
-
-.Linit_avx:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx:
- mov %r15,$len
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-.cfi_endproc
-
-.align 32
-.Leven_avx:
-.cfi_startproc
- vmovd 4*0($ctx),$H0 # load hash value
- vmovd 4*1($ctx),$H1
- vmovd 4*2($ctx),$H2
- vmovd 4*3($ctx),$H3
- vmovd 4*4($ctx),$H4
-
-.Ldo_avx:
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- and \$-32,%rsp
- sub \$-8,%rsp
- lea -0x58(%rsp),%r11
- sub \$0x178,%rsp
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x218,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
- sub \$64,$len
- lea -32($inp),%rax
- cmovc %rax,$inp
-
- vmovdqu `16*3`($ctx),$D4 # preload r0^2
- lea `16*3+64`($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- ################################################################
- # load input
- vmovdqu 16*2($inp),$T0
- vmovdqu 16*3($inp),$T1
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- vpsrlq \$40,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- jbe .Lskip_loop_avx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*1-64`($ctx),$D1
- vmovdqu `16*2-64`($ctx),$D2
- vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
- vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
- vmovdqa $D3,-0x90(%r11)
- vmovdqa $D0,0x00(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vmovdqu `16*3-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x80(%r11)
- vmovdqa $D1,0x10(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqu `16*4-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x70(%r11)
- vmovdqa $D2,0x20(%rsp)
- vpshufd \$0xEE,$D0,$D4
- vmovdqu `16*5-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D4,-0x60(%r11)
- vmovdqa $D0,0x30(%rsp)
- vpshufd \$0xEE,$D1,$D3
- vmovdqu `16*6-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D3,-0x50(%r11)
- vmovdqa $D1,0x40(%rsp)
- vpshufd \$0xEE,$D2,$D4
- vmovdqu `16*7-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D4,-0x40(%r11)
- vmovdqa $D2,0x50(%rsp)
- vpshufd \$0xEE,$D0,$D3
- vmovdqu `16*8-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D3,-0x30(%r11)
- vmovdqa $D0,0x60(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x20(%r11)
- vmovdqa $D1,0x70(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x10(%r11)
- vmovdqa $D2,0x80(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- #
- # Note that we start with inp[2:3]*r^2. This is because it
- # doesn't depend on reduction in previous iteration.
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # though note that $Tx and $Hx are "reversed" in this section,
- # and $D4 is preloaded with r0^2...
-
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vmovdqa $H2,0x20(%r11) # offload hash
- vpmuludq $T2,$D4,$D2 # d3 = h2*r0
- vmovdqa 0x10(%rsp),$H2 # r1^2
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vmovdqa $H0,0x00(%r11) #
- vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
- vmovdqa $H1,0x10(%r11) #
- vpmuludq $T3,$H2,$H1 # h3*r1
- vpaddq $H0,$D0,$D0 # d0 += h4*s1
- vpaddq $H1,$D4,$D4 # d4 += h3*r1
- vmovdqa $H3,0x30(%r11) #
- vpmuludq $T2,$H2,$H0 # h2*r1
- vpmuludq $T1,$H2,$H1 # h1*r1
- vpaddq $H0,$D3,$D3 # d3 += h2*r1
- vmovdqa 0x30(%rsp),$H3 # r2^2
- vpaddq $H1,$D2,$D2 # d2 += h1*r1
- vmovdqa $H4,0x40(%r11) #
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpmuludq $T2,$H3,$H0 # h2*r2
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa 0x40(%rsp),$H4 # s2^2
- vpaddq $H0,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H3,$H1 # h1*r2
- vpmuludq $T0,$H3,$H3 # h0*r2
- vpaddq $H1,$D3,$D3 # d3 += h1*r2
- vmovdqa 0x50(%rsp),$H2 # r3^2
- vpaddq $H3,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H4,$H0 # h4*s2
- vpmuludq $T3,$H4,$H4 # h3*s2
- vpaddq $H0,$D1,$D1 # d1 += h4*s2
- vmovdqa 0x60(%rsp),$H3 # s3^2
- vpaddq $H4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa 0x80(%rsp),$H4 # s4^2
- vpmuludq $T1,$H2,$H1 # h1*r3
- vpmuludq $T0,$H2,$H2 # h0*r3
- vpaddq $H1,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $T4,$H3,$H0 # h4*s3
- vpmuludq $T3,$H3,$H1 # h3*s3
- vpaddq $H0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*0($inp),$H0 # load input
- vpaddq $H1,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H3,$H3 # h2*s3
- vpmuludq $T2,$H4,$T2 # h2*s4
- vpaddq $H3,$D0,$D0 # d0 += h2*s3
-
- vmovdqu 16*1($inp),$H1 #
- vpaddq $T2,$D1,$D1 # d1 += h2*s4
- vpmuludq $T3,$H4,$T3 # h3*s4
- vpmuludq $T4,$H4,$T4 # h4*s4
- vpsrldq \$6,$H0,$H2 # splat input
- vpaddq $T3,$D2,$D2 # d2 += h3*s4
- vpaddq $T4,$D3,$D3 # d3 += h4*s4
- vpsrldq \$6,$H1,$H3 #
- vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
- vpmuludq $T1,$H4,$T0 # h1*s4
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpaddq $T4,$D4,$D4 # d4 += h0*r4
- vmovdqa -0x90(%r11),$T4 # r0^4
- vpaddq $T0,$D0,$D0 # d0 += h1*s4
-
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- #vpsrlq \$40,$H4,$H4 # 4
- vpsrldq \$`40/8`,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpand 0(%rcx),$H4,$H4 # .Lmask24
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpaddq 0x00(%r11),$H0,$H0 # add hash value
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- lea 16*2($inp),%rax
- lea 16*4($inp),$inp
- sub \$64,$len
- cmovc %rax,$inp
-
- ################################################################
- # Now we accumulate (inp[0:1]+hash)*r^4
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vmovdqa -0x80(%r11),$T2 # r1^4
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T0,$D2,$D2
- vpaddq $T1,$D3,$D3
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
- vpaddq $T4,$D4,$D4
-
- vpaddq $T0,$D0,$D0 # d0 += h4*s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vmovdqa -0x60(%r11),$T3 # r2^4
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpmuludq $H1,$T2,$T1 # h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T1,$D2,$D2 # d2 += h1*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa -0x50(%r11),$T4 # s2^4
- vpmuludq $H2,$T3,$T0 # h2*r2
- vpmuludq $H1,$T3,$T1 # h1*r2
- vpaddq $T0,$D4,$D4 # d4 += h2*r2
- vpaddq $T1,$D3,$D3 # d3 += h1*r2
- vmovdqa -0x40(%r11),$T2 # r3^4
- vpmuludq $H0,$T3,$T3 # h0*r2
- vpmuludq $H4,$T4,$T0 # h4*s2
- vpaddq $T3,$D2,$D2 # d2 += h0*r2
- vpaddq $T0,$D1,$D1 # d1 += h4*s2
- vmovdqa -0x30(%r11),$T3 # s3^4
- vpmuludq $H3,$T4,$T4 # h3*s2
- vpmuludq $H1,$T2,$T1 # h1*r3
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa -0x10(%r11),$T4 # s4^4
- vpaddq $T1,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T2,$T2 # h0*r3
- vpmuludq $H4,$T3,$T0 # h4*s3
- vpaddq $T2,$D3,$D3 # d3 += h0*r3
- vpaddq $T0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*2($inp),$T0 # load input
- vpmuludq $H3,$T3,$T2 # h3*s3
- vpmuludq $H2,$T3,$T3 # h2*s3
- vpaddq $T2,$D1,$D1 # d1 += h3*s3
- vmovdqu 16*3($inp),$T1 #
- vpaddq $T3,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H2,$T4,$H2 # h2*s4
- vpmuludq $H3,$T4,$H3 # h3*s4
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $H2,$D1,$D1 # d1 += h2*s4
- vpmuludq $H4,$T4,$H4 # h4*s4
- vpsrldq \$6,$T1,$T3 #
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
- vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
- vpmuludq $H1,$T4,$H0
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- #vpsrlq \$40,$T4,$T4 # 4
- vpsrldq \$`40/8`,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpand 0(%rcx),$T4,$T4 # .Lmask24
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D0
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D0,$H0,$H0
- vpsllq \$2,$D0,$D0
- vpaddq $D0,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
- add \$32,$len
- jnz .Long_tail_avx
-
- vpaddq $H2,$T2,$T2
- vpaddq $H0,$T0,$T0
- vpaddq $H1,$T1,$T1
- vpaddq $H3,$T3,$T3
- vpaddq $H4,$T4,$T4
-
-.Long_tail_avx:
- vmovdqa $H2,0x20(%r11)
- vmovdqa $H0,0x00(%r11)
- vmovdqa $H1,0x10(%r11)
- vmovdqa $H3,0x30(%r11)
- vmovdqa $H4,0x40(%r11)
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $T2,$D4,$D2 # d2 = h2*r0
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vpmuludq $T3,$H2,$H0 # h3*r1
- vpaddq $H0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
- vpmuludq $T2,$H2,$H1 # h2*r1
- vpaddq $H1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
- vpmuludq $T1,$H2,$H0 # h1*r1
- vpaddq $H0,$D2,$D2 # d2 += h1*r1
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
- vpmuludq $T4,$H3,$H3 # h4*s1
- vpaddq $H3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
- vpmuludq $T2,$H4,$H1 # h2*r2
- vpaddq $H1,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H4,$H0 # h1*r2
- vpaddq $H0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
- vpmuludq $T0,$H4,$H4 # h0*r2
- vpaddq $H4,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H2,$H1 # h4*s2
- vpaddq $H1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
- vpmuludq $T3,$H2,$H2 # h3*s2
- vpaddq $H2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $T1,$H3,$H0 # h1*r3
- vpaddq $H0,$D4,$D4 # d4 += h1*r3
- vpmuludq $T0,$H3,$H3 # h0*r3
- vpaddq $H3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
- vpmuludq $T4,$H4,$H1 # h4*s3
- vpaddq $H1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
- vpmuludq $T3,$H4,$H0 # h3*s3
- vpaddq $H0,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H4,$H4 # h2*s3
- vpaddq $H4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $T0,$H2,$H2 # h0*r4
- vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
- vpmuludq $T4,$H3,$H1 # h4*s4
- vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
- vpmuludq $T3,$H3,$H0 # h3*s4
- vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
- vpmuludq $T2,$H3,$H1 # h2*s4
- vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
- vpmuludq $T1,$H3,$H3 # h1*s4
- vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
-
- jz .Lshort_tail_avx
-
- vmovdqu 16*0($inp),$H0 # load input
- vmovdqu 16*1($inp),$H1
-
- vpsrldq \$6,$H0,$H2 # splat input
- vpsrldq \$6,$H1,$H3
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- vpsrlq \$40,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
- vpaddq 0x00(%r11),$H0,$H0
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- ################################################################
- # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpaddq $T0,$D0,$D0 # d0 += h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T1,$D1,$D1 # d1 += h1*r0
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpaddq $T0,$D2,$D2 # d2 += h2*r0
- vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T1,$D3,$D3 # d3 += h3*r0
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpaddq $T4,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
- vpmuludq $H1,$T2,$T0 # h1*r1
- vpaddq $T0,$D2,$D2 # d2 += h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
- vpmuludq $H4,$T3,$T3 # h4*s1
- vpaddq $T3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
- vpmuludq $H2,$T4,$T1 # h2*r2
- vpaddq $T1,$D4,$D4 # d4 += h2*r2
- vpmuludq $H1,$T4,$T0 # h1*r2
- vpaddq $T0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
- vpmuludq $H0,$T4,$T4 # h0*r2
- vpaddq $T4,$D2,$D2 # d2 += h0*r2
- vpmuludq $H4,$T2,$T1 # h4*s2
- vpaddq $T1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
- vpmuludq $H3,$T2,$T2 # h3*s2
- vpaddq $T2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $H1,$T3,$T0 # h1*r3
- vpaddq $T0,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T3,$T3 # h0*r3
- vpaddq $T3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
- vpmuludq $H4,$T4,$T1 # h4*s3
- vpaddq $T1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
- vpmuludq $H3,$T4,$T0 # h3*s3
- vpaddq $T0,$D1,$D1 # d1 += h3*s3
- vpmuludq $H2,$T4,$T4 # h2*s3
- vpaddq $T4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H0,$T2,$T2 # h0*r4
- vpaddq $T2,$D4,$D4 # d4 += h0*r4
- vpmuludq $H4,$T3,$T1 # h4*s4
- vpaddq $T1,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$T3,$T0 # h3*s4
- vpaddq $T0,$D2,$D2 # d2 += h3*s4
- vpmuludq $H2,$T3,$T1 # h2*s4
- vpaddq $T1,$D1,$D1 # d1 += h2*s4
- vpmuludq $H1,$T3,$T3 # h1*s4
- vpaddq $T3,$D0,$D0 # d0 += h1*s4
-
-.Lshort_tail_avx:
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D4,$T4
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D2,$T2
- vpaddq $T3,$D3,$D3
- vpaddq $T4,$D4,$D4
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$D2,$D2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D4,$H4
- vpand $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$H1
- vpand $MASK,$D1,$D1
- vpaddq $H1,$D2,$D2 # h1 -> h2
-
- vpaddq $H4,$D0,$D0
- vpsllq \$2,$H4,$H4
- vpaddq $H4,$D0,$D0 # h4 -> h0
-
- vpsrlq \$26,$D2,$H2
- vpand $MASK,$D2,$D2
- vpaddq $H2,$D3,$D3 # h2 -> h3
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $D1,`4*1-48-64`($ctx)
- vmovd $D2,`4*2-48-64`($ctx)
- vmovd $D3,`4*3-48-64`($ctx)
- vmovd $D4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-&end_function("poly1305_blocks_avx");
-
-&declare_function("poly1305_emit_avx", 32, 3);
-$code.=<<___;
- cmpl \$0,20($ctx) # is_base2_26?
- je .Lemit
-
- mov 0($ctx),%eax # load hash value base 2^26
- mov 4($ctx),%ecx
- mov 8($ctx),%r8d
- mov 12($ctx),%r11d
- mov 16($ctx),%r10d
-
- shl \$26,%rcx # base 2^26 -> base 2^64
- mov %r8,%r9
- shl \$52,%r8
- add %rcx,%rax
- shr \$12,%r9
- add %rax,%r8 # h0
- adc \$0,%r9
-
- shl \$14,%r11
- mov %r10,%rax
- shr \$24,%r10
- add %r11,%r9
- shl \$40,%rax
- add %rax,%r9 # h1
- adc \$0,%r10 # h2
-
- mov %r10,%rax # could be partially reduced, so reduce
- mov %r10,%rcx
- and \$3,%r10
- shr \$2,%rax
- and \$-4,%rcx
- add %rcx,%rax
- add %rax,%r8
- adc \$0,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-___
-&end_function("poly1305_emit_avx");
-
-if ($avx>1) {
-
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
- map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-sub poly1305_blocks_avxN {
- my ($avx512) = @_;
- my $suffix = $avx512 ? "_avx512" : "";
-$code.=<<___;
-.cfi_startproc
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx2$suffix
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2$suffix:
- and \$-16,$len
- jz .Lno_data_avx2$suffix
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx2$suffix
-
- test \$63,$len
- jz .Leven_avx2$suffix
-
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lblocks_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$`-1*(1<<31)`,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$`-1*(1<<31)`,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
- adc \$0,$h2
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_26_pre_avx2$suffix
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- test %r15,%r15
- jz .Lstore_base2_26_avx2$suffix
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- jmp .Lproceed_avx2$suffix
-
-.align 32
-.Lstore_base2_64_avx2$suffix:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx2$suffix
-
-.align 16
-.Lstore_base2_26_avx2$suffix:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx2$suffix:
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lno_data_avx2$suffix:
-.Lblocks_avx2_epilogue$suffix:
- RET
-.cfi_endproc
-
-.align 32
-.Lbase2_64_avx2$suffix:
-.cfi_startproc
- push %rbp
-.cfi_push %rbp
- mov %rsp,%rbp
- push %rbx
-.cfi_push %rbx
- push %r12
-.cfi_push %r12
- push %r13
-.cfi_push %r13
- push %r14
-.cfi_push %r14
- push %r15
-.cfi_push %r15
-.Lbase2_64_avx2_body$suffix:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$63,$len
- jz .Linit_avx2$suffix
-
-.Lbase2_64_pre_avx2$suffix:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_64_pre_avx2$suffix
-
-.Linit_avx2$suffix:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx2$suffix:
- mov %r15,$len # restore $len
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
- mov \$`(1<<31|1<<30|1<<16)`,%r11d
-___
-$code.=<<___;
- pop %r15
-.cfi_restore %r15
- pop %r14
-.cfi_restore %r14
- pop %r13
-.cfi_restore %r13
- pop %r12
-.cfi_restore %r12
- pop %rbx
-.cfi_restore %rbx
- pop %rbp
-.cfi_restore %rbp
-.Lbase2_64_avx2_epilogue$suffix:
- jmp .Ldo_avx2$suffix
-.cfi_endproc
-
-.align 32
-.Leven_avx2$suffix:
-.cfi_startproc
-___
-$code.=<<___ if (!$kernel);
- mov OPENSSL_ia32cap_P+8(%rip),%r9d
-___
-$code.=<<___;
- vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
- vmovd 4*1($ctx),%x#$H1
- vmovd 4*2($ctx),%x#$H2
- vmovd 4*3($ctx),%x#$H3
- vmovd 4*4($ctx),%x#$H4
-
-.Ldo_avx2$suffix:
-___
-$code.=<<___ if (!$kernel && $avx>2);
- cmp \$512,$len
- jb .Lskip_avx512
- and %r11d,%r9d
- test \$`1<<16`,%r9d # check for AVX512F
- jnz .Lblocks_avx512
-.Lskip_avx512$suffix:
-___
-$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
- cmp \$512,$len
- jae .Lblocks_avx512
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx2_body$suffix:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*0-64`($ctx),%x#$T2
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$T3
- vmovdqu `16*2-64`($ctx),%x#$T4
- vmovdqu `16*3-64`($ctx),%x#$D0
- vmovdqu `16*4-64`($ctx),%x#$D1
- vmovdqu `16*5-64`($ctx),%x#$D2
- lea 0x90(%rsp),%rax # size optimization
- vmovdqu `16*6-64`($ctx),%x#$D3
- vpermd $T2,$T0,$T2 # 00003412 -> 14243444
- vmovdqu `16*7-64`($ctx),%x#$D4
- vpermd $T3,$T0,$T3
- vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermd $T4,$T0,$T4
- vmovdqa $T2,0x00(%rsp)
- vpermd $D0,$T0,$D0
- vmovdqa $T3,0x20-0x90(%rax)
- vpermd $D1,$T0,$D1
- vmovdqa $T4,0x40-0x90(%rax)
- vpermd $D2,$T0,$D2
- vmovdqa $D0,0x60-0x90(%rax)
- vpermd $D3,$T0,$D3
- vmovdqa $D1,0x80-0x90(%rax)
- vpermd $D4,$T0,$D4
- vmovdqa $D2,0xa0-0x90(%rax)
- vpermd $MASK,$T0,$MASK
- vmovdqa $D3,0xc0-0x90(%rax)
- vmovdqa $D4,0xe0-0x90(%rax)
- vmovdqa $MASK,0x100-0x90(%rax)
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- ################################################################
- # load input
- vmovdqu 16*0($inp),%x#$T0
- vmovdqu 16*1($inp),%x#$T1
- vinserti128 \$1,16*2($inp),$T0,$T0
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
-
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$64,$len
- jz .Ltail_avx2$suffix
- jmp .Loop_avx2$suffix
-
-.align 32
-.Loop_avx2$suffix:
- ################################################################
- # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
- # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
- # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
- # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
- # \________/\__________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqa `32*0`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqa `32*1`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqa `32*3`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
- vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # however, as h2 is "chronologically" first one available pull
- # corresponding operations up, so it's
- #
- # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
- # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
- # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
- vmovdqa `32*4-0x90`(%rax),$T1 # s2
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vmovdqu 16*0($inp),%x#$T0 # load input
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
- vinserti128 \$1,16*2($inp),$T0,$T0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vmovdqu 16*1($inp),%x#$T1
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqa `32*5-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpsrldq \$6,$T1,$T3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
- vpunpckhqdq $T1,$T0,$T4 # 4
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # lazy reduction (interleaved with tail of input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$4,$T3,$T2
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpand $MASK,$T2,$T2 # 2
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- sub \$64,$len
- jnz .Loop_avx2$suffix
-
- .byte 0x66,0x90
-.Ltail_avx2$suffix:
- ################################################################
- # while above multiplications were by r^4 in all lanes, in last
- # iteration we multiply least significant lane by r^4 and most
- # significant one by r, so copy of above except that references
- # to the precomputed table are displaced by 4...
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqu `32*0+4`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqu `32*1+4`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqu `32*3+4`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
- vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpsrldq \$8,$H0,$T0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
-
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$D1,$T1
- vpermq \$0x2,$H2,$T2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa -0xb0(%r10),%xmm6
- vmovdqa -0xa0(%r10),%xmm7
- vmovdqa -0x90(%r10),%xmm8
- vmovdqa -0x80(%r10),%xmm9
- vmovdqa -0x70(%r10),%xmm10
- vmovdqa -0x60(%r10),%xmm11
- vmovdqa -0x50(%r10),%xmm12
- vmovdqa -0x40(%r10),%xmm13
- vmovdqa -0x30(%r10),%xmm14
- vmovdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx2_epilogue$suffix:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- vzeroupper
- RET
-.cfi_endproc
-___
-if($avx > 2 && $avx512) {
-my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
-my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
-my $PADBIT="%zmm30";
-
-map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
-map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
-map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
-map(s/%y/%z/,($MASK));
-
-$code.=<<___;
-.cfi_startproc
-.Lblocks_avx512:
- mov \$15,%eax
- kmovw %eax,%k2
-___
-$code.=<<___ if (!$win64);
- lea 8(%rsp),%r10
-.cfi_def_cfa_register %r10
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea 8(%rsp),%r10
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,-0xb0(%r10)
- vmovdqa %xmm7,-0xa0(%r10)
- vmovdqa %xmm8,-0x90(%r10)
- vmovdqa %xmm9,-0x80(%r10)
- vmovdqa %xmm10,-0x70(%r10)
- vmovdqa %xmm11,-0x60(%r10)
- vmovdqa %xmm12,-0x50(%r10)
- vmovdqa %xmm13,-0x40(%r10)
- vmovdqa %xmm14,-0x30(%r10)
- vmovdqa %xmm15,-0x20(%r10)
-.Ldo_avx512_body:
-___
-$code.=<<___;
- lea .Lconst(%rip),%rcx
- lea 48+64($ctx),$ctx # size optimization
- vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
-
- # expand pre-calculated table
- vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
- mov \$0x20,%rax
- vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
- vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
- vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
- vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
- vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
- vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
- vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
- vpermd $D0,$T2,$R0 # 00003412 -> 14243444
- vpbroadcastq 64(%rcx),$MASK # .Lmask26
- vpermd $D1,$T2,$R1
- vpermd $T0,$T2,$S1
- vpermd $D2,$T2,$R2
- vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
- vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
- vpermd $T1,$T2,$S2
- vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
- vpsrlq \$32,$R1,$T1
- vpermd $D3,$T2,$R3
- vmovdqa64 $S1,0x40(%rsp){%k2}
- vpermd $T3,$T2,$S3
- vpermd $D4,$T2,$R4
- vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
- vpermd $T4,$T2,$S4
- vmovdqa64 $S2,0x80(%rsp){%k2}
- vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
- vmovdqa64 $S3,0xc0(%rsp){%k2}
- vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
- vmovdqa64 $S4,0x100(%rsp){%k2}
-
- ################################################################
- # calculate 5th through 8th powers of the key
- #
- # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
- # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
- # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
- # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
- # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
-
- vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
- vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
- vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
- vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
- vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
- vpsrlq \$32,$R2,$T2
-
- vpmuludq $T1,$S4,$M0
- vpmuludq $T1,$R0,$M1
- vpmuludq $T1,$R1,$M2
- vpmuludq $T1,$R2,$M3
- vpmuludq $T1,$R3,$M4
- vpsrlq \$32,$R3,$T3
- vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
- vpaddq $M1,$D1,$D1 # d1 += r1'*r0
- vpaddq $M2,$D2,$D2 # d2 += r1'*r1
- vpaddq $M3,$D3,$D3 # d3 += r1'*r2
- vpaddq $M4,$D4,$D4 # d4 += r1'*r3
-
- vpmuludq $T2,$S3,$M0
- vpmuludq $T2,$S4,$M1
- vpmuludq $T2,$R1,$M3
- vpmuludq $T2,$R2,$M4
- vpmuludq $T2,$R0,$M2
- vpsrlq \$32,$R4,$T4
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
- vpaddq $M3,$D3,$D3 # d3 += r2'*r1
- vpaddq $M4,$D4,$D4 # d4 += r2'*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*r0
-
- vpmuludq $T3,$S2,$M0
- vpmuludq $T3,$R0,$M3
- vpmuludq $T3,$R1,$M4
- vpmuludq $T3,$S3,$M1
- vpmuludq $T3,$S4,$M2
- vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
- vpaddq $M3,$D3,$D3 # d3 += r3'*r0
- vpaddq $M4,$D4,$D4 # d4 += r3'*r1
- vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
- vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
-
- vpmuludq $T4,$S4,$M3
- vpmuludq $T4,$R0,$M4
- vpmuludq $T4,$S1,$M0
- vpmuludq $T4,$S2,$M1
- vpmuludq $T4,$S3,$M2
- vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
- vpaddq $M4,$D4,$D4 # d4 += r2'*r0
- vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
- vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
- vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
-
- ################################################################
- # load input
- vmovdqu64 16*0($inp),%z#$T3
- vmovdqu64 16*4($inp),%z#$T4
- lea 16*8($inp),$inp
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D4,$M4
- vpandq $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$M1
- vpandq $MASK,$D1,$D1
- vpaddq $M1,$D2,$D2 # d1 -> d2
-
- vpaddq $M4,$D0,$D0
- vpsllq \$2,$M4,$M4
- vpaddq $M4,$D0,$D0 # d4 -> d0
-
- vpsrlq \$26,$D2,$M2
- vpandq $MASK,$D2,$D2
- vpaddq $M2,$D3,$D3 # d2 -> d3
-
- vpsrlq \$26,$D0,$M0
- vpandq $MASK,$D0,$D0
- vpaddq $M0,$D1,$D1 # d0 -> d1
-
- vpsrlq \$26,$D3,$M3
- vpandq $MASK,$D3,$D3
- vpaddq $M3,$D4,$D4 # d3 -> d4
-
- ################################################################
- # at this point we have 14243444 in $R0-$S4 and 05060708 in
- # $D0-$D4, ...
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- # ... since input 64-bit lanes are ordered as 73625140, we could
- # "vperm" it to 76543210 (here and in each loop iteration), *or*
- # we could just flow along, hence the goal for $R0-$S4 is
- # 1858286838784888 ...
-
- vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
- mov \$0x7777,%eax
- kmovw %eax,%k1
-
- vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
- vpermd $R1,$M0,$R1
- vpermd $R2,$M0,$R2
- vpermd $R3,$M0,$R3
- vpermd $R4,$M0,$R4
-
- vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
- vpermd $D1,$M0,${R1}{%k1}
- vpermd $D2,$M0,${R2}{%k1}
- vpermd $D3,$M0,${R3}{%k1}
- vpermd $D4,$M0,${R4}{%k1}
-
- vpslld \$2,$R1,$S1 # *5
- vpslld \$2,$R2,$S2
- vpslld \$2,$R3,$S3
- vpslld \$2,$R4,$S4
- vpaddd $R1,$S1,$S1
- vpaddd $R2,$S2,$S2
- vpaddd $R3,$S3,$S3
- vpaddd $R4,$S4,$S4
-
- vpbroadcastq 32(%rcx),$PADBIT # .L129
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
- vporq $T3,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$14,$T4,$T3
- vpsrlq \$40,$T4,$T4 # 4
- vpandq $MASK,$T2,$T2 # 2
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$192,$len
- jbe .Ltail_avx512
- jmp .Loop_avx512
-
-.align 32
-.Loop_avx512:
- ################################################################
- # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
- # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
- # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
- # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
- # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
- # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
- # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
- # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
- # \________/\___________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # however, as h2 is "chronologically" first one available pull
- # corresponding operations up, so it's
- #
- # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
- # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
- # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
- # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpaddq $H0,$T0,$H0
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu64 16*0($inp),$T3 # load input
- vmovdqu64 16*4($inp),$T4
- lea 16*8($inp),$inp
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vpunpcklqdq $T4,$T3,$T0 # transpose input
- vpunpckhqdq $T4,$T3,$T4
-
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpaddq $M3,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$S4,$M2
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
- vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
- vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
-
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$52,$T0,$T2 # splat input
- vpsllq \$12,$T4,$T3
-
- vpsrlq \$26,$D3,$H3
- vpandq $MASK,$D3,$D3
- vpaddq $H3,$D4,$H4 # h3 -> h4
-
- vporq $T3,$T2,$T2
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpandq $MASK,$T2,$T2 # 2
-
- vpsrlq \$26,$H4,$D4
- vpandq $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpandq $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpandq $MASK,$H2,$H2
- vpaddq $D2,$D3,$H3 # h2 -> h3
-
- vpsrlq \$14,$T4,$T3
-
- vpsrlq \$26,$H0,$D0
- vpandq $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpandq $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpandq $MASK,$T0,$T0 # 0
- #vpandq $MASK,$T1,$T1 # 1
- #vpandq $MASK,$T3,$T3 # 3
- #vporq $PADBIT,$T4,$T4 # padbit, yes, always
-
- sub \$128,$len
- ja .Loop_avx512
-
-.Ltail_avx512:
- ################################################################
- # while above multiplications were by r^8 in all lanes, in last
- # iteration we multiply least significant lane by r^8 and most
- # significant one by r, that's why table gets shifted...
-
- vpsrlq \$32,$R0,$R0 # 0105020603070408
- vpsrlq \$32,$R1,$R1
- vpsrlq \$32,$R2,$R2
- vpsrlq \$32,$S3,$S3
- vpsrlq \$32,$S4,$S4
- vpsrlq \$32,$R3,$R3
- vpsrlq \$32,$R4,$R4
- vpsrlq \$32,$S1,$S1
- vpsrlq \$32,$S2,$S2
-
- ################################################################
- # load either next or last 64 byte of input
- lea ($inp,$len),$inp
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
-
- vpmuludq $H2,$R1,$D3 # d3 = h2*r1
- vpmuludq $H2,$R2,$D4 # d4 = h2*r2
- vpmuludq $H2,$S3,$D0 # d0 = h2*s3
- vpandq $MASK,$T1,$T1 # 1
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
- vpandq $MASK,$T3,$T3 # 3
- vpmuludq $H2,$R0,$D2 # d2 = h2*r0
- vporq $PADBIT,$T4,$T4 # padbit, yes, always
- vpaddq $H1,$T1,$H1 # accumulate input
- vpaddq $H3,$T3,$H3
- vpaddq $H4,$T4,$H4
-
- vmovdqu 16*0($inp),%x#$T0
- vpmuludq $H0,$R3,$M3
- vpmuludq $H0,$R4,$M4
- vpmuludq $H0,$R0,$M0
- vpmuludq $H0,$R1,$M1
- vpaddq $M3,$D3,$D3 # d3 += h0*r3
- vpaddq $M4,$D4,$D4 # d4 += h0*r4
- vpaddq $M0,$D0,$D0 # d0 += h0*r0
- vpaddq $M1,$D1,$D1 # d1 += h0*r1
-
- vmovdqu 16*1($inp),%x#$T1
- vpmuludq $H1,$R2,$M3
- vpmuludq $H1,$R3,$M4
- vpmuludq $H1,$S4,$M0
- vpmuludq $H0,$R2,$M2
- vpaddq $M3,$D3,$D3 # d3 += h1*r2
- vpaddq $M4,$D4,$D4 # d4 += h1*r3
- vpaddq $M0,$D0,$D0 # d0 += h1*s4
- vpaddq $M2,$D2,$D2 # d2 += h0*r2
-
- vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
- vpmuludq $H3,$R0,$M3
- vpmuludq $H3,$R1,$M4
- vpmuludq $H1,$R0,$M1
- vpmuludq $H1,$R1,$M2
- vpaddq $M3,$D3,$D3 # d3 += h3*r0
- vpaddq $M4,$D4,$D4 # d4 += h3*r1
- vpaddq $M1,$D1,$D1 # d1 += h1*r0
- vpaddq $M2,$D2,$D2 # d2 += h1*r1
-
- vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
- vpmuludq $H4,$S4,$M3
- vpmuludq $H4,$R0,$M4
- vpmuludq $H3,$S2,$M0
- vpmuludq $H3,$S3,$M1
- vpmuludq $H3,$S4,$M2
- vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
- vpaddq $M4,$D4,$D4 # d4 += h4*r0
- vpaddq $M0,$D0,$D0 # d0 += h3*s2
- vpaddq $M1,$D1,$D1 # d1 += h3*s3
- vpaddq $M2,$D2,$D2 # d2 += h3*s4
-
- vpmuludq $H4,$S1,$M0
- vpmuludq $H4,$S2,$M1
- vpmuludq $H4,$S3,$M2
- vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
- vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
- vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- vpermq \$0xb1,$H3,$D3
- vpermq \$0xb1,$D4,$H4
- vpermq \$0xb1,$H0,$D0
- vpermq \$0xb1,$H1,$D1
- vpermq \$0xb1,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- kmovw %eax,%k3
- vpermq \$0x2,$H3,$D3
- vpermq \$0x2,$H4,$D4
- vpermq \$0x2,$H0,$D0
- vpermq \$0x2,$H1,$D1
- vpermq \$0x2,$H2,$D2
- vpaddq $D3,$H3,$H3
- vpaddq $D4,$H4,$H4
- vpaddq $D0,$H0,$H0
- vpaddq $D1,$H1,$H1
- vpaddq $D2,$H2,$H2
-
- vextracti64x4 \$0x1,$H3,%y#$D3
- vextracti64x4 \$0x1,$H4,%y#$D4
- vextracti64x4 \$0x1,$H0,%y#$D0
- vextracti64x4 \$0x1,$H1,%y#$D1
- vextracti64x4 \$0x1,$H2,%y#$D2
- vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
- vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
- vpaddq $D0,$H0,${H0}{%k3}{z}
- vpaddq $D1,$H1,${H1}{%k3}{z}
- vpaddq $D2,$H2,${H2}{%k3}{z}
-___
-map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
-map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
-$code.=<<___;
- ################################################################
- # lazy reduction (interleaved with input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
- vpand $MASK,$T1,$T1 # 1
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
- add \$64,$len
- jnz .Ltail_avx2$suffix
-
- vpsubq $T2,$H2,$H2 # undo input accumulation
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
- vzeroall
-___
-$code.=<<___ if ($win64);
- movdqa -0xb0(%r10),%xmm6
- movdqa -0xa0(%r10),%xmm7
- movdqa -0x90(%r10),%xmm8
- movdqa -0x80(%r10),%xmm9
- movdqa -0x70(%r10),%xmm10
- movdqa -0x60(%r10),%xmm11
- movdqa -0x50(%r10),%xmm12
- movdqa -0x40(%r10),%xmm13
- movdqa -0x30(%r10),%xmm14
- movdqa -0x20(%r10),%xmm15
- lea -8(%r10),%rsp
-.Ldo_avx512_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea -8(%r10),%rsp
-.cfi_def_cfa_register %rsp
-___
-$code.=<<___;
- RET
-.cfi_endproc
-___
-
-}
-
-}
-
-&declare_function("poly1305_blocks_avx2", 32, 4);
-poly1305_blocks_avxN(0);
-&end_function("poly1305_blocks_avx2");
-
-#######################################################################
-if ($avx>2) {
-# On entry we have input length divisible by 64. But since inner loop
-# processes 128 bytes per iteration, cases when length is not divisible
-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
-# for this tail, we wouldn't have to even allocate stack frame...
-
-if($kernel) {
- $code .= "#ifdef CONFIG_AS_AVX512\n";
-}
-
-&declare_function("poly1305_blocks_avx512", 32, 4);
-poly1305_blocks_avxN(1);
-&end_function("poly1305_blocks_avx512");
-
-if ($kernel) {
- $code .= "#endif\n";
-}
-
-if (!$kernel && $avx>3) {
-########################################################################
-# VPMADD52 version using 2^44 radix.
-#
-# One can argue that base 2^52 would be more natural. Well, even though
-# some operations would be more natural, one has to recognize couple of
-# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
-# at amount of multiply-n-accumulate operations. Secondly, it makes it
-# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
-# reference implementations], which means that more such operations
-# would have to be performed in inner loop, which in turn makes critical
-# path longer. In other words, even though base 2^44 reduction might
-# look less elegant, overall critical path is actually shorter...
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int64 h[3]; # current hash value base 2^44
-# unsigned __int64 s[2]; # key value*20 base 2^44
-# unsigned __int64 r[3]; # key value base 2^44
-# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
-# # r^n positions reflect
-# # placement in register, not
-# # memory, R[3] is R[1]*20
-
-$code.=<<___;
-.type poly1305_init_base2_44,\@function,3
-.align 32
-poly1305_init_base2_44:
- xor %eax,%eax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
-.Linit_base2_44:
- lea poly1305_blocks_vpmadd52(%rip),%r10
- lea poly1305_emit_base2_44(%rip),%r11
-
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- mov \$0x00000fffffffffff,%r8
- and 8($inp),%rcx
- mov \$0x00000fffffffffff,%r9
- and %rax,%r8
- shrd \$44,%rcx,%rax
- mov %r8,40($ctx) # r0
- and %r9,%rax
- shr \$24,%rcx
- mov %rax,48($ctx) # r1
- lea (%rax,%rax,4),%rax # *5
- mov %rcx,56($ctx) # r2
- shl \$2,%rax # magic <<2
- lea (%rcx,%rcx,4),%rcx # *5
- shl \$2,%rcx # magic <<2
- mov %rax,24($ctx) # s1
- mov %rcx,32($ctx) # s2
- movq \$-1,64($ctx) # write impossible value
-___
-$code.=<<___ if ($flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if ($flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
- RET
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-___
-{
-my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
-my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
-my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52,\@function,4
-.align 32
-poly1305_blocks_vpmadd52:
- shr \$4,$len
- jz .Lno_data_vpmadd52 # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- # if powers of the key are not calculated yet, process up to 3
- # blocks with this single-block subroutine, otherwise ensure that
- # length is divisible by 2 blocks and pass the rest down to next
- # subroutine...
-
- mov \$3,%rax
- mov \$1,%r10
- cmp \$4,$len # is input long
- cmovae %r10,%rax
- test %r8,%r8 # is power value impossible?
- cmovns %r10,%rax
-
- and $len,%rax # is input of favourable length?
- jz .Lblocks_vpmadd52_4x
-
- sub %rax,$len
- mov \$7,%r10d
- mov \$1,%r11d
- kmovw %r10d,%k7
- lea .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq $padbit,%x#$PAD
- vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
- vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
- vpermq \$0xcf,$PAD,$PAD
- vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
-
- vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
- vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
- vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
- vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
-
- vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
- vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0($inp),%x#$T0 # load input as ----3210
- lea 16($inp),$inp
-
- vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
- vpsrlvq $inp_shift,$T0,$T0
- vpandq $reduc_mask,$T0,$T0
- vporq $PAD,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo # accumulate input
-
- vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
- vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
- vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
-
- vpxord $Dlo,$Dlo,$Dlo
- vpxord $Dhi,$Dhi,$Dhi
-
- vpmadd52luq $r2r1r0,$H0,$Dlo
- vpmadd52huq $r2r1r0,$H0,$Dhi
-
- vpmadd52luq $r1r0s2,$H1,$Dlo
- vpmadd52huq $r1r0s2,$H1,$Dhi
-
- vpmadd52luq $r0s2s1,$H2,$Dlo
- vpmadd52huq $r0s2s1,$H2,$Dhi
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
- vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpaddq $T0,$Dhi,$Dhi
-
- vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
-
- vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
-
- vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
- vpandq $reduc_mask,$Dlo,$Dlo
-
- vpermq \$0b10010011,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
-
- vpaddq $T0,$Dlo,$Dlo
- vpsllq \$2,$T0,$T0
-
- vpaddq $T0,$Dlo,$Dlo
-
- dec %rax # len-=16
- jnz .Loop_vpmadd52
-
- vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
-
- test $len,$len
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- RET
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-___
-}
-{
-########################################################################
-# As implied by its name 4x subroutine processes 4 blocks in parallel
-# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
-# and is handled in 256-bit %ymm registers.
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_4x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_4x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq $padbit,$PAD
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- mov \$5,%eax
- vmovdqa64 .Lx_mask42(%rip),$mask42
- kmovw %eax,%k1 # used in 2x path
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Lblocks_vpmadd52_2x_do
-
-.Lblocks_vpmadd52_4x_do:
- vpbroadcastq 64($ctx),$R0 # load 4th power of the key
- vpbroadcastq 96($ctx),$R1
- vpbroadcastq 128($ctx),$R2
- vpbroadcastq 160($ctx),$S1
-
-.Lblocks_vpmadd52_4x_key_loaded:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- test \$7,$len # is len 8*n?
- jz .Lblocks_vpmadd52_8x
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 3-1-2-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$4,$len
- jz .Ltail_vpmadd52_4x
- jmp .Loop_vpmadd52_4x
- ud2
-
-.align 32
-.Linit_vpmadd52:
- vmovq 24($ctx),%x#$S1 # load key
- vmovq 56($ctx),%x#$H2
- vmovq 32($ctx),%x#$S2
- vmovq 40($ctx),%x#$R0
- vmovq 48($ctx),%x#$R1
-
- vmovdqa $R0,$H0
- vmovdqa $R1,$H1
- vmovdqa $H2,$R2
-
- mov \$2,%eax
-
-.Lmul_init_vpmadd52:
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- dec %eax
- jz .Ldone_init_vpmadd52
-
- vpunpcklqdq $R1,$H1,$R1 # 1,2
- vpbroadcastq %x#$H1,%x#$H1 # 2,2
- vpunpcklqdq $R2,$H2,$R2
- vpbroadcastq %x#$H2,%x#$H2
- vpunpcklqdq $R0,$H0,$R0
- vpbroadcastq %x#$H0,%x#$H0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R1,$S1,$S1
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S1,$S1
- vpsllq \$2,$S2,$S2
-
- jmp .Lmul_init_vpmadd52
- ud2
-
-.align 32
-.Ldone_init_vpmadd52:
- vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
- vinserti128 \$1,%x#$R2,$H2,$R2
- vinserti128 \$1,%x#$R0,$H0,$R0
-
- vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
- vpermq \$0b11011000,$R2,$R2
- vpermq \$0b11011000,$R0,$R0
-
- vpsllq \$2,$R1,$S1 # S1 = R1*5*4
- vpaddq $R1,$S1,$S1
- vpsllq \$2,$S1,$S1
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
- test \$3,$len # is length 4*n+2?
- jnz .Ldone_init_vpmadd52_2x
-
- vmovdqu64 $R0,64($ctx) # save key powers
- vpbroadcastq %x#$R0,$R0 # broadcast 4th power
- vmovdqu64 $R1,96($ctx)
- vpbroadcastq %x#$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpbroadcastq %x#$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpbroadcastq %x#$S1,$S1
-
- jmp .Lblocks_vpmadd52_4x_key_loaded
- ud2
-
-.align 32
-.Ldone_init_vpmadd52_2x:
- vmovdqu64 $R0,64($ctx) # save key powers
- vpsrldq \$8,$R0,$R0 # 0-1-0-2
- vmovdqu64 $R1,96($ctx)
- vpsrldq \$8,$R1,$R1
- vmovdqu64 $R2,128($ctx)
- vpsrldq \$8,$R2,$R2
- vmovdqu64 $S1,160($ctx)
- vpsrldq \$8,$S1,$S1
- jmp .Lblocks_vpmadd52_2x_key_loaded
- ud2
-
-.align 32
-.Lblocks_vpmadd52_2x_do:
- vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
- vmovdqu64 160+8($ctx),${S1}{%k1}{z}
- vmovdqu64 64+8($ctx),${R0}{%k1}{z}
- vmovdqu64 96+8($ctx),${R1}{%k1}{z}
-
-.Lblocks_vpmadd52_2x_key_loaded:
- vmovdqu64 16*0($inp),$T2 # load data
- vpxorq $T3,$T3,$T3
- lea 16*2($inp),$inp
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as x-1-x-0
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- jmp .Ltail_vpmadd52_2x
- ud2
-
-.align 32
-.Loop_vpmadd52_4x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*2($inp),$T3
- lea 16*4($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$4,$len # len-=64
- jnz .Loop_vpmadd52_4x
-
-.Ltail_vpmadd52_4x:
- vmovdqu64 128($ctx),$R2 # load all key powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
-.Ltail_vpmadd52_2x:
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
- # at this point $len is
- # either 4*n+2 or 0...
- sub \$2,$len # len-=32
- ja .Lblocks_vpmadd52_4x_do
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_4x:
- RET
-.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
-___
-}
-{
-########################################################################
-# As implied by its name 8x subroutine processes 8 blocks in parallel...
-# This is intermediate version, as it's used only in cases when input
-# length is either 8*n, 8*n+1 or 8*n+2...
-
-my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
-my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
-my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
-my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
-
-$code.=<<___;
-.type poly1305_blocks_vpmadd52_8x,\@function,4
-.align 32
-poly1305_blocks_vpmadd52_8x:
- shr \$4,$len
- jz .Lno_data_vpmadd52_8x # too short
-
- shl \$40,$padbit
- mov 64($ctx),%r8 # peek on power of the key
-
- vmovdqa64 .Lx_mask44(%rip),$mask44
- vmovdqa64 .Lx_mask42(%rip),$mask42
-
- test %r8,%r8 # is power value impossible?
- js .Linit_vpmadd52 # if it is, then init R[4]
-
- vmovq 0($ctx),%x#$H0 # load current hash value
- vmovq 8($ctx),%x#$H1
- vmovq 16($ctx),%x#$H2
-
-.Lblocks_vpmadd52_8x:
- ################################################################
- # fist we calculate more key powers
-
- vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
- vmovdqu64 160($ctx),$S1
- vmovdqu64 64($ctx),$R0
- vmovdqu64 96($ctx),$R1
-
- vpsllq \$2,$R2,$S2 # S2 = R2*5*4
- vpaddq $R2,$S2,$S2
- vpsllq \$2,$S2,$S2
-
- vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
- vpbroadcastq %x#$R0,$RR0
- vpbroadcastq %x#$R1,$RR1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $RR2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $RR2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $RR2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $RR2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $RR2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $RR2,$R0,$D2hi
-
- vpmadd52luq $RR0,$R0,$D0lo
- vpmadd52huq $RR0,$R0,$D0hi
- vpmadd52luq $RR0,$R1,$D1lo
- vpmadd52huq $RR0,$R1,$D1hi
- vpmadd52luq $RR0,$R2,$D2lo
- vpmadd52huq $RR0,$R2,$D2hi
-
- vpmadd52luq $RR1,$S2,$D0lo
- vpmadd52huq $RR1,$S2,$D0hi
- vpmadd52luq $RR1,$R0,$D1lo
- vpmadd52huq $RR1,$R0,$D1hi
- vpmadd52luq $RR1,$R1,$D2lo
- vpmadd52huq $RR1,$R1,$D2hi
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$RR0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$RR1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$RR2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$RR0,$RR0
-
- vpsrlq \$44,$RR0,$tmp # additional step
- vpandq $mask44,$RR0,$RR0
-
- vpaddq $tmp,$RR1,$RR1
-
- ################################################################
- # At this point Rx holds 1324 powers, RRx - 5768, and the goal
- # is 15263748, which reflects how data is loaded...
-
- vpunpcklqdq $R2,$RR2,$T2 # 3748
- vpunpckhqdq $R2,$RR2,$R2 # 1526
- vpunpcklqdq $R0,$RR0,$T0
- vpunpckhqdq $R0,$RR0,$R0
- vpunpcklqdq $R1,$RR1,$T1
- vpunpckhqdq $R1,$RR1,$R1
-___
-######## switch to %zmm
-map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
-
-$code.=<<___;
- vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
- vshufi64x2 \$0x44,$R0,$T0,$RR0
- vshufi64x2 \$0x44,$R1,$T1,$RR1
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
-
- vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
- vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
- vpaddq $RR2,$SS2,$SS2
- vpaddq $RR1,$SS1,$SS1
- vpsllq \$2,$SS2,$SS2
- vpsllq \$2,$SS1,$SS1
-
- vpbroadcastq $padbit,$PAD
- vpbroadcastq %x#$mask44,$mask44
- vpbroadcastq %x#$mask42,$mask42
-
- vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
- vpbroadcastq %x#$SS2,$S2
- vpbroadcastq %x#$RR0,$R0
- vpbroadcastq %x#$RR1,$R1
- vpbroadcastq %x#$RR2,$R2
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
-
- # at this point 64-bit lanes are ordered as 73625140
-
- vpsrlq \$24,$T3,$T2 # splat the data
- vporq $PAD,$T2,$T2
- vpaddq $T2,$H2,$H2 # accumulate input
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- sub \$8,$len
- jz .Ltail_vpmadd52_8x
- jmp .Loop_vpmadd52_8x
-
-.align 32
-.Loop_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$S1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$S1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$S2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$S2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$R0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$R0,$D2hi
-
- vmovdqu64 16*0($inp),$T2 # load data
- vmovdqu64 16*4($inp),$T3
- lea 16*8($inp),$inp
- vpmadd52luq $H0,$R0,$D0lo
- vpmadd52huq $H0,$R0,$D0hi
- vpmadd52luq $H0,$R1,$D1lo
- vpmadd52huq $H0,$R1,$D1hi
- vpmadd52luq $H0,$R2,$D2lo
- vpmadd52huq $H0,$R2,$D2hi
-
- vpunpcklqdq $T3,$T2,$T1 # transpose data
- vpunpckhqdq $T3,$T2,$T3
- vpmadd52luq $H1,$S2,$D0lo
- vpmadd52huq $H1,$S2,$D0hi
- vpmadd52luq $H1,$R0,$D1lo
- vpmadd52huq $H1,$R0,$D1hi
- vpmadd52luq $H1,$R1,$D2lo
- vpmadd52huq $H1,$R1,$D2hi
-
- ################################################################
- # partial reduction (interleaved with data splat)
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpsrlq \$24,$T3,$T2
- vporq $PAD,$T2,$T2
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpandq $mask44,$T1,$T0
- vpsrlq \$44,$T1,$T1
- vpsllq \$20,$T3,$T3
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vporq $T3,$T1,$T1
- vpandq $mask44,$T1,$T1
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- sub \$8,$len # len-=128
- jnz .Loop_vpmadd52_8x
-
-.Ltail_vpmadd52_8x:
- #vpaddq $T2,$H2,$H2 # accumulate input
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$H1,$H1
-
- vpxorq $D0lo,$D0lo,$D0lo
- vpmadd52luq $H2,$SS1,$D0lo
- vpxorq $D0hi,$D0hi,$D0hi
- vpmadd52huq $H2,$SS1,$D0hi
- vpxorq $D1lo,$D1lo,$D1lo
- vpmadd52luq $H2,$SS2,$D1lo
- vpxorq $D1hi,$D1hi,$D1hi
- vpmadd52huq $H2,$SS2,$D1hi
- vpxorq $D2lo,$D2lo,$D2lo
- vpmadd52luq $H2,$RR0,$D2lo
- vpxorq $D2hi,$D2hi,$D2hi
- vpmadd52huq $H2,$RR0,$D2hi
-
- vpmadd52luq $H0,$RR0,$D0lo
- vpmadd52huq $H0,$RR0,$D0hi
- vpmadd52luq $H0,$RR1,$D1lo
- vpmadd52huq $H0,$RR1,$D1hi
- vpmadd52luq $H0,$RR2,$D2lo
- vpmadd52huq $H0,$RR2,$D2hi
-
- vpmadd52luq $H1,$SS2,$D0lo
- vpmadd52huq $H1,$SS2,$D0hi
- vpmadd52luq $H1,$RR0,$D1lo
- vpmadd52huq $H1,$RR0,$D1hi
- vpmadd52luq $H1,$RR1,$D2lo
- vpmadd52huq $H1,$RR1,$D2hi
-
- ################################################################
- # horizontal addition
-
- mov \$1,%eax
- kmovw %eax,%k1
- vpsrldq \$8,$D0lo,$T0
- vpsrldq \$8,$D0hi,$H0
- vpsrldq \$8,$D1lo,$T1
- vpsrldq \$8,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpsrldq \$8,$D2lo,$T2
- vpsrldq \$8,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vpermq \$0x2,$D0lo,$T0
- vpermq \$0x2,$D0hi,$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vpermq \$0x2,$D1lo,$T1
- vpermq \$0x2,$D1hi,$H1
- vpaddq $T0,$D0lo,$D0lo
- vpaddq $H0,$D0hi,$D0hi
- vpermq \$0x2,$D2lo,$T2
- vpermq \$0x2,$D2hi,$H2
- vpaddq $T1,$D1lo,$D1lo
- vpaddq $H1,$D1hi,$D1hi
- vextracti64x4 \$1,$D0lo,%y#$T0
- vextracti64x4 \$1,$D0hi,%y#$H0
- vpaddq $T2,$D2lo,$D2lo
- vpaddq $H2,$D2hi,$D2hi
-
- vextracti64x4 \$1,$D1lo,%y#$T1
- vextracti64x4 \$1,$D1hi,%y#$H1
- vextracti64x4 \$1,$D2lo,%y#$T2
- vextracti64x4 \$1,$D2hi,%y#$H2
-___
-######## switch back to %ymm
-map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
-map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
-map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
-
-$code.=<<___;
- vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
- vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
- vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
- vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
- vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
- vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
-
- ################################################################
- # partial reduction
- vpsrlq \$44,$D0lo,$tmp
- vpsllq \$8,$D0hi,$D0hi
- vpandq $mask44,$D0lo,$H0
- vpaddq $tmp,$D0hi,$D0hi
-
- vpaddq $D0hi,$D1lo,$D1lo
-
- vpsrlq \$44,$D1lo,$tmp
- vpsllq \$8,$D1hi,$D1hi
- vpandq $mask44,$D1lo,$H1
- vpaddq $tmp,$D1hi,$D1hi
-
- vpaddq $D1hi,$D2lo,$D2lo
-
- vpsrlq \$42,$D2lo,$tmp
- vpsllq \$10,$D2hi,$D2hi
- vpandq $mask42,$D2lo,$H2
- vpaddq $tmp,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
- vpsllq \$2,$D2hi,$D2hi
-
- vpaddq $D2hi,$H0,$H0
-
- vpsrlq \$44,$H0,$tmp # additional step
- vpandq $mask44,$H0,$H0
-
- vpaddq $tmp,$H1,$H1
-
- ################################################################
-
- vmovq %x#$H0,0($ctx)
- vmovq %x#$H1,8($ctx)
- vmovq %x#$H2,16($ctx)
- vzeroall
-
-.Lno_data_vpmadd52_8x:
- RET
-.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
-___
-}
-$code.=<<___;
-.type poly1305_emit_base2_44,\@function,3
-.align 32
-poly1305_emit_base2_44:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r9,%rax
- shr \$20,%r9
- shl \$44,%rax
- mov %r10,%rcx
- shr \$40,%r10
- shl \$24,%rcx
-
- add %rax,%r8
- adc %rcx,%r9
- adc \$0,%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- RET
-.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
-___
-} } }
-}
-
-if (!$kernel)
-{ # chacha20-poly1305 helpers
-my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-$code.=<<___;
-.globl xor128_encrypt_n_pad
-.type xor128_encrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_encrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_enc
- nop
-.Loop_enc_xmm:
- movdqu ($inp,$otp),%xmm0
- pxor ($otp),%xmm0
- movdqu %xmm0,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_enc_xmm
-
- and \$15,%r10 # len % 16
- jz .Ldone_enc
-
-.Ltail_enc:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
-.Loop_enc_byte:
- mov ($inp,$otp),%al
- xor ($otp),%al
- mov %al,($out,$otp)
- mov %al,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_enc_byte
-
- xor %eax,%eax
-.Loop_enc_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_enc_pad
-
-.Ldone_enc:
- mov $otp,%rax
- RET
-.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
-
-.globl xor128_decrypt_n_pad
-.type xor128_decrypt_n_pad,\@abi-omnipotent
-.align 16
-xor128_decrypt_n_pad:
- sub $otp,$inp
- sub $otp,$out
- mov $len,%r10 # put len aside
- shr \$4,$len # len / 16
- jz .Ltail_dec
- nop
-.Loop_dec_xmm:
- movdqu ($inp,$otp),%xmm0
- movdqa ($otp),%xmm1
- pxor %xmm0,%xmm1
- movdqu %xmm1,($out,$otp)
- movdqa %xmm0,($otp)
- lea 16($otp),$otp
- dec $len
- jnz .Loop_dec_xmm
-
- pxor %xmm1,%xmm1
- and \$15,%r10 # len % 16
- jz .Ldone_dec
-
-.Ltail_dec:
- mov \$16,$len
- sub %r10,$len
- xor %eax,%eax
- xor %r11d,%r11d
-.Loop_dec_byte:
- mov ($inp,$otp),%r11b
- mov ($otp),%al
- xor %r11b,%al
- mov %al,($out,$otp)
- mov %r11b,($otp)
- lea 1($otp),$otp
- dec %r10
- jnz .Loop_dec_byte
-
- xor %eax,%eax
-.Loop_dec_pad:
- mov %al,($otp)
- lea 1($otp),$otp
- dec $len
- jnz .Loop_dec_pad
-
-.Ldone_dec:
- mov $otp,%rax
- RET
-.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
-___
-}
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R14
-
- jmp .Lcommon_seh_tail
-.size se_handler,.-se_handler
-
-.type avx_handler,\@abi-omnipotent
-.align 16
-avx_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- mov 208($context),%rax # pull context->R11
-
- lea 0x50(%rax),%rsi
- lea 0xf8(%rax),%rax
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- RET
-.size avx_handler,.-avx_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_poly1305_init_x86_64
- .rva .LSEH_end_poly1305_init_x86_64
- .rva .LSEH_info_poly1305_init_x86_64
-
- .rva .LSEH_begin_poly1305_blocks_x86_64
- .rva .LSEH_end_poly1305_blocks_x86_64
- .rva .LSEH_info_poly1305_blocks_x86_64
-
- .rva .LSEH_begin_poly1305_emit_x86_64
- .rva .LSEH_end_poly1305_emit_x86_64
- .rva .LSEH_info_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_poly1305_blocks_avx
- .rva .Lbase2_64_avx
- .rva .LSEH_info_poly1305_blocks_avx_1
-
- .rva .Lbase2_64_avx
- .rva .Leven_avx
- .rva .LSEH_info_poly1305_blocks_avx_2
-
- .rva .Leven_avx
- .rva .LSEH_end_poly1305_blocks_avx
- .rva .LSEH_info_poly1305_blocks_avx_3
-
- .rva .LSEH_begin_poly1305_emit_avx
- .rva .LSEH_end_poly1305_emit_avx
- .rva .LSEH_info_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_poly1305_blocks_avx2
- .rva .Lbase2_64_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_1
-
- .rva .Lbase2_64_avx2
- .rva .Leven_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_2
-
- .rva .Leven_avx2
- .rva .LSEH_end_poly1305_blocks_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_3
-___
-$code.=<<___ if ($avx>2);
- .rva .LSEH_begin_poly1305_blocks_avx512
- .rva .LSEH_end_poly1305_blocks_avx512
- .rva .LSEH_info_poly1305_blocks_avx512
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_poly1305_init_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
-
-.LSEH_info_poly1305_blocks_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_body,.Lblocks_epilogue
-
-.LSEH_info_poly1305_emit_x86_64:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
-___
-$code.=<<___ if ($avx);
-.LSEH_info_poly1305_blocks_avx_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_emit_avx:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_poly1305_blocks_avx2_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
-___
-$code.=<<___ if ($avx>2);
-.LSEH_info_poly1305_blocks_avx512:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
-___
-}
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/\/\// and !/^$/);
- print;
-}
-close SELF;
-
-foreach (split('\n',$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- s/%r([a-z]+)#d/%e$1/g;
- s/%r([0-9]+)#d/%r$1d/g;
- s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
-
- if ($kernel) {
- s/(^\.type.*),[0-9]+$/\1/;
- s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
- next if /^\.cfi.*/;
- }
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
deleted file mode 100644
index 08ff4b489f7e..000000000000
--- a/arch/x86/crypto/poly1305_glue.c
+++ /dev/null
@@ -1,290 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR MIT
-/*
- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/hash.h>
-#include <crypto/internal/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/crypto.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-
-asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_BLOCK_SIZE]);
-asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
-asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4]);
-asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
- const u32 padbit);
-asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
- const size_t len, const u32 padbit);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
-
-struct poly1305_arch_internal {
- union {
- struct {
- u32 h[5];
- u32 is_base2_26;
- };
- u64 hs[3];
- };
- u64 r[2];
- u64 pad;
- struct { u32 r2, r1, r4, r3; } rn[9];
-};
-
-/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
- * the unfortunate situation of using AVX and then having to go back to scalar
- * -- because the user is silly and has called the update function from two
- * separate contexts -- then we need to convert back to the original base before
- * proceeding. It is possible to reason that the initial reduction below is
- * sufficient given the implementation invariants. However, for an avoidance of
- * doubt and because this is not performance critical, we do the full reduction
- * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
- */
-static void convert_to_base2_64(void *ctx)
-{
- struct poly1305_arch_internal *state = ctx;
- u32 cy;
-
- if (!state->is_base2_26)
- return;
-
- cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
- cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
- cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
- cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
- state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
- state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
- state->hs[2] = state->h[4] >> 24;
-#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
- cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
- state->hs[2] &= 3;
- state->hs[0] += cy;
- state->hs[1] += (cy = ULT(state->hs[0], cy));
- state->hs[2] += ULT(state->hs[1], cy);
-#undef ULT
- state->is_base2_26 = 0;
-}
-
-static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
-{
- poly1305_init_x86_64(ctx, key);
-}
-
-static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
- const u32 padbit)
-{
- struct poly1305_arch_internal *state = ctx;
-
- /* SIMD disables preemption, so relax after processing each page. */
- BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
- SZ_4K % POLY1305_BLOCK_SIZE);
-
- if (!static_branch_likely(&poly1305_use_avx) ||
- (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
- !crypto_simd_usable()) {
- convert_to_base2_64(ctx);
- poly1305_blocks_x86_64(ctx, inp, len, padbit);
- return;
- }
-
- do {
- const size_t bytes = min_t(size_t, len, SZ_4K);
-
- kernel_fpu_begin();
- if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
- poly1305_blocks_avx512(ctx, inp, bytes, padbit);
- else if (static_branch_likely(&poly1305_use_avx2))
- poly1305_blocks_avx2(ctx, inp, bytes, padbit);
- else
- poly1305_blocks_avx(ctx, inp, bytes, padbit);
- kernel_fpu_end();
-
- len -= bytes;
- inp += bytes;
- } while (len);
-}
-
-static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
- const u32 nonce[4])
-{
- if (!static_branch_likely(&poly1305_use_avx))
- poly1305_emit_x86_64(ctx, mac, nonce);
- else
- poly1305_emit_avx(ctx, mac, nonce);
-}
-
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
-{
- poly1305_simd_init(&dctx->h, key);
- dctx->s[0] = get_unaligned_le32(&key[16]);
- dctx->s[1] = get_unaligned_le32(&key[20]);
- dctx->s[2] = get_unaligned_le32(&key[24]);
- dctx->s[3] = get_unaligned_le32(&key[28]);
- dctx->buflen = 0;
- dctx->sset = true;
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
-static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
- const u8 *inp, unsigned int len)
-{
- unsigned int acc = 0;
- if (unlikely(!dctx->sset)) {
- if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
- poly1305_simd_init(&dctx->h, inp);
- inp += POLY1305_BLOCK_SIZE;
- len -= POLY1305_BLOCK_SIZE;
- acc += POLY1305_BLOCK_SIZE;
- dctx->rset = 1;
- }
- if (len >= POLY1305_BLOCK_SIZE) {
- dctx->s[0] = get_unaligned_le32(&inp[0]);
- dctx->s[1] = get_unaligned_le32(&inp[4]);
- dctx->s[2] = get_unaligned_le32(&inp[8]);
- dctx->s[3] = get_unaligned_le32(&inp[12]);
- acc += POLY1305_BLOCK_SIZE;
- dctx->sset = true;
- }
- }
- return acc;
-}
-
-void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
- unsigned int srclen)
-{
- unsigned int bytes, used;
-
- if (unlikely(dctx->buflen)) {
- bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
- memcpy(dctx->buf + dctx->buflen, src, bytes);
- src += bytes;
- srclen -= bytes;
- dctx->buflen += bytes;
-
- if (dctx->buflen == POLY1305_BLOCK_SIZE) {
- if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
- poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
- dctx->buflen = 0;
- }
- }
-
- if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
- srclen -= bytes;
- used = crypto_poly1305_setdctxkey(dctx, src, bytes);
- if (likely(bytes - used))
- poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
- src += bytes;
- }
-
- if (unlikely(srclen)) {
- dctx->buflen = srclen;
- memcpy(dctx->buf, src, srclen);
- }
-}
-EXPORT_SYMBOL(poly1305_update_arch);
-
-void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-{
- if (unlikely(dctx->buflen)) {
- dctx->buf[dctx->buflen++] = 1;
- memset(dctx->buf + dctx->buflen, 0,
- POLY1305_BLOCK_SIZE - dctx->buflen);
- poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
- }
-
- poly1305_simd_emit(&dctx->h, dst, dctx->s);
- memzero_explicit(dctx, sizeof(*dctx));
-}
-EXPORT_SYMBOL(poly1305_final_arch);
-
-static int crypto_poly1305_init(struct shash_desc *desc)
-{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
- *dctx = (struct poly1305_desc_ctx){};
- return 0;
-}
-
-static int crypto_poly1305_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
-{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
- poly1305_update_arch(dctx, src, srclen);
- return 0;
-}
-
-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
-{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
-
- if (unlikely(!dctx->sset))
- return -ENOKEY;
-
- poly1305_final_arch(dctx, dst);
- return 0;
-}
-
-static struct shash_alg alg = {
- .digestsize = POLY1305_DIGEST_SIZE,
- .init = crypto_poly1305_init,
- .update = crypto_poly1305_update,
- .final = crypto_poly1305_final,
- .descsize = sizeof(struct poly1305_desc_ctx),
- .base = {
- .cra_name = "poly1305",
- .cra_driver_name = "poly1305-simd",
- .cra_priority = 300,
- .cra_blocksize = POLY1305_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- },
-};
-
-static int __init poly1305_simd_mod_init(void)
-{
- if (boot_cpu_has(X86_FEATURE_AVX) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx);
- if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
- static_branch_enable(&poly1305_use_avx2);
- if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
- /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
- boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X)
- static_branch_enable(&poly1305_use_avx512);
- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
-}
-
-static void __exit poly1305_simd_mod_exit(void)
-{
- if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
- crypto_unregister_shash(&alg);
-}
-
-module_init(poly1305_simd_mod_init);
-module_exit(poly1305_simd_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
-MODULE_DESCRIPTION("Poly1305 authenticator");
-MODULE_ALIAS_CRYPTO("poly1305");
-MODULE_ALIAS_CRYPTO("poly1305-simd");
diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c
index 8fa58b0f3cb3..6b466867f91a 100644
--- a/arch/x86/crypto/polyval-clmulni_glue.c
+++ b/arch/x86/crypto/polyval-clmulni_glue.c
@@ -16,16 +16,15 @@
* operations.
*/
-#include <crypto/algapi.h>
+#include <asm/cpu_device_id.h>
+#include <asm/fpu/api.h>
#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
#include <crypto/polyval.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
+#include <crypto/utils.h>
+#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
+#include <linux/string.h>
#define POLYVAL_ALIGN 16
#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN)
@@ -42,7 +41,6 @@ struct polyval_tfm_ctx {
struct polyval_desc_ctx {
u8 buffer[POLYVAL_BLOCK_SIZE];
- u32 bytes;
};
asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys,
@@ -57,25 +55,16 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm)
static void internal_polyval_update(const struct polyval_tfm_ctx *keys,
const u8 *in, size_t nblocks, u8 *accumulator)
{
- if (likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- clmul_polyval_update(keys, in, nblocks, accumulator);
- kernel_fpu_end();
- } else {
- polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in,
- nblocks, accumulator);
- }
+ kernel_fpu_begin();
+ clmul_polyval_update(keys, in, nblocks, accumulator);
+ kernel_fpu_end();
}
static void internal_polyval_mul(u8 *op1, const u8 *op2)
{
- if (likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- clmul_polyval_mul(op1, op2);
- kernel_fpu_end();
- } else {
- polyval_mul_non4k(op1, op2);
- }
+ kernel_fpu_begin();
+ clmul_polyval_mul(op1, op2);
+ kernel_fpu_end();
}
static int polyval_x86_setkey(struct crypto_shash *tfm,
@@ -112,49 +101,27 @@ static int polyval_x86_update(struct shash_desc *desc,
{
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
- u8 *pos;
unsigned int nblocks;
- unsigned int n;
-
- if (dctx->bytes) {
- n = min(srclen, dctx->bytes);
- pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes;
-
- dctx->bytes -= n;
- srclen -= n;
-
- while (n--)
- *pos++ ^= *src++;
- if (!dctx->bytes)
- internal_polyval_mul(dctx->buffer,
- tctx->key_powers[NUM_KEY_POWERS-1]);
- }
-
- while (srclen >= POLYVAL_BLOCK_SIZE) {
+ do {
/* Allow rescheduling every 4K bytes. */
nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE;
internal_polyval_update(tctx, src, nblocks, dctx->buffer);
srclen -= nblocks * POLYVAL_BLOCK_SIZE;
src += nblocks * POLYVAL_BLOCK_SIZE;
- }
+ } while (srclen >= POLYVAL_BLOCK_SIZE);
- if (srclen) {
- dctx->bytes = POLYVAL_BLOCK_SIZE - srclen;
- pos = dctx->buffer;
- while (srclen--)
- *pos++ ^= *src++;
- }
-
- return 0;
+ return srclen;
}
-static int polyval_x86_final(struct shash_desc *desc, u8 *dst)
+static int polyval_x86_finup(struct shash_desc *desc, const u8 *src,
+ unsigned int len, u8 *dst)
{
struct polyval_desc_ctx *dctx = shash_desc_ctx(desc);
const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm);
- if (dctx->bytes) {
+ if (len) {
+ crypto_xor(dctx->buffer, src, len);
internal_polyval_mul(dctx->buffer,
tctx->key_powers[NUM_KEY_POWERS-1]);
}
@@ -168,13 +135,14 @@ static struct shash_alg polyval_alg = {
.digestsize = POLYVAL_DIGEST_SIZE,
.init = polyval_x86_init,
.update = polyval_x86_update,
- .final = polyval_x86_final,
+ .finup = polyval_x86_finup,
.setkey = polyval_x86_setkey,
.descsize = sizeof(struct polyval_desc_ctx),
.base = {
.cra_name = "polyval",
.cra_driver_name = "polyval-clmulni",
.cra_priority = 200,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = POLYVAL_BLOCK_SIZE,
.cra_ctxsize = POLYVAL_CTX_SIZE,
.cra_module = THIS_MODULE,
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 347e97f4b713..f5f2121b7956 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -10,7 +10,6 @@
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
#include "serpent-avx.h"
@@ -65,10 +64,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg serpent_algs[] = {
{
- .base.cra_name = "__ecb(serpent)",
- .base.cra_driver_name = "__ecb-serpent-avx2",
+ .base.cra_name = "ecb(serpent)",
+ .base.cra_driver_name = "ecb-serpent-avx2",
.base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = SERPENT_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct serpent_ctx),
.base.cra_module = THIS_MODULE,
@@ -78,10 +76,9 @@ static struct skcipher_alg serpent_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(serpent)",
- .base.cra_driver_name = "__cbc-serpent-avx2",
+ .base.cra_name = "cbc(serpent)",
+ .base.cra_driver_name = "cbc-serpent-avx2",
.base.cra_priority = 600,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = SERPENT_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct serpent_ctx),
.base.cra_module = THIS_MODULE,
@@ -94,8 +91,6 @@ static struct skcipher_alg serpent_algs[] = {
},
};
-static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-
static int __init serpent_avx2_init(void)
{
const char *feature_name;
@@ -110,15 +105,13 @@ static int __init serpent_avx2_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(serpent_algs,
- ARRAY_SIZE(serpent_algs),
- serpent_simd_algs);
+ return crypto_register_skciphers(serpent_algs,
+ ARRAY_SIZE(serpent_algs));
}
static void __exit serpent_avx2_fini(void)
{
- simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
- serpent_simd_algs);
+ crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
}
module_init(serpent_avx2_init);
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 6c248e1ea4ef..e640abc1cb8a 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -13,7 +13,6 @@
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
#include "serpent-avx.h"
@@ -71,10 +70,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg serpent_algs[] = {
{
- .base.cra_name = "__ecb(serpent)",
- .base.cra_driver_name = "__ecb-serpent-avx",
+ .base.cra_name = "ecb(serpent)",
+ .base.cra_driver_name = "ecb-serpent-avx",
.base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = SERPENT_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct serpent_ctx),
.base.cra_module = THIS_MODULE,
@@ -84,10 +82,9 @@ static struct skcipher_alg serpent_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(serpent)",
- .base.cra_driver_name = "__cbc-serpent-avx",
+ .base.cra_name = "cbc(serpent)",
+ .base.cra_driver_name = "cbc-serpent-avx",
.base.cra_priority = 500,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = SERPENT_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct serpent_ctx),
.base.cra_module = THIS_MODULE,
@@ -100,8 +97,6 @@ static struct skcipher_alg serpent_algs[] = {
},
};
-static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-
static int __init serpent_init(void)
{
const char *feature_name;
@@ -112,15 +107,13 @@ static int __init serpent_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(serpent_algs,
- ARRAY_SIZE(serpent_algs),
- serpent_simd_algs);
+ return crypto_register_skciphers(serpent_algs,
+ ARRAY_SIZE(serpent_algs));
}
static void __exit serpent_exit(void)
{
- simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
- serpent_simd_algs);
+ crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
}
module_init(serpent_init);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index d78f37e9b2cf..80ee17ec21b4 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -18,7 +18,6 @@
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/b128ops.h>
-#include <crypto/internal/simd.h>
#include <crypto/serpent.h>
#include "serpent-sse2.h"
@@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg serpent_algs[] = {
{
- .base.cra_name = "__ecb(serpent)",
- .base.cra_driver_name = "__ecb-serpent-sse2",
+ .base.cra_name = "ecb(serpent)",
+ .base.cra_driver_name = "ecb-serpent-sse2",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = SERPENT_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct serpent_ctx),
.base.cra_module = THIS_MODULE,
@@ -87,10 +85,9 @@ static struct skcipher_alg serpent_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(serpent)",
- .base.cra_driver_name = "__cbc-serpent-sse2",
+ .base.cra_name = "cbc(serpent)",
+ .base.cra_driver_name = "cbc-serpent-sse2",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = SERPENT_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct serpent_ctx),
.base.cra_module = THIS_MODULE,
@@ -103,8 +100,6 @@ static struct skcipher_alg serpent_algs[] = {
},
};
-static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)];
-
static int __init serpent_sse2_init(void)
{
if (!boot_cpu_has(X86_FEATURE_XMM2)) {
@@ -112,15 +107,13 @@ static int __init serpent_sse2_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(serpent_algs,
- ARRAY_SIZE(serpent_algs),
- serpent_simd_algs);
+ return crypto_register_skciphers(serpent_algs,
+ ARRAY_SIZE(serpent_algs));
}
static void __exit serpent_sse2_exit(void)
{
- simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs),
- serpent_simd_algs);
+ crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs));
}
module_init(serpent_sse2_init);
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index ab8bc54f254d..0a912bfc86c5 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -16,21 +16,17 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
#include <crypto/sha1.h>
#include <crypto/sha1_base.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
static const struct x86_cpu_id module_cpu_ids[] = {
-#ifdef CONFIG_AS_SHA1_NI
X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
-#endif
X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
@@ -38,14 +34,10 @@ static const struct x86_cpu_id module_cpu_ids[] = {
};
MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-static int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha1_block_fn *sha1_xform)
+static inline int sha1_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len, sha1_block_fn *sha1_xform)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- if (!crypto_simd_usable() ||
- (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
- return crypto_sha1_update(desc, data, len);
+ int remain;
/*
* Make sure struct sha1_state begins directly with the SHA1
@@ -54,22 +46,18 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
kernel_fpu_begin();
- sha1_base_do_update(desc, data, len, sha1_xform);
+ remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform);
kernel_fpu_end();
- return 0;
+ return remain;
}
-static int sha1_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha1_block_fn *sha1_xform)
+static inline int sha1_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out,
+ sha1_block_fn *sha1_xform)
{
- if (!crypto_simd_usable())
- return crypto_sha1_finup(desc, data, len, out);
-
kernel_fpu_begin();
- if (len)
- sha1_base_do_update(desc, data, len, sha1_xform);
- sha1_base_do_finalize(desc, sha1_xform);
+ sha1_base_do_finup(desc, data, len, sha1_xform);
kernel_fpu_end();
return sha1_base_finish(desc, out);
@@ -90,23 +78,17 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
}
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
-{
- return sha1_ssse3_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha1_ssse3_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ssse3_update,
- .final = sha1_ssse3_final,
.finup = sha1_ssse3_finup,
- .descsize = sizeof(struct sha1_state),
+ .descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -140,22 +122,17 @@ static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
return sha1_finup(desc, data, len, out, sha1_transform_avx);
}
-static int sha1_avx_final(struct shash_desc *desc, u8 *out)
-{
- return sha1_avx_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha1_avx_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_avx_update,
- .final = sha1_avx_final,
.finup = sha1_avx_finup,
- .descsize = sizeof(struct sha1_state),
+ .descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx",
.cra_priority = 160,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -200,8 +177,8 @@ static bool avx2_usable(void)
return false;
}
-static void sha1_apply_transform_avx2(struct sha1_state *state,
- const u8 *data, int blocks)
+static inline void sha1_apply_transform_avx2(struct sha1_state *state,
+ const u8 *data, int blocks)
{
/* Select the optimal transform based on data block size */
if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
@@ -222,22 +199,17 @@ static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
}
-static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
-{
- return sha1_avx2_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha1_avx2_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_avx2_update,
- .final = sha1_avx2_final,
.finup = sha1_avx2_finup,
- .descsize = sizeof(struct sha1_state),
+ .descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx2",
.cra_priority = 170,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -256,7 +228,6 @@ static void unregister_sha1_avx2(void)
crypto_unregister_shash(&sha1_avx2_alg);
}
-#ifdef CONFIG_AS_SHA1_NI
asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
int rounds);
@@ -272,22 +243,17 @@ static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
return sha1_finup(desc, data, len, out, sha1_ni_transform);
}
-static int sha1_ni_final(struct shash_desc *desc, u8 *out)
-{
- return sha1_ni_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha1_ni_alg = {
.digestsize = SHA1_DIGEST_SIZE,
.init = sha1_base_init,
.update = sha1_ni_update,
- .final = sha1_ni_final,
.finup = sha1_ni_finup,
- .descsize = sizeof(struct sha1_state),
+ .descsize = SHA1_STATE_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ni",
.cra_priority = 250,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -306,11 +272,6 @@ static void unregister_sha1_ni(void)
crypto_unregister_shash(&sha1_ni_alg);
}
-#else
-static inline int register_sha1_ni(void) { return 0; }
-static inline void unregister_sha1_ni(void) { }
-#endif
-
static int __init sha1_ssse3_mod_init(void)
{
if (!x86_match_cpu(module_cpu_ids))
@@ -360,6 +321,4 @@ MODULE_ALIAS_CRYPTO("sha1");
MODULE_ALIAS_CRYPTO("sha1-ssse3");
MODULE_ALIAS_CRYPTO("sha1-avx");
MODULE_ALIAS_CRYPTO("sha1-avx2");
-#ifdef CONFIG_AS_SHA1_NI
MODULE_ALIAS_CRYPTO("sha1-ni");
-#endif
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
deleted file mode 100644
index 53de72bdd851..000000000000
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ /dev/null
@@ -1,499 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX1 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 1 block at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-
-.macro MY_ROR p1 p2
- shld $(32-(\p1)), \p2, \p2
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- VMOVDQ \p2, \p1
- vpshufb \p3, \p1, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-XTMP5 = %xmm11
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm13
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
-
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpsrld $7, XTMP1, XTMP2
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpslld $(32-7), XTMP1, XTMP3
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- vpsrld $18, XTMP1, XTMP2 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- vpslld $(32-18), XTMP1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP1, XTMP3, XTMP3 #
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA}
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- vpxor XTMP3, XTMP2, XTMP2 #
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- vpxor XTMP3, XTMP2, XTMP2
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- MY_ROR (25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- MY_ROR (22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER #
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks)
-## arg 1 : pointer to state
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
-########################################################################
-.text
-SYM_TYPED_FUNC_START(sha256_transform_avx)
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- movq %rsp, %rbp
-
- subq $STACK_SIZE, %rsp # allocate stack space
- and $~15, %rsp # align stack pointer
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS # pointer to end of data
- mov NUM_BLKS, _INP_END(%rsp)
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 1*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 2*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- vpaddd 3*16(TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- vpaddd (TBL), X0, XFER
- vmovdqa XFER, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vpaddd 1*16(TBL), X1, XFER
- vmovdqa XFER, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- RET
-SYM_FUNC_END(sha256_transform_avx)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
deleted file mode 100644
index 0bbec1c75cd0..000000000000
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ /dev/null
@@ -1,774 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with AVX2 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-# This code schedules 2 blocks at a time, with 4 lanes per block
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-## assume buffers not aligned
-#define VMOVDQ vmovdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-X0 = %ymm4
-X1 = %ymm5
-X2 = %ymm6
-X3 = %ymm7
-
-# XMM versions of above
-XWORD0 = %xmm4
-XWORD1 = %xmm5
-XWORD2 = %xmm6
-XWORD3 = %xmm7
-
-XTMP0 = %ymm0
-XTMP1 = %ymm1
-XTMP2 = %ymm2
-XTMP3 = %ymm3
-XTMP4 = %ymm8
-XFER = %ymm9
-XTMP5 = %ymm11
-
-SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %ymm13
-
-X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-c = %ecx
-d = %r8d
-e = %edx # clobbers NUM_BLKS
-y3 = %esi # clobbers INP
-
-SRND = CTX # SRND is same register as CTX
-
-a = %eax
-b = %ebx
-f = %r9d
-g = %r10d
-h = %r11d
-old_h = %r11d
-
-T1 = %r12d
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round
-_XMM_SAVE_SIZE = 0
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_CTX_SIZE = 8
-
-_XFER = 0
-_XMM_SAVE = _XFER + _XFER_SIZE
-_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
-_INP = _INP_END + _INP_END_SIZE
-_CTX = _INP + _INP_SIZE
-STACK_SIZE = _CTX + _CTX_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
- X_ = X0
- X0 = X1
- X1 = X2
- X2 = X3
- X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
- old_h = h
- TMP_ = h
- h = g
- g = f
- f = e
- e = d
- d = c
- c = b
- b = a
- a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED disp
-################################### RND N + 0 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
-
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
- vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
- vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
-
- and e, y2 # y2 = (f^g)&e # CH
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add h, d # d = k + w + h + d # --
-
- and b, y3 # y3 = (a|c)&b # MAJA
- vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
-
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- vpsrld $7, XTMP1, XTMP2
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
-
- add y0, y2 # y2 = S1 + CH # --
- vpslld $(32-7), XTMP1, XTMP3
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
-
- vpsrld $18, XTMP1, XTMP2
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 1 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 1*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
-
- vpslld $(32-18), XTMP1, XTMP1
- and b, y3 # y3 = (a|c)&b # MAJA
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
-
- vpxor XTMP1, XTMP3, XTMP3
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0
- vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
-
-
- ROTATE_ARGS
-
-################################### RND N + 2 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- offset = \disp + 2*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- or c, y3 # y3 = a|c # MAJA
- mov f, y2 # y2 = f # CH
- xor g, y2 # y2 = f^g # CH
-
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- and e, y2 # y2 = (f^g)&e # CH
-
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- vpxor XTMP3, XTMP2, XTMP2
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA}
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a ,T1 # T1 = (a >> 2) # S0
- vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
- vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
-
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1,h # h = k + w + h + S0 # --
- add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
- add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3,h # h = t1 + S0 + MAJ # --
-
-
- ROTATE_ARGS
-
-################################### RND N + 3 ############################
-
- mov a, y3 # y3 = a # MAJA
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- offset = \disp + 3*4
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
-
- vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
- mov f, y2 # y2 = f # CH
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- xor g, y2 # y2 = f^g # CH
-
-
- vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add h, d # d = k + w + h + d # --
- and b, y3 # y3 = (a|c)&b # MAJA
-
- vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
-
- vpxor XTMP3, XTMP2, XTMP2
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- add y0, y2 # y2 = S1 + CH # --
-
- vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
-
- vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]}
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and c, T1 # T1 = a&c # MAJB
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
-
- add y1, h # h = k + w + h + S0 # --
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-.macro DO_4ROUNDS disp
-################################### RND N + 0 ###########################
-
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- addl \disp(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 1 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*1 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 2 ##############################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*2 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
- ROTATE_ARGS
-
-################################### RND N + 3 ###########################
-
- add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
- mov f, y2 # y2 = f # CH
- rorx $25, e, y0 # y0 = e >> 25 # S1A
- rorx $11, e, y1 # y1 = e >> 11 # S1B
- xor g, y2 # y2 = f^g # CH
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1
- rorx $6, e, y1 # y1 = (e >> 6) # S1
- and e, y2 # y2 = (f^g)&e # CH
- add y3, old_h # h = t1 + S0 + MAJ # --
-
- xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1
- rorx $13, a, T1 # T1 = a >> 13 # S0B
- xor g, y2 # y2 = CH = ((f^g)&e)^g # CH
- rorx $22, a, y1 # y1 = a >> 22 # S0A
- mov a, y3 # y3 = a # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
- rorx $2, a, T1 # T1 = (a >> 2) # S0
- offset = 4*3 + \disp
- addl offset(%rsp, SRND), h # h = k + w + h # --
- or c, y3 # y3 = a|c # MAJA
-
- xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
- mov a, T1 # T1 = a # MAJB
- and b, y3 # y3 = (a|c)&b # MAJA
- and c, T1 # T1 = a&c # MAJB
- add y0, y2 # y2 = S1 + CH # --
-
-
- add h, d # d = k + w + h + d # --
- or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ
- add y1, h # h = k + w + h + S0 # --
-
- add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
-
-
- add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
-
- add y3, h # h = t1 + S0 + MAJ # --
-
- ROTATE_ARGS
-
-.endm
-
-########################################################################
-## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
-## arg 1 : pointer to state
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
-########################################################################
-.text
-SYM_TYPED_FUNC_START(sha256_transform_rorx)
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
-
- push %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $-32, %rsp # align rsp to 32 byte boundary
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
- mov NUM_BLKS, _INP_END(%rsp)
-
- cmp NUM_BLKS, INP
- je .Lonly_one_block
-
- ## load initial digest
- mov (CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
-
-.Lloop0:
- ## Load first 16 dwords from two blocks
- VMOVDQ 0*32(INP),XTMP0
- VMOVDQ 1*32(INP),XTMP1
- VMOVDQ 2*32(INP),XTMP2
- VMOVDQ 3*32(INP),XTMP3
-
- ## byte swap data
- vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
- vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
- vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
- vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3
-
- ## transpose data into high/low halves
- vperm2i128 $0x20, XTMP2, XTMP0, X0
- vperm2i128 $0x31, XTMP2, XTMP0, X1
- vperm2i128 $0x20, XTMP3, XTMP1, X2
- vperm2i128 $0x31, XTMP3, XTMP1, X3
-
-.Llast_block_enter:
- add $64, INP
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 12 each
- xor SRND, SRND
-
-.align 16
-.Lloop1:
- leaq K256+0*32(%rip), INP ## reuse INP as scratch reg
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 1*32)
-
- leaq K256+2*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 2*32)
-
- leaq K256+3*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
- FOUR_ROUNDS_AND_SCHED (_XFER + 3*32)
-
- add $4*32, SRND
- cmp $3*4*32, SRND
- jb .Lloop1
-
-.Lloop2:
- ## Do last 16 rounds with no scheduling
- leaq K256+0*32(%rip), INP
- vpaddd (INP, SRND), X0, XFER
- vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 0*32)
-
- leaq K256+1*32(%rip), INP
- vpaddd (INP, SRND), X1, XFER
- vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
- DO_4ROUNDS (_XFER + 1*32)
- add $2*32, SRND
-
- vmovdqa X2, X0
- vmovdqa X3, X1
-
- cmp $4*4*32, SRND
- jb .Lloop2
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- ja .Ldone_hash
-
- #### Do second block using previously scheduled results
- xor SRND, SRND
-.align 16
-.Lloop3:
- DO_4ROUNDS (_XFER + 0*32 + 16)
- DO_4ROUNDS (_XFER + 1*32 + 16)
- add $2*32, SRND
- cmp $4*4*32, SRND
- jb .Lloop3
-
- mov _CTX(%rsp), CTX
- mov _INP(%rsp), INP
- add $64, INP
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- cmp _INP_END(%rsp), INP
- jb .Lloop0
- ja .Ldone_hash
-
-.Ldo_last_block:
- VMOVDQ 0*16(INP),XWORD0
- VMOVDQ 1*16(INP),XWORD1
- VMOVDQ 2*16(INP),XWORD2
- VMOVDQ 3*16(INP),XWORD3
-
- vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
- vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
- vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
- vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3
-
- jmp .Llast_block_enter
-
-.Lonly_one_block:
-
- ## load initial digest
- mov (4*0)(CTX),a
- mov (4*1)(CTX),b
- mov (4*2)(CTX),c
- mov (4*3)(CTX),d
- mov (4*4)(CTX),e
- mov (4*5)(CTX),f
- mov (4*6)(CTX),g
- mov (4*7)(CTX),h
-
- vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- vmovdqa _SHUF_00BA(%rip), SHUF_00BA
- vmovdqa _SHUF_DC00(%rip), SHUF_DC00
-
- mov CTX, _CTX(%rsp)
- jmp .Ldo_last_block
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- pop %rbp
-
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
- vzeroupper
- RET
-SYM_FUNC_END(sha256_transform_rorx)
-
-.section .rodata.cst512.K256, "aM", @progbits, 512
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
-.align 32
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
-
-# shuffle xBxA -> 00BA
-.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
-.align 32
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-# shuffle xDxC -> DC00
-.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
-.align 32
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
deleted file mode 100644
index 93264ee44543..000000000000
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ /dev/null
@@ -1,513 +0,0 @@
-########################################################################
-# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
-#
-# Copyright (C) 2013 Intel Corporation.
-#
-# Authors:
-# James Guilford <james.guilford@intel.com>
-# Kirk Yap <kirk.s.yap@intel.com>
-# Tim Chen <tim.c.chen@linux.intel.com>
-#
-# This software is available to you under a choice of one of two
-# licenses. You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or
-# without modification, are permitted provided that the following
-# conditions are met:
-#
-# - Redistributions of source code must retain the above
-# copyright notice, this list of conditions and the following
-# disclaimer.
-#
-# - Redistributions in binary form must reproduce the above
-# copyright notice, this list of conditions and the following
-# disclaimer in the documentation and/or other materials
-# provided with the distribution.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-########################################################################
-#
-# This code is described in an Intel White-Paper:
-# "Fast SHA-256 Implementations on Intel Architecture Processors"
-#
-# To find it, surf to http://www.intel.com/p/en_US/embedded
-# and search for that title.
-#
-########################################################################
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-## assume buffers not aligned
-#define MOVDQ movdqu
-
-################################ Define Macros
-
-# addm [mem], reg
-# Add reg to mem using reg-mem add and store
-.macro addm p1 p2
- add \p1, \p2
- mov \p2, \p1
-.endm
-
-################################
-
-# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
-# Load xmm with mem and byte swap each dword
-.macro COPY_XMM_AND_BSWAP p1 p2 p3
- MOVDQ \p2, \p1
- pshufb \p3, \p1
-.endm
-
-################################
-
-X0 = %xmm4
-X1 = %xmm5
-X2 = %xmm6
-X3 = %xmm7
-
-XTMP0 = %xmm0
-XTMP1 = %xmm1
-XTMP2 = %xmm2
-XTMP3 = %xmm3
-XTMP4 = %xmm8
-XFER = %xmm9
-
-SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
-SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
-BYTE_FLIP_MASK = %xmm12
-
-NUM_BLKS = %rdx # 3rd arg
-INP = %rsi # 2nd arg
-CTX = %rdi # 1st arg
-
-SRND = %rsi # clobbers INP
-c = %ecx
-d = %r8d
-e = %edx
-TBL = %r12
-a = %eax
-b = %ebx
-
-f = %r9d
-g = %r10d
-h = %r11d
-
-y0 = %r13d
-y1 = %r14d
-y2 = %r15d
-
-
-
-_INP_END_SIZE = 8
-_INP_SIZE = 8
-_XFER_SIZE = 16
-_XMM_SAVE_SIZE = 0
-
-_INP_END = 0
-_INP = _INP_END + _INP_END_SIZE
-_XFER = _INP + _INP_SIZE
-_XMM_SAVE = _XFER + _XFER_SIZE
-STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
-
-# rotate_Xs
-# Rotate values of symbols X0...X3
-.macro rotate_Xs
-X_ = X0
-X0 = X1
-X1 = X2
-X2 = X3
-X3 = X_
-.endm
-
-# ROTATE_ARGS
-# Rotate values of symbols a...h
-.macro ROTATE_ARGS
-TMP_ = h
-h = g
-g = f
-f = e
-e = d
-d = c
-c = b
-b = a
-a = TMP_
-.endm
-
-.macro FOUR_ROUNDS_AND_SCHED
- ## compute s0 four at a time and s1 two at a time
- ## compute W[-16] + W[-7] 4 at a time
- movdqa X3, XTMP0
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- palignr $4, X2, XTMP0 # XTMP0 = W[-7]
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- movdqa X1, XTMP1
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- ## compute s0
- palignr $4, X0, XTMP1 # XTMP1 = W[-15]
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add y0, y2 # y2 = S1 + CH
- add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH
- movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pslld $(32-7), XTMP1 #
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- psrld $7, XTMP2 #
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
- ror $(25-11), y0 # y0 = e >> (25-11)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(22-13), y1 # y1 = a >> (22-13)
- pslld $(32-18), XTMP3 #
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- psrld $18, XTMP2 #
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP1
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
- add y0, y2 # y2 = S1 + CH
- add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pxor XTMP4, XTMP1 # XTMP1 = s0
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- ## compute low s1
- pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
- mov e, y0 # y0 = e
- mov a, y1 # y1 = a
- ror $(25-11), y0 # y0 = e >> (25-11)
- movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- and e, y2 # y2 = (f^g)&e
- psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- pxor XTMP3, XTMP2
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- ## compute high s1
- pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA}
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- #
- ROTATE_ARGS #
- movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
- ror $(22-13), y1 # y1 = a >> (22-13)
- xor e, y0 # y0 = e ^ (e >> (25-11))
- mov f, y2 # y2 = f
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
- xor a, y1 # y1 = a ^ (a >> (22-13)
- xor g, y2 # y2 = f^g
- psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25
- and e, y2 # y2 = (f^g)&e
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- pxor XTMP3, XTMP2 #
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2
- add y0, y2 # y2 = S1 + CH
- add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
- pxor XTMP2, X0 # X0 = s1 {xDxC}
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
-
- ROTATE_ARGS
- rotate_Xs
-.endm
-
-## input is [rsp + _XFER + %1 * 4]
-.macro DO_ROUND round
- mov e, y0 # y0 = e
- ror $(25-11), y0 # y0 = e >> (25-11)
- mov a, y1 # y1 = a
- xor e, y0 # y0 = e ^ (e >> (25-11))
- ror $(22-13), y1 # y1 = a >> (22-13)
- mov f, y2 # y2 = f
- xor a, y1 # y1 = a ^ (a >> (22-13)
- ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
- xor g, y2 # y2 = f^g
- xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
- ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
- and e, y2 # y2 = (f^g)&e
- xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
- ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
- xor g, y2 # y2 = CH = ((f^g)&e)^g
- add y0, y2 # y2 = S1 + CH
- ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
- offset = \round * 4 + _XFER
- add offset(%rsp), y2 # y2 = k + w + S1 + CH
- mov a, y0 # y0 = a
- add y2, h # h = h + S1 + CH + k + w
- mov a, y2 # y2 = a
- or c, y0 # y0 = a|c
- add h, d # d = d + h + S1 + CH + k + w
- and c, y2 # y2 = a&c
- and b, y0 # y0 = (a|c)&b
- add y1, h # h = h + S1 + CH + k + w + S0
- or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c)
- add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
- ROTATE_ARGS
-.endm
-
-########################################################################
-## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
-## int blocks);
-## arg 1 : pointer to state
-## (struct sha256_state is assumed to begin with u32 state[8])
-## arg 2 : pointer to input data
-## arg 3 : Num blocks
-########################################################################
-.text
-SYM_TYPED_FUNC_START(sha256_transform_ssse3)
- pushq %rbx
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- pushq %rbp
- mov %rsp, %rbp
-
- subq $STACK_SIZE, %rsp
- and $~15, %rsp
-
- shl $6, NUM_BLKS # convert to bytes
- jz .Ldone_hash
- add INP, NUM_BLKS
- mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data
-
- ## load initial digest
- mov 4*0(CTX), a
- mov 4*1(CTX), b
- mov 4*2(CTX), c
- mov 4*3(CTX), d
- mov 4*4(CTX), e
- mov 4*5(CTX), f
- mov 4*6(CTX), g
- mov 4*7(CTX), h
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
- movdqa _SHUF_00BA(%rip), SHUF_00BA
- movdqa _SHUF_DC00(%rip), SHUF_DC00
-
-.Lloop0:
- lea K256(%rip), TBL
-
- ## byte swap first 16 dwords
- COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
- COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK
-
- mov INP, _INP(%rsp)
-
- ## schedule 48 input dwords, by doing 3 rounds of 16 each
- mov $3, SRND
-.align 16
-.Lloop1:
- movdqa (TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 1*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 2*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- FOUR_ROUNDS_AND_SCHED
-
- movdqa 3*16(TBL), XFER
- paddd X0, XFER
- movdqa XFER, _XFER(%rsp)
- add $4*16, TBL
- FOUR_ROUNDS_AND_SCHED
-
- sub $1, SRND
- jne .Lloop1
-
- mov $2, SRND
-.Lloop2:
- paddd (TBL), X0
- movdqa X0, _XFER(%rsp)
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
- paddd 1*16(TBL), X1
- movdqa X1, _XFER(%rsp)
- add $2*16, TBL
- DO_ROUND 0
- DO_ROUND 1
- DO_ROUND 2
- DO_ROUND 3
-
- movdqa X2, X0
- movdqa X3, X1
-
- sub $1, SRND
- jne .Lloop2
-
- addm (4*0)(CTX),a
- addm (4*1)(CTX),b
- addm (4*2)(CTX),c
- addm (4*3)(CTX),d
- addm (4*4)(CTX),e
- addm (4*5)(CTX),f
- addm (4*6)(CTX),g
- addm (4*7)(CTX),h
-
- mov _INP(%rsp), INP
- add $64, INP
- cmp _INP_END(%rsp), INP
- jne .Lloop0
-
-.Ldone_hash:
-
- mov %rbp, %rsp
- popq %rbp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbx
-
- RET
-SYM_FUNC_END(sha256_transform_ssse3)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
-
-.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
-.align 16
-# shuffle xBxA -> 00BA
-_SHUF_00BA:
- .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
-
-.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
-.align 16
-# shuffle xDxC -> DC00
-_SHUF_DC00:
- .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S
deleted file mode 100644
index d515a55a3bc1..000000000000
--- a/arch/x86/crypto/sha256_ni_asm.S
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * This file is provided under a dual BSD/GPLv2 license. When using or
- * redistributing this file, you may do so under either license.
- *
- * GPL LICENSE SUMMARY
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Contact Information:
- * Sean Gulley <sean.m.gulley@intel.com>
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * BSD LICENSE
- *
- * Copyright(c) 2015 Intel Corporation.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- * * Neither the name of Intel Corporation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-
-#define DIGEST_PTR %rdi /* 1st arg */
-#define DATA_PTR %rsi /* 2nd arg */
-#define NUM_BLKS %rdx /* 3rd arg */
-
-#define SHA256CONSTANTS %rax
-
-#define MSG %xmm0 /* sha256rnds2 implicit operand */
-#define STATE0 %xmm1
-#define STATE1 %xmm2
-#define MSG0 %xmm3
-#define MSG1 %xmm4
-#define MSG2 %xmm5
-#define MSG3 %xmm6
-#define TMP %xmm7
-
-#define SHUF_MASK %xmm8
-
-#define ABEF_SAVE %xmm9
-#define CDGH_SAVE %xmm10
-
-.macro do_4rounds i, m0, m1, m2, m3
-.if \i < 16
- movdqu \i*4(DATA_PTR), \m0
- pshufb SHUF_MASK, \m0
-.endif
- movdqa (\i-32)*4(SHA256CONSTANTS), MSG
- paddd \m0, MSG
- sha256rnds2 STATE0, STATE1
-.if \i >= 12 && \i < 60
- movdqa \m0, TMP
- palignr $4, \m3, TMP
- paddd TMP, \m1
- sha256msg2 \m0, \m1
-.endif
- punpckhqdq MSG, MSG
- sha256rnds2 STATE1, STATE0
-.if \i >= 4 && \i < 52
- sha256msg1 \m0, \m3
-.endif
-.endm
-
-/*
- * Intel SHA Extensions optimized implementation of a SHA-256 update function
- *
- * The function takes a pointer to the current hash values, a pointer to the
- * input data, and a number of 64 byte blocks to process. Once all blocks have
- * been processed, the digest pointer is updated with the resulting hash value.
- * The function only processes complete blocks, there is no functionality to
- * store partial blocks. All message padding and hash value initialization must
- * be done outside the update function.
- *
- * void sha256_ni_transform(uint32_t *digest, const void *data,
- uint32_t numBlocks);
- * digest : pointer to digest
- * data: pointer to input data
- * numBlocks: Number of blocks to process
- */
-
-.text
-SYM_TYPED_FUNC_START(sha256_ni_transform)
-
- shl $6, NUM_BLKS /* convert to bytes */
- jz .Ldone_hash
- add DATA_PTR, NUM_BLKS /* pointer to end of data */
-
- /*
- * load initial hash values
- * Need to reorder these appropriately
- * DCBA, HGFE -> ABEF, CDGH
- */
- movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */
- movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */
-
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* FEBA */
- punpckhqdq TMP, STATE1 /* DCHG */
- pshufd $0x1B, STATE0, STATE0 /* ABEF */
- pshufd $0xB1, STATE1, STATE1 /* CDGH */
-
- movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
- lea K256+32*4(%rip), SHA256CONSTANTS
-
-.Lloop0:
- /* Save hash values for addition after rounds */
- movdqa STATE0, ABEF_SAVE
- movdqa STATE1, CDGH_SAVE
-
-.irp i, 0, 16, 32, 48
- do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3
- do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0
- do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1
- do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2
-.endr
-
- /* Add current hash values with previously saved */
- paddd ABEF_SAVE, STATE0
- paddd CDGH_SAVE, STATE1
-
- /* Increment data pointer and loop if more to process */
- add $64, DATA_PTR
- cmp NUM_BLKS, DATA_PTR
- jne .Lloop0
-
- /* Write hash values back in the correct order */
- movdqa STATE0, TMP
- punpcklqdq STATE1, STATE0 /* GHEF */
- punpckhqdq TMP, STATE1 /* ABCD */
- pshufd $0xB1, STATE0, STATE0 /* HGFE */
- pshufd $0x1B, STATE1, STATE1 /* DCBA */
-
- movdqu STATE1, 0*16(DIGEST_PTR)
- movdqu STATE0, 1*16(DIGEST_PTR)
-
-.Ldone_hash:
-
- RET
-SYM_FUNC_END(sha256_ni_transform)
-
-.section .rodata.cst256.K256, "aM", @progbits, 256
-.align 64
-K256:
- .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-
-.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
-PSHUFFLE_BYTE_FLIP_MASK:
- .octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
deleted file mode 100644
index e04a43d9f7d5..000000000000
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Cryptographic API.
- *
- * Glue code for the SHA256 Secure Hash Algorithm assembler implementations
- * using SSSE3, AVX, AVX2, and SHA-NI instructions.
- *
- * This file is based on sha256_generic.c
- *
- * Copyright (C) 2013 Intel Corporation.
- *
- * Author:
- * Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/types.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/string.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
-
-asmlinkage void sha256_transform_ssse3(struct sha256_state *state,
- const u8 *data, int blocks);
-
-static const struct x86_cpu_id module_cpu_ids[] = {
-#ifdef CONFIG_AS_SHA256_NI
- X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL),
-#endif
- X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL),
- X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL),
- {}
-};
-MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids);
-
-static int _sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha256_block_fn *sha256_xform)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- if (!crypto_simd_usable() ||
- (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
- return crypto_sha256_update(desc, data, len);
-
- /*
- * Make sure struct sha256_state begins directly with the SHA256
- * 256-bit internal state, as this is what the asm functions expect.
- */
- BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
-
- kernel_fpu_begin();
- sha256_base_do_update(desc, data, len, sha256_xform);
- kernel_fpu_end();
-
- return 0;
-}
-
-static int sha256_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha256_block_fn *sha256_xform)
-{
- if (!crypto_simd_usable())
- return crypto_sha256_finup(desc, data, len, out);
-
- kernel_fpu_begin();
- if (len)
- sha256_base_do_update(desc, data, len, sha256_xform);
- sha256_base_do_finalize(desc, sha256_xform);
- kernel_fpu_end();
-
- return sha256_base_finish(desc, out);
-}
-
-static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_transform_ssse3);
-}
-
-static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_transform_ssse3);
-}
-
-/* Add padding and return the message digest. */
-static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
-{
- return sha256_ssse3_finup(desc, NULL, 0, out);
-}
-
-static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_ssse3_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_ssse3_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_ssse3_update,
- .final = sha256_ssse3_final,
- .finup = sha256_ssse3_finup,
- .digest = sha256_ssse3_digest,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-ssse3",
- .cra_priority = 150,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_ssse3_update,
- .final = sha256_ssse3_final,
- .finup = sha256_ssse3_finup,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-ssse3",
- .cra_priority = 150,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha256_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- return crypto_register_shashes(sha256_ssse3_algs,
- ARRAY_SIZE(sha256_ssse3_algs));
- return 0;
-}
-
-static void unregister_sha256_ssse3(void)
-{
- if (boot_cpu_has(X86_FEATURE_SSSE3))
- crypto_unregister_shashes(sha256_ssse3_algs,
- ARRAY_SIZE(sha256_ssse3_algs));
-}
-
-asmlinkage void sha256_transform_avx(struct sha256_state *state,
- const u8 *data, int blocks);
-
-static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_transform_avx);
-}
-
-static int sha256_avx_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_transform_avx);
-}
-
-static int sha256_avx_final(struct shash_desc *desc, u8 *out)
-{
- return sha256_avx_finup(desc, NULL, 0, out);
-}
-
-static int sha256_avx_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_avx_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_avx_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_avx_update,
- .final = sha256_avx_final,
- .finup = sha256_avx_finup,
- .digest = sha256_avx_digest,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-avx",
- .cra_priority = 160,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_avx_update,
- .final = sha256_avx_final,
- .finup = sha256_avx_finup,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-avx",
- .cra_priority = 160,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx_usable(void)
-{
- if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
- if (boot_cpu_has(X86_FEATURE_AVX))
- pr_info("AVX detected but unusable.\n");
- return false;
- }
-
- return true;
-}
-
-static int register_sha256_avx(void)
-{
- if (avx_usable())
- return crypto_register_shashes(sha256_avx_algs,
- ARRAY_SIZE(sha256_avx_algs));
- return 0;
-}
-
-static void unregister_sha256_avx(void)
-{
- if (avx_usable())
- crypto_unregister_shashes(sha256_avx_algs,
- ARRAY_SIZE(sha256_avx_algs));
-}
-
-asmlinkage void sha256_transform_rorx(struct sha256_state *state,
- const u8 *data, int blocks);
-
-static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_transform_rorx);
-}
-
-static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_transform_rorx);
-}
-
-static int sha256_avx2_final(struct shash_desc *desc, u8 *out)
-{
- return sha256_avx2_finup(desc, NULL, 0, out);
-}
-
-static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_avx2_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_avx2_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_avx2_update,
- .final = sha256_avx2_final,
- .finup = sha256_avx2_finup,
- .digest = sha256_avx2_digest,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-avx2",
- .cra_priority = 170,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_avx2_update,
- .final = sha256_avx2_final,
- .finup = sha256_avx2_finup,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-avx2",
- .cra_priority = 170,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static bool avx2_usable(void)
-{
- if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) &&
- boot_cpu_has(X86_FEATURE_BMI2))
- return true;
-
- return false;
-}
-
-static int register_sha256_avx2(void)
-{
- if (avx2_usable())
- return crypto_register_shashes(sha256_avx2_algs,
- ARRAY_SIZE(sha256_avx2_algs));
- return 0;
-}
-
-static void unregister_sha256_avx2(void)
-{
- if (avx2_usable())
- crypto_unregister_shashes(sha256_avx2_algs,
- ARRAY_SIZE(sha256_avx2_algs));
-}
-
-#ifdef CONFIG_AS_SHA256_NI
-asmlinkage void sha256_ni_transform(struct sha256_state *digest,
- const u8 *data, int rounds);
-
-static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- return _sha256_update(desc, data, len, sha256_ni_transform);
-}
-
-static int sha256_ni_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_finup(desc, data, len, out, sha256_ni_transform);
-}
-
-static int sha256_ni_final(struct shash_desc *desc, u8 *out)
-{
- return sha256_ni_finup(desc, NULL, 0, out);
-}
-
-static int sha256_ni_digest(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- return sha256_base_init(desc) ?:
- sha256_ni_finup(desc, data, len, out);
-}
-
-static struct shash_alg sha256_ni_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = sha256_ni_update,
- .final = sha256_ni_final,
- .finup = sha256_ni_finup,
- .digest = sha256_ni_digest,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-ni",
- .cra_priority = 250,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = sha256_ni_update,
- .final = sha256_ni_final,
- .finup = sha256_ni_finup,
- .descsize = sizeof(struct sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-ni",
- .cra_priority = 250,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int register_sha256_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- return crypto_register_shashes(sha256_ni_algs,
- ARRAY_SIZE(sha256_ni_algs));
- return 0;
-}
-
-static void unregister_sha256_ni(void)
-{
- if (boot_cpu_has(X86_FEATURE_SHA_NI))
- crypto_unregister_shashes(sha256_ni_algs,
- ARRAY_SIZE(sha256_ni_algs));
-}
-
-#else
-static inline int register_sha256_ni(void) { return 0; }
-static inline void unregister_sha256_ni(void) { }
-#endif
-
-static int __init sha256_ssse3_mod_init(void)
-{
- if (!x86_match_cpu(module_cpu_ids))
- return -ENODEV;
-
- if (register_sha256_ssse3())
- goto fail;
-
- if (register_sha256_avx()) {
- unregister_sha256_ssse3();
- goto fail;
- }
-
- if (register_sha256_avx2()) {
- unregister_sha256_avx();
- unregister_sha256_ssse3();
- goto fail;
- }
-
- if (register_sha256_ni()) {
- unregister_sha256_avx2();
- unregister_sha256_avx();
- unregister_sha256_ssse3();
- goto fail;
- }
-
- return 0;
-fail:
- return -ENODEV;
-}
-
-static void __exit sha256_ssse3_mod_fini(void)
-{
- unregister_sha256_ni();
- unregister_sha256_avx2();
- unregister_sha256_avx();
- unregister_sha256_ssse3();
-}
-
-module_init(sha256_ssse3_mod_init);
-module_exit(sha256_ssse3_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
-
-MODULE_ALIAS_CRYPTO("sha256");
-MODULE_ALIAS_CRYPTO("sha256-ssse3");
-MODULE_ALIAS_CRYPTO("sha256-avx");
-MODULE_ALIAS_CRYPTO("sha256-avx2");
-MODULE_ALIAS_CRYPTO("sha224");
-MODULE_ALIAS_CRYPTO("sha224-ssse3");
-MODULE_ALIAS_CRYPTO("sha224-avx");
-MODULE_ALIAS_CRYPTO("sha224-avx2");
-#ifdef CONFIG_AS_SHA256_NI
-MODULE_ALIAS_CRYPTO("sha256-ni");
-MODULE_ALIAS_CRYPTO("sha224-ni");
-#endif
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 6d3b85e53d0e..067684c54395 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -27,17 +27,13 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
#include <crypto/internal/hash.h>
-#include <crypto/internal/simd.h>
-#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/types.h>
#include <crypto/sha2.h>
#include <crypto/sha512_base.h>
-#include <asm/cpu_device_id.h>
-#include <asm/simd.h>
asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
const u8 *data, int blocks);
@@ -45,11 +41,7 @@ asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
static int sha512_update(struct shash_desc *desc, const u8 *data,
unsigned int len, sha512_block_fn *sha512_xform)
{
- struct sha512_state *sctx = shash_desc_ctx(desc);
-
- if (!crypto_simd_usable() ||
- (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
- return crypto_sha512_update(desc, data, len);
+ int remain;
/*
* Make sure struct sha512_state begins directly with the SHA512
@@ -58,22 +50,17 @@ static int sha512_update(struct shash_desc *desc, const u8 *data,
BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
kernel_fpu_begin();
- sha512_base_do_update(desc, data, len, sha512_xform);
+ remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform);
kernel_fpu_end();
- return 0;
+ return remain;
}
static int sha512_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
{
- if (!crypto_simd_usable())
- return crypto_sha512_finup(desc, data, len, out);
-
kernel_fpu_begin();
- if (len)
- sha512_base_do_update(desc, data, len, sha512_xform);
- sha512_base_do_finalize(desc, sha512_xform);
+ sha512_base_do_finup(desc, data, len, sha512_xform);
kernel_fpu_end();
return sha512_base_finish(desc, out);
@@ -91,23 +78,18 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
}
-/* Add padding and return the message digest. */
-static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
-{
- return sha512_ssse3_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha512_ssse3_algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
.init = sha512_base_init,
.update = sha512_ssse3_update,
- .final = sha512_ssse3_final,
.finup = sha512_ssse3_finup,
- .descsize = sizeof(struct sha512_state),
+ .descsize = SHA512_STATE_SIZE,
.base = {
.cra_name = "sha512",
.cra_driver_name = "sha512-ssse3",
.cra_priority = 150,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -115,13 +97,14 @@ static struct shash_alg sha512_ssse3_algs[] = { {
.digestsize = SHA384_DIGEST_SIZE,
.init = sha384_base_init,
.update = sha512_ssse3_update,
- .final = sha512_ssse3_final,
.finup = sha512_ssse3_finup,
- .descsize = sizeof(struct sha512_state),
+ .descsize = SHA512_STATE_SIZE,
.base = {
.cra_name = "sha384",
.cra_driver_name = "sha384-ssse3",
.cra_priority = 150,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -167,23 +150,18 @@ static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
return sha512_finup(desc, data, len, out, sha512_transform_avx);
}
-/* Add padding and return the message digest. */
-static int sha512_avx_final(struct shash_desc *desc, u8 *out)
-{
- return sha512_avx_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha512_avx_algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
.init = sha512_base_init,
.update = sha512_avx_update,
- .final = sha512_avx_final,
.finup = sha512_avx_finup,
- .descsize = sizeof(struct sha512_state),
+ .descsize = SHA512_STATE_SIZE,
.base = {
.cra_name = "sha512",
.cra_driver_name = "sha512-avx",
.cra_priority = 160,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -191,13 +169,14 @@ static struct shash_alg sha512_avx_algs[] = { {
.digestsize = SHA384_DIGEST_SIZE,
.init = sha384_base_init,
.update = sha512_avx_update,
- .final = sha512_avx_final,
.finup = sha512_avx_finup,
- .descsize = sizeof(struct sha512_state),
+ .descsize = SHA512_STATE_SIZE,
.base = {
.cra_name = "sha384",
.cra_driver_name = "sha384-avx",
.cra_priority = 160,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -233,23 +212,18 @@ static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
return sha512_finup(desc, data, len, out, sha512_transform_rorx);
}
-/* Add padding and return the message digest. */
-static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
-{
- return sha512_avx2_finup(desc, NULL, 0, out);
-}
-
static struct shash_alg sha512_avx2_algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
.init = sha512_base_init,
.update = sha512_avx2_update,
- .final = sha512_avx2_final,
.finup = sha512_avx2_finup,
- .descsize = sizeof(struct sha512_state),
+ .descsize = SHA512_STATE_SIZE,
.base = {
.cra_name = "sha512",
.cra_driver_name = "sha512-avx2",
.cra_priority = 170,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -257,13 +231,14 @@ static struct shash_alg sha512_avx2_algs[] = { {
.digestsize = SHA384_DIGEST_SIZE,
.init = sha384_base_init,
.update = sha512_avx2_update,
- .final = sha512_avx2_final,
.finup = sha512_avx2_finup,
- .descsize = sizeof(struct sha512_state),
+ .descsize = SHA512_STATE_SIZE,
.base = {
.cra_name = "sha384",
.cra_driver_name = "sha384-avx2",
.cra_priority = 170,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c
index 661b6f22ffcd..6e8c42b9dc8e 100644
--- a/arch/x86/crypto/sm3_avx_glue.c
+++ b/arch/x86/crypto/sm3_avx_glue.c
@@ -10,12 +10,11 @@
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
#include <crypto/sm3.h>
#include <crypto/sm3_base.h>
-#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
asmlinkage void sm3_transform_avx(struct sm3_state *state,
const u8 *data, int nblocks);
@@ -23,13 +22,7 @@ asmlinkage void sm3_transform_avx(struct sm3_state *state,
static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- struct sm3_state *sctx = shash_desc_ctx(desc);
-
- if (!crypto_simd_usable() ||
- (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) {
- sm3_update(sctx, data, len);
- return 0;
- }
+ int remain;
/*
* Make sure struct sm3_state begins directly with the SM3
@@ -38,45 +31,17 @@ static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
kernel_fpu_begin();
- sm3_base_do_update(desc, data, len, sm3_transform_avx);
+ remain = sm3_base_do_update_blocks(desc, data, len, sm3_transform_avx);
kernel_fpu_end();
-
- return 0;
+ return remain;
}
static int sm3_avx_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- if (!crypto_simd_usable()) {
- struct sm3_state *sctx = shash_desc_ctx(desc);
-
- if (len)
- sm3_update(sctx, data, len);
-
- sm3_final(sctx, out);
- return 0;
- }
-
kernel_fpu_begin();
- if (len)
- sm3_base_do_update(desc, data, len, sm3_transform_avx);
- sm3_base_do_finalize(desc, sm3_transform_avx);
+ sm3_base_do_finup(desc, data, len, sm3_transform_avx);
kernel_fpu_end();
-
- return sm3_base_finish(desc, out);
-}
-
-static int sm3_avx_final(struct shash_desc *desc, u8 *out)
-{
- if (!crypto_simd_usable()) {
- sm3_final(shash_desc_ctx(desc), out);
- return 0;
- }
-
- kernel_fpu_begin();
- sm3_base_do_finalize(desc, sm3_transform_avx);
- kernel_fpu_end();
-
return sm3_base_finish(desc, out);
}
@@ -84,13 +49,14 @@ static struct shash_alg sm3_avx_alg = {
.digestsize = SM3_DIGEST_SIZE,
.init = sm3_base_init,
.update = sm3_avx_update,
- .final = sm3_avx_final,
.finup = sm3_avx_finup,
- .descsize = sizeof(struct sm3_state),
+ .descsize = SM3_STATE_SIZE,
.base = {
.cra_name = "sm3",
.cra_driver_name = "sm3-avx",
.cra_priority = 300,
+ .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
+ CRYPTO_AHASH_ALG_FINUP_MAX,
.cra_blocksize = SM3_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
index 1148fd4cd57f..fec0ab7a63dd 100644
--- a/arch/x86/crypto/sm4_aesni_avx2_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -8,11 +8,10 @@
* Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
+#include <asm/fpu/api.h>
#include <linux/module.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/sm4.h>
#include "sm4-avx.h"
@@ -48,10 +47,9 @@ static int ctr_crypt(struct skcipher_request *req)
static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
{
.base = {
- .cra_name = "__ecb(sm4)",
- .cra_driver_name = "__ecb-sm4-aesni-avx2",
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-aesni-avx2",
.cra_priority = 500,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_module = THIS_MODULE,
@@ -64,10 +62,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
.decrypt = sm4_avx_ecb_decrypt,
}, {
.base = {
- .cra_name = "__cbc(sm4)",
- .cra_driver_name = "__cbc-sm4-aesni-avx2",
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-aesni-avx2",
.cra_priority = 500,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_module = THIS_MODULE,
@@ -81,10 +78,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__ctr(sm4)",
- .cra_driver_name = "__ctr-sm4-aesni-avx2",
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-aesni-avx2",
.cra_priority = 500,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_module = THIS_MODULE,
@@ -100,9 +96,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
}
};
-static struct simd_skcipher_alg *
-simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)];
-
static int __init sm4_init(void)
{
const char *feature_name;
@@ -121,16 +114,14 @@ static int __init sm4_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers,
- ARRAY_SIZE(sm4_aesni_avx2_skciphers),
- simd_sm4_aesni_avx2_skciphers);
+ return crypto_register_skciphers(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers));
}
static void __exit sm4_exit(void)
{
- simd_unregister_skciphers(sm4_aesni_avx2_skciphers,
- ARRAY_SIZE(sm4_aesni_avx2_skciphers),
- simd_sm4_aesni_avx2_skciphers);
+ crypto_unregister_skciphers(sm4_aesni_avx2_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx2_skciphers));
}
module_init(sm4_init);
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 85b4ca78b47b..72867fc49ce8 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -8,11 +8,10 @@
* Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/
+#include <asm/fpu/api.h>
#include <linux/module.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/sm4.h>
#include "sm4-avx.h"
@@ -263,10 +262,9 @@ static int ctr_crypt(struct skcipher_request *req)
static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
{
.base = {
- .cra_name = "__ecb(sm4)",
- .cra_driver_name = "__ecb-sm4-aesni-avx",
+ .cra_name = "ecb(sm4)",
+ .cra_driver_name = "ecb-sm4-aesni-avx",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_module = THIS_MODULE,
@@ -279,10 +277,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
.decrypt = sm4_avx_ecb_decrypt,
}, {
.base = {
- .cra_name = "__cbc(sm4)",
- .cra_driver_name = "__cbc-sm4-aesni-avx",
+ .cra_name = "cbc(sm4)",
+ .cra_driver_name = "cbc-sm4-aesni-avx",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_module = THIS_MODULE,
@@ -296,10 +293,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
.decrypt = cbc_decrypt,
}, {
.base = {
- .cra_name = "__ctr(sm4)",
- .cra_driver_name = "__ctr-sm4-aesni-avx",
+ .cra_name = "ctr(sm4)",
+ .cra_driver_name = "ctr-sm4-aesni-avx",
.cra_priority = 400,
- .cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct sm4_ctx),
.cra_module = THIS_MODULE,
@@ -315,9 +311,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
}
};
-static struct simd_skcipher_alg *
-simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)];
-
static int __init sm4_init(void)
{
const char *feature_name;
@@ -335,16 +328,14 @@ static int __init sm4_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(sm4_aesni_avx_skciphers,
- ARRAY_SIZE(sm4_aesni_avx_skciphers),
- simd_sm4_aesni_avx_skciphers);
+ return crypto_register_skciphers(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers));
}
static void __exit sm4_exit(void)
{
- simd_unregister_skciphers(sm4_aesni_avx_skciphers,
- ARRAY_SIZE(sm4_aesni_avx_skciphers),
- simd_sm4_aesni_avx_skciphers);
+ crypto_unregister_skciphers(sm4_aesni_avx_skciphers,
+ ARRAY_SIZE(sm4_aesni_avx_skciphers));
}
module_init(sm4_init);
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 3eb3440b477a..9e20db013750 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -13,7 +13,6 @@
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <crypto/twofish.h>
#include "twofish.h"
@@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req)
static struct skcipher_alg twofish_algs[] = {
{
- .base.cra_name = "__ecb(twofish)",
- .base.cra_driver_name = "__ecb-twofish-avx",
+ .base.cra_name = "ecb(twofish)",
+ .base.cra_driver_name = "ecb-twofish-avx",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = TF_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct twofish_ctx),
.base.cra_module = THIS_MODULE,
@@ -87,10 +85,9 @@ static struct skcipher_alg twofish_algs[] = {
.encrypt = ecb_encrypt,
.decrypt = ecb_decrypt,
}, {
- .base.cra_name = "__cbc(twofish)",
- .base.cra_driver_name = "__cbc-twofish-avx",
+ .base.cra_name = "cbc(twofish)",
+ .base.cra_driver_name = "cbc-twofish-avx",
.base.cra_priority = 400,
- .base.cra_flags = CRYPTO_ALG_INTERNAL,
.base.cra_blocksize = TF_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct twofish_ctx),
.base.cra_module = THIS_MODULE,
@@ -103,8 +100,6 @@ static struct skcipher_alg twofish_algs[] = {
},
};
-static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)];
-
static int __init twofish_init(void)
{
const char *feature_name;
@@ -114,15 +109,13 @@ static int __init twofish_init(void)
return -ENODEV;
}
- return simd_register_skciphers_compat(twofish_algs,
- ARRAY_SIZE(twofish_algs),
- twofish_simd_algs);
+ return crypto_register_skciphers(twofish_algs,
+ ARRAY_SIZE(twofish_algs));
}
static void __exit twofish_exit(void)
{
- simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs),
- twofish_simd_algs);
+ crypto_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs));
}
module_init(twofish_init);