Diffstat (limited to 'arch/x86/crypto')
38 files changed, 435 insertions, 11316 deletions
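The dominant pattern in the diff that follows is the removal of the crypto SIMD-wrapper indirection: the x86 glue code drops the "__"-prefixed internal algorithm names and the CRYPTO_ALG_INTERNAL flag, stops selecting CRYPTO_SIMD in Kconfig, and registers its algorithms directly with crypto_register_skciphers()/crypto_register_aeads()/crypto_register_aead() instead of going through simd_register_*_compat(). For orientation, here is a minimal sketch of the new-style glue; the names (example_*, "ecb-aes-example") and the placeholder sizes are illustrative only, not symbols from this patch, and the no-op setkey/crypt stubs exist solely to keep the sketch self-contained.

#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>

/* Placeholder ops: a real driver would do the work inside
 * kernel_fpu_begin()/kernel_fpu_end() sections. */
static int example_setkey(struct crypto_skcipher *tfm, const u8 *key,
			  unsigned int keylen)
{
	return 0;
}

static int example_crypt(struct skcipher_request *req)
{
	return 0;
}

static struct skcipher_alg example_algs[] = { {
	.base.cra_name		= "ecb(aes)",		/* no "__" prefix */
	.base.cra_driver_name	= "ecb-aes-example",	/* placeholder driver name */
	.base.cra_priority	= 400,
	/* no .cra_flags = CRYPTO_ALG_INTERNAL needed anymore */
	.base.cra_blocksize	= 16,			/* placeholder sizes */
	.base.cra_ctxsize	= 16,
	.base.cra_module	= THIS_MODULE,
	.min_keysize		= 16,
	.max_keysize		= 32,
	.setkey			= example_setkey,
	.encrypt		= example_crypt,
	.decrypt		= example_crypt,
} };

static int __init example_init(void)
{
	/* Old pattern: simd_register_skciphers_compat(algs, n, &simd_ptrs) */
	return crypto_register_skciphers(example_algs, ARRAY_SIZE(example_algs));
}

static void __exit example_exit(void)
{
	crypto_unregister_skciphers(example_algs, ARRAY_SIZE(example_algs));
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The teardown keeps the same shape, calling crypto_unregister_skciphers()/crypto_unregister_aeads() on the same arrays, as the aegis128, aesni, aria and camellia glue conversions below show.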
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 3d948f10c94c..56cfdc79e2c6 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -4,7 +4,7 @@ menu "Accelerated Cryptographic Algorithms for CPU (x86)" config CRYPTO_CURVE25519_X86 tristate - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_KPP select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 @@ -17,13 +17,11 @@ config CRYPTO_CURVE25519_X86 config CRYPTO_AES_NI_INTEL tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XCTR, XTS, GCM (AES-NI/VAES)" - depends on X86 select CRYPTO_AEAD select CRYPTO_LIB_AES select CRYPTO_LIB_GF128MUL select CRYPTO_ALGAPI select CRYPTO_SKCIPHER - select CRYPTO_SIMD help Block cipher: AES cipher algorithms AEAD cipher: AES with GCM @@ -38,7 +36,7 @@ config CRYPTO_AES_NI_INTEL config CRYPTO_BLOWFISH_X86_64 tristate "Ciphers: Blowfish, modes: ECB, CBC" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_BLOWFISH_COMMON imply CRYPTO_CTR @@ -50,7 +48,7 @@ config CRYPTO_BLOWFISH_X86_64 config CRYPTO_CAMELLIA_X86_64 tristate "Ciphers: Camellia with modes: ECB, CBC" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER imply CRYPTO_CTR help @@ -61,10 +59,9 @@ config CRYPTO_CAMELLIA_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_CAMELLIA_X86_64 - select CRYPTO_SIMD imply CRYPTO_XTS help Length-preserving ciphers: Camellia with ECB and CBC modes @@ -75,7 +72,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 tristate "Ciphers: Camellia with modes: ECB, CBC (AES-NI/AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 help Length-preserving ciphers: Camellia with ECB and CBC modes @@ -86,11 +83,10 @@ config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 config CRYPTO_CAST5_AVX_X86_64 tristate "Ciphers: CAST5 with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_CAST5 select CRYPTO_CAST_COMMON - select CRYPTO_SIMD imply CRYPTO_CTR help Length-preserving ciphers: CAST5 (CAST-128) cipher algorithm @@ -103,11 +99,10 @@ config CRYPTO_CAST5_AVX_X86_64 config CRYPTO_CAST6_AVX_X86_64 tristate "Ciphers: CAST6 with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_CAST6 select CRYPTO_CAST_COMMON - select CRYPTO_SIMD imply CRYPTO_XTS imply CRYPTO_CTR help @@ -121,7 +116,7 @@ config CRYPTO_CAST6_AVX_X86_64 config CRYPTO_DES3_EDE_X86_64 tristate "Ciphers: Triple DES EDE with modes: ECB, CBC" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_LIB_DES imply CRYPTO_CTR @@ -135,10 +130,9 @@ config CRYPTO_DES3_EDE_X86_64 config CRYPTO_SERPENT_SSE2_X86_64 tristate "Ciphers: Serpent with modes: ECB, CBC (SSE2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_SERPENT - select CRYPTO_SIMD imply CRYPTO_CTR help Length-preserving ciphers: Serpent cipher algorithm @@ -151,10 +145,9 @@ config CRYPTO_SERPENT_SSE2_X86_64 config CRYPTO_SERPENT_SSE2_586 tristate "Ciphers: Serpent with modes: ECB, CBC (32-bit with SSE2)" - depends on X86 && !64BIT + depends on !64BIT select CRYPTO_SKCIPHER select CRYPTO_SERPENT - select CRYPTO_SIMD imply CRYPTO_CTR help Length-preserving ciphers: Serpent cipher algorithm @@ -167,10 +160,9 @@ config CRYPTO_SERPENT_SSE2_586 config 
CRYPTO_SERPENT_AVX_X86_64 tristate "Ciphers: Serpent with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_SERPENT - select CRYPTO_SIMD imply CRYPTO_XTS imply CRYPTO_CTR help @@ -184,7 +176,7 @@ config CRYPTO_SERPENT_AVX_X86_64 config CRYPTO_SERPENT_AVX2_X86_64 tristate "Ciphers: Serpent with modes: ECB, CBC (AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SERPENT_AVX_X86_64 help Length-preserving ciphers: Serpent cipher algorithm @@ -197,9 +189,8 @@ config CRYPTO_SERPENT_AVX2_X86_64 config CRYPTO_SM4_AESNI_AVX_X86_64 tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_SM4 help @@ -218,9 +209,8 @@ config CRYPTO_SM4_AESNI_AVX_X86_64 config CRYPTO_SM4_AESNI_AVX2_X86_64 tristate "Ciphers: SM4 with modes: ECB, CBC, CTR (AES-NI/AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_SM4 select CRYPTO_SM4_AESNI_AVX_X86_64 @@ -240,7 +230,7 @@ config CRYPTO_SM4_AESNI_AVX2_X86_64 config CRYPTO_TWOFISH_586 tristate "Ciphers: Twofish (32-bit)" - depends on (X86 || UML_X86) && !64BIT + depends on !64BIT select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -251,7 +241,7 @@ config CRYPTO_TWOFISH_586 config CRYPTO_TWOFISH_X86_64 tristate "Ciphers: Twofish" - depends on (X86 || UML_X86) && 64BIT + depends on 64BIT select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -262,7 +252,7 @@ config CRYPTO_TWOFISH_X86_64 config CRYPTO_TWOFISH_X86_64_3WAY tristate "Ciphers: Twofish with modes: ECB, CBC (3-way parallel)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 @@ -277,9 +267,8 @@ config CRYPTO_TWOFISH_X86_64_3WAY config CRYPTO_TWOFISH_AVX_X86_64 tristate "Ciphers: Twofish with modes: ECB, CBC (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 select CRYPTO_TWOFISH_X86_64_3WAY @@ -295,9 +284,8 @@ config CRYPTO_TWOFISH_AVX_X86_64 config CRYPTO_ARIA_AESNI_AVX_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX/GFNI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_ARIA help @@ -313,9 +301,8 @@ config CRYPTO_ARIA_AESNI_AVX_X86_64 config CRYPTO_ARIA_AESNI_AVX2_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AES-NI/AVX2/GFNI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_ARIA select CRYPTO_ARIA_AESNI_AVX_X86_64 @@ -332,9 +319,8 @@ config CRYPTO_ARIA_AESNI_AVX2_X86_64 config CRYPTO_ARIA_GFNI_AVX512_X86_64 tristate "Ciphers: ARIA with modes: ECB, CTR (AVX512/GFNI)" - depends on X86 && 64BIT && AS_AVX512 && AS_GFNI + depends on 64BIT && AS_GFNI select CRYPTO_SKCIPHER - select CRYPTO_SIMD select CRYPTO_ALGAPI select CRYPTO_ARIA select CRYPTO_ARIA_AESNI_AVX_X86_64 @@ -349,27 +335,10 @@ config CRYPTO_ARIA_GFNI_AVX512_X86_64 Processes 64 blocks in parallel. 
-config CRYPTO_CHACHA20_X86_64 - tristate - depends on X86 && 64BIT - select CRYPTO_SKCIPHER - select CRYPTO_LIB_CHACHA_GENERIC - select CRYPTO_ARCH_HAVE_LIB_CHACHA - default CRYPTO_LIB_CHACHA_INTERNAL - help - Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12 - stream cipher algorithms - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX2 (Advanced Vector Extensions 2) - - AVX-512VL (Advanced Vector Extensions-512VL) - config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_AEAD - select CRYPTO_SIMD help AEGIS-128 AEAD algorithm @@ -379,7 +348,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 config CRYPTO_NHPOLY1305_SSE2 tristate "Hash functions: NHPoly1305 (SSE2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_NHPOLY1305 help NHPoly1305 hash function for Adiantum @@ -389,7 +358,7 @@ config CRYPTO_NHPOLY1305_SSE2 config CRYPTO_NHPOLY1305_AVX2 tristate "Hash functions: NHPoly1305 (AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_NHPOLY1305 help NHPoly1305 hash function for Adiantum @@ -397,21 +366,9 @@ config CRYPTO_NHPOLY1305_AVX2 Architecture: x86_64 using: - AVX2 (Advanced Vector Extensions 2) -config CRYPTO_BLAKE2S_X86 - bool "Hash functions: BLAKE2s (SSSE3/AVX-512)" - depends on X86 && 64BIT - select CRYPTO_LIB_BLAKE2S_GENERIC - select CRYPTO_ARCH_HAVE_LIB_BLAKE2S - help - BLAKE2s cryptographic hash function (RFC 7693) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX-512 (Advanced Vector Extensions-512) - config CRYPTO_POLYVAL_CLMUL_NI tristate "Hash functions: POLYVAL (CLMUL-NI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_POLYVAL help POLYVAL hash function for HCTR2 @@ -419,23 +376,9 @@ config CRYPTO_POLYVAL_CLMUL_NI Architecture: x86_64 using: - CLMUL-NI (carry-less multiplication new instructions) -config CRYPTO_POLY1305_X86_64 - tristate - depends on X86 && 64BIT - select CRYPTO_HASH - select CRYPTO_LIB_POLY1305_GENERIC - select CRYPTO_ARCH_HAVE_LIB_POLY1305 - default CRYPTO_LIB_POLY1305_INTERNAL - help - Poly1305 authenticator algorithm (RFC7539) - - Architecture: x86_64 using: - - SSE2 (Streaming SIMD Extensions 2) - - AVX2 (Advanced Vector Extensions 2) - config CRYPTO_SHA1_SSSE3 tristate "Hash functions: SHA-1 (SSSE3/AVX/AVX2/SHA-NI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SHA1 select CRYPTO_HASH help @@ -447,23 +390,9 @@ config CRYPTO_SHA1_SSSE3 - AVX2 (Advanced Vector Extensions 2) - SHA-NI (SHA Extensions New Instructions) -config CRYPTO_SHA256_SSSE3 - tristate "Hash functions: SHA-224 and SHA-256 (SSSE3/AVX/AVX2/SHA-NI)" - depends on X86 && 64BIT - select CRYPTO_SHA256 - select CRYPTO_HASH - help - SHA-224 and SHA-256 secure hash algorithms (FIPS 180) - - Architecture: x86_64 using: - - SSSE3 (Supplemental SSE3) - - AVX (Advanced Vector Extensions) - - AVX2 (Advanced Vector Extensions 2) - - SHA-NI (SHA Extensions New Instructions) - config CRYPTO_SHA512_SSSE3 tristate "Hash functions: SHA-384 and SHA-512 (SSSE3/AVX/AVX2)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_SHA512 select CRYPTO_HASH help @@ -476,9 +405,9 @@ config CRYPTO_SHA512_SSSE3 config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_HASH - select CRYPTO_SM3 + select CRYPTO_LIB_SM3 help SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3 @@ -489,7 +418,7 @@ config CRYPTO_SM3_AVX_X86_64 config CRYPTO_GHASH_CLMUL_NI_INTEL 
tristate "Hash functions: GHASH (CLMUL-NI)" - depends on X86 && 64BIT + depends on 64BIT select CRYPTO_CRYPTD help GCM GHASH hash function (NIST SP800-38D) diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 5d19f41bde58..aa289a9e0153 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -42,10 +42,6 @@ cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o -chacha-x86_64-y := chacha-avx2-x86_64.o chacha-ssse3-x86_64.o chacha_glue.o -chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o - obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ @@ -56,29 +52,17 @@ aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o endif obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o -sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o -sha1-ssse3-$(CONFIG_AS_SHA1_NI) += sha1_ni_asm.o - -obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o -sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o -sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o +sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ni_asm.o sha1_ssse3_glue.o obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o -obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o -libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o - obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o -obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o -poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o -targets += poly1305-x86_64-cryptogams.S - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o @@ -104,10 +88,5 @@ aria-aesni-avx2-x86_64-y := aria-aesni-avx2-asm_64.o aria_aesni_avx2_glue.o obj-$(CONFIG_CRYPTO_ARIA_GFNI_AVX512_X86_64) += aria-gfni-avx512-x86_64.o aria-gfni-avx512-x86_64-y := aria-gfni-avx512-asm_64.o aria_gfni_avx512_glue.o -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $< > $@ -$(obj)/%.S: $(src)/%.pl FORCE - $(call if_changed,perlasm) - # Disable GCOV in odd or sensitive code GCOV_PROFILE_curve25519-x86_64.o := n diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 26786e15abac..f1b6d40154e3 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -8,7 +8,6 @@ */ #include <crypto/internal/aead.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> @@ -233,21 +232,18 @@ static struct aead_alg crypto_aegis128_aesni_alg = { .chunksize = AEGIS128_BLOCK_SIZE, .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aegis_ctx) + __alignof__(struct aegis_ctx), .cra_priority = 400, - .cra_name = "__aegis128", - .cra_driver_name = "__aegis128-aesni", + .cra_name = "aegis128", + .cra_driver_name = 
"aegis128-aesni", .cra_module = THIS_MODULE, } }; -static struct simd_aead_alg *simd_alg; - static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM4_1) || @@ -255,13 +251,12 @@ static int __init crypto_aegis128_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, - &simd_alg); + return crypto_register_aead(&crypto_aegis128_aesni_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { - simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); + crypto_unregister_aead(&crypto_aegis128_aesni_alg); } module_init(crypto_aegis128_aesni_module_init); diff --git a/arch/x86/crypto/aes-ctr-avx-x86_64.S b/arch/x86/crypto/aes-ctr-avx-x86_64.S index 1685d8b24b2c..bbbfd80f5a50 100644 --- a/arch/x86/crypto/aes-ctr-avx-x86_64.S +++ b/arch/x86/crypto/aes-ctr-avx-x86_64.S @@ -48,8 +48,7 @@ // using the following sets of CPU features: // - AES-NI && AVX // - VAES && AVX2 -// - VAES && (AVX10/256 || (AVX512BW && AVX512VL)) && BMI2 -// - VAES && (AVX10/512 || (AVX512BW && AVX512VL)) && BMI2 +// - VAES && AVX512BW && AVX512VL && BMI2 // // See the function definitions at the bottom of the file for more information. @@ -76,7 +75,6 @@ .text // Move a vector between memory and a register. -// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst @@ -86,7 +84,6 @@ .endm // Move a vector between registers. -// The registers must be in the first 16 vector registers. .macro _vmovdqa src, dst .if VL < 64 vmovdqa \src, \dst @@ -96,7 +93,7 @@ .endm // Broadcast a 128-bit value from memory to all 128-bit lanes of a vector -// register. The register operand must be in the first 16 vector registers. +// register. .macro _vbroadcast128 src, dst .if VL == 16 vmovdqu \src, \dst @@ -108,7 +105,6 @@ .endm // XOR two vectors together. -// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst .if VL < 64 vpxor \src1, \src2, \dst @@ -199,8 +195,8 @@ // XOR each with the zero-th round key. Also update LE_CTR if !\final. .macro _prepare_2_ctr_vecs is_xctr, i0, i1, final=0 .if \is_xctr - .if USE_AVX10 - _vmovdqa LE_CTR, AESDATA\i0 + .if USE_AVX512 + vmovdqa64 LE_CTR, AESDATA\i0 vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i0 .else vpxor XCTR_IV, LE_CTR, AESDATA\i0 @@ -208,7 +204,7 @@ .endif vpaddq LE_CTR_INC1, LE_CTR, AESDATA\i1 - .if USE_AVX10 + .if USE_AVX512 vpternlogd $0x96, XCTR_IV, RNDKEY0, AESDATA\i1 .else vpxor XCTR_IV, AESDATA\i1, AESDATA\i1 @@ -481,18 +477,12 @@ .Lxor_tail_partial_vec_0\@: // XOR the remaining 1 <= LEN < VL bytes. It's easy if masked // loads/stores are available; otherwise it's a bit harder... -.if USE_AVX10 - .if VL <= 32 - mov $-1, %eax - bzhi LEN, %eax, %eax - kmovd %eax, %k1 - .else +.if USE_AVX512 mov $-1, %rax bzhi LEN64, %rax, %rax kmovq %rax, %k1 - .endif vmovdqu8 (SRC), AESDATA1{%k1}{z} - _vpxor AESDATA1, AESDATA0, AESDATA0 + vpxord AESDATA1, AESDATA0, AESDATA0 vmovdqu8 AESDATA0, (DST){%k1} .else .if VL == 32 @@ -554,7 +544,7 @@ // eliminates carries. |ctr| is the per-message block counter starting at 1. 
.set VL, 16 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_ctr64_crypt_aesni_avx) _aes_ctr_crypt 0 SYM_FUNC_END(aes_ctr64_crypt_aesni_avx) @@ -564,7 +554,7 @@ SYM_FUNC_END(aes_xctr_crypt_aesni_avx) #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) .set VL, 32 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx2) _aes_ctr_crypt 0 SYM_FUNC_END(aes_ctr64_crypt_vaes_avx2) @@ -572,21 +562,12 @@ SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx2) _aes_ctr_crypt 1 SYM_FUNC_END(aes_xctr_crypt_vaes_avx2) -.set VL, 32 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_256) - _aes_ctr_crypt 0 -SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_256) -SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_256) - _aes_ctr_crypt 1 -SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_256) - .set VL, 64 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx10_512) +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_ctr64_crypt_vaes_avx512) _aes_ctr_crypt 0 -SYM_FUNC_END(aes_ctr64_crypt_vaes_avx10_512) -SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx10_512) +SYM_FUNC_END(aes_ctr64_crypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xctr_crypt_vaes_avx512) _aes_ctr_crypt 1 -SYM_FUNC_END(aes_xctr_crypt_vaes_avx10_512) +SYM_FUNC_END(aes_xctr_crypt_vaes_avx512) #endif // CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S index 93ba0ddbe009..db79cdf81588 100644 --- a/arch/x86/crypto/aes-xts-avx-x86_64.S +++ b/arch/x86/crypto/aes-xts-avx-x86_64.S @@ -52,32 +52,25 @@ * different code, it uses a macro to generate several implementations that * share similar source code but are targeted at different CPUs, listed below: * - * AES-NI + AVX + * AES-NI && AVX * - 128-bit vectors (1 AES block per vector) * - VEX-coded instructions * - xmm0-xmm15 * - This is for older CPUs that lack VAES but do have AVX. * - * VAES + VPCLMULQDQ + AVX2 + * VAES && VPCLMULQDQ && AVX2 * - 256-bit vectors (2 AES blocks per vector) * - VEX-coded instructions * - ymm0-ymm15 - * - This is for CPUs that have VAES but lack AVX512 or AVX10, - * e.g. Intel's Alder Lake and AMD's Zen 3. + * - This is for CPUs that have VAES but either lack AVX512 (e.g. Intel's + * Alder Lake and AMD's Zen 3) or downclock too eagerly when using zmm + * registers (e.g. Intel's Ice Lake). * - * VAES + VPCLMULQDQ + AVX10/256 + BMI2 - * - 256-bit vectors (2 AES blocks per vector) + * VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 + * - 512-bit vectors (4 AES blocks per vector) * - EVEX-coded instructions - * - ymm0-ymm31 - * - This is for CPUs that have AVX512 but where using zmm registers causes - * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. - * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. - * To avoid confusion with 512-bit, we just write AVX10/256. - * - * VAES + VPCLMULQDQ + AVX10/512 + BMI2 - * - Same as the previous one, but upgrades to 512-bit vectors - * (4 AES blocks per vector) in zmm0-zmm31. - * - This is for CPUs that have good AVX512 or AVX10/512 support. + * - zmm0-zmm31 + * - This is for CPUs that have good AVX512 support. * * This file doesn't have an implementation for AES-NI alone (without AVX), as * the lack of VEX would make all the assembly code different. @@ -107,9 +100,20 @@ // exists when there's a carry out of the low 64 bits of the tweak. 
.quad 0x87, 1 + // These are the shift amounts that are needed when multiplying by [x^0, + // x^1, x^2, x^3] to compute the first vector of tweaks when VL=64. + // + // The right shifts by 64 are expected to zeroize the destination. + // 'vpsrlvq' is indeed defined to do that; i.e. it doesn't truncate the + // amount to 64 & 63 = 0 like the 'shr' scalar shift instruction would. +.Lrshift_amounts: + .byte 64, 64, 63, 63, 62, 62, 61, 61 +.Llshift_amounts: + .byte 0, 0, 1, 1, 2, 2, 3, 3 + // This table contains constants for vpshufb and vpblendvb, used to // handle variable byte shifts and blending during ciphertext stealing - // on CPUs that don't support AVX10-style masking. + // on CPUs that don't support AVX512-style masking. .Lcts_permute_table: .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 @@ -138,7 +142,7 @@ .irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 _define_Vi \i .endr -.if USE_AVX10 +.if USE_AVX512 .irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 _define_Vi \i .endr @@ -193,7 +197,7 @@ // keys to the *end* of this register range. I.e., AES-128 uses // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14. // (All also use KEY0 for the XOR-only "round" at the beginning.) -.if USE_AVX10 +.if USE_AVX512 .set KEY1_XMM, %xmm16 .set KEY1, V16 .set KEY2_XMM, %xmm17 @@ -227,7 +231,6 @@ .endm // Move a vector between memory and a register. -// The register operand must be in the first 16 vector registers. .macro _vmovdqu src, dst .if VL < 64 vmovdqu \src, \dst @@ -238,9 +241,9 @@ // Broadcast a 128-bit value into a vector. .macro _vbroadcast128 src, dst -.if VL == 16 && !USE_AVX10 +.if VL == 16 vmovdqu \src, \dst -.elseif VL == 32 && !USE_AVX10 +.elseif VL == 32 vbroadcasti128 \src, \dst .else vbroadcasti32x4 \src, \dst @@ -248,7 +251,6 @@ .endm // XOR two vectors together. -// Any register operands must be in the first 16 vector registers. .macro _vpxor src1, src2, dst .if VL < 64 vpxor \src1, \src2, \dst @@ -259,7 +261,7 @@ // XOR three vectors together. .macro _xor3 src1, src2, src3_and_dst -.if USE_AVX10 +.if USE_AVX512 // vpternlogd with immediate 0x96 is a three-argument XOR. vpternlogd $0x96, \src1, \src2, \src3_and_dst .else @@ -274,7 +276,7 @@ vpshufd $0x13, \src, \tmp vpaddq \src, \src, \dst vpsrad $31, \tmp, \tmp -.if USE_AVX10 +.if USE_AVX512 vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst .else vpand GF_POLY_XMM, \tmp, \tmp @@ -303,52 +305,75 @@ // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and // store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. .macro _compute_first_set_of_tweaks - vmovdqu (TWEAK), TWEAK0_XMM - _vbroadcast128 .Lgf_poly(%rip), GF_POLY .if VL == 16 - // With VL=16, multiplying by x serially is fastest. + vmovdqu (TWEAK), TWEAK0_XMM + vmovdqu .Lgf_poly(%rip), GF_POLY _next_tweak TWEAK0, %xmm0, TWEAK1 _next_tweak TWEAK1, %xmm0, TWEAK2 _next_tweak TWEAK2, %xmm0, TWEAK3 -.else -.if VL == 32 - // Compute the second block of TWEAK0. +.elseif VL == 32 + vmovdqu (TWEAK), TWEAK0_XMM + vbroadcasti128 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks. _next_tweak TWEAK0_XMM, %xmm0, %xmm1 vinserti128 $1, %xmm1, TWEAK0, TWEAK0 -.elseif VL == 64 - // Compute the remaining blocks of TWEAK0. 
- _next_tweak TWEAK0_XMM, %xmm0, %xmm1 - _next_tweak %xmm1, %xmm0, %xmm2 - _next_tweak %xmm2, %xmm0, %xmm3 - vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 - vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 - vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 -.endif - // Compute TWEAK[1-3] from TWEAK0. - vpsrlq $64 - 1*VL/16, TWEAK0, V0 - vpsrlq $64 - 2*VL/16, TWEAK0, V2 - vpsrlq $64 - 3*VL/16, TWEAK0, V4 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^2, x^2] + // TWEAK2 = TWEAK0 * [x^4, x^4] + // TWEAK3 = TWEAK0 * [x^6, x^6] + vpsrlq $64 - 2, TWEAK0, V0 + vpsrlq $64 - 4, TWEAK0, V2 + vpsrlq $64 - 6, TWEAK0, V4 vpclmulqdq $0x01, GF_POLY, V0, V1 vpclmulqdq $0x01, GF_POLY, V2, V3 vpclmulqdq $0x01, GF_POLY, V4, V5 vpslldq $8, V0, V0 vpslldq $8, V2, V2 vpslldq $8, V4, V4 - vpsllq $1*VL/16, TWEAK0, TWEAK1 - vpsllq $2*VL/16, TWEAK0, TWEAK2 - vpsllq $3*VL/16, TWEAK0, TWEAK3 -.if USE_AVX10 - vpternlogd $0x96, V0, V1, TWEAK1 - vpternlogd $0x96, V2, V3, TWEAK2 - vpternlogd $0x96, V4, V5, TWEAK3 -.else + vpsllq $2, TWEAK0, TWEAK1 + vpsllq $4, TWEAK0, TWEAK2 + vpsllq $6, TWEAK0, TWEAK3 vpxor V0, TWEAK1, TWEAK1 vpxor V2, TWEAK2, TWEAK2 vpxor V4, TWEAK3, TWEAK3 vpxor V1, TWEAK1, TWEAK1 vpxor V3, TWEAK2, TWEAK2 vpxor V5, TWEAK3, TWEAK3 -.endif +.else + vbroadcasti32x4 (TWEAK), TWEAK0 + vbroadcasti32x4 .Lgf_poly(%rip), GF_POLY + + // Compute the first vector of tweaks: + // TWEAK0 = broadcast128(TWEAK) * [x^0, x^1, x^2, x^3] + vpmovzxbq .Lrshift_amounts(%rip), V4 + vpsrlvq V4, TWEAK0, V0 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpmovzxbq .Llshift_amounts(%rip), V4 + vpslldq $8, V0, V0 + vpsllvq V4, TWEAK0, TWEAK0 + vpternlogd $0x96, V0, V1, TWEAK0 + + // Compute the next three vectors of tweaks: + // TWEAK1 = TWEAK0 * [x^4, x^4, x^4, x^4] + // TWEAK2 = TWEAK0 * [x^8, x^8, x^8, x^8] + // TWEAK3 = TWEAK0 * [x^12, x^12, x^12, x^12] + // x^8 only needs byte-aligned shifts, so optimize accordingly. + vpsrlq $64 - 4, TWEAK0, V0 + vpsrldq $(64 - 8) / 8, TWEAK0, V2 + vpsrlq $64 - 12, TWEAK0, V4 + vpclmulqdq $0x01, GF_POLY, V0, V1 + vpclmulqdq $0x01, GF_POLY, V2, V3 + vpclmulqdq $0x01, GF_POLY, V4, V5 + vpslldq $8, V0, V0 + vpslldq $8, V4, V4 + vpsllq $4, TWEAK0, TWEAK1 + vpslldq $8 / 8, TWEAK0, TWEAK2 + vpsllq $12, TWEAK0, TWEAK3 + vpternlogd $0x96, V0, V1, TWEAK1 + vpxord V3, TWEAK2, TWEAK2 + vpternlogd $0x96, V4, V5, TWEAK3 .endif .endm @@ -474,26 +499,26 @@ lea OFFS-16(KEY, KEYLEN64, 4), KEY // If all 32 SIMD registers are available, cache all the round keys. 
-.if USE_AVX10 +.if USE_AVX512 cmp $24, KEYLEN jl .Laes128\@ je .Laes192\@ - _vbroadcast128 -6*16(KEY), KEY1 - _vbroadcast128 -5*16(KEY), KEY2 + vbroadcasti32x4 -6*16(KEY), KEY1 + vbroadcasti32x4 -5*16(KEY), KEY2 .Laes192\@: - _vbroadcast128 -4*16(KEY), KEY3 - _vbroadcast128 -3*16(KEY), KEY4 + vbroadcasti32x4 -4*16(KEY), KEY3 + vbroadcasti32x4 -3*16(KEY), KEY4 .Laes128\@: - _vbroadcast128 -2*16(KEY), KEY5 - _vbroadcast128 -1*16(KEY), KEY6 - _vbroadcast128 0*16(KEY), KEY7 - _vbroadcast128 1*16(KEY), KEY8 - _vbroadcast128 2*16(KEY), KEY9 - _vbroadcast128 3*16(KEY), KEY10 - _vbroadcast128 4*16(KEY), KEY11 - _vbroadcast128 5*16(KEY), KEY12 - _vbroadcast128 6*16(KEY), KEY13 - _vbroadcast128 7*16(KEY), KEY14 + vbroadcasti32x4 -2*16(KEY), KEY5 + vbroadcasti32x4 -1*16(KEY), KEY6 + vbroadcasti32x4 0*16(KEY), KEY7 + vbroadcasti32x4 1*16(KEY), KEY8 + vbroadcasti32x4 2*16(KEY), KEY9 + vbroadcasti32x4 3*16(KEY), KEY10 + vbroadcasti32x4 4*16(KEY), KEY11 + vbroadcasti32x4 5*16(KEY), KEY12 + vbroadcasti32x4 6*16(KEY), KEY13 + vbroadcasti32x4 7*16(KEY), KEY14 .endif .endm @@ -521,7 +546,7 @@ // using the same key for all block(s). The round key is loaded from the // appropriate register or memory location for round \i. May clobber \tmp. .macro _vaes_1x enc, i, xmm_suffix, data, tmp -.if USE_AVX10 +.if USE_AVX512 _vaes \enc, KEY\i\xmm_suffix, \data .else .ifnb \xmm_suffix @@ -538,7 +563,7 @@ // appropriate register or memory location for round \i. In addition, does two // steps of the computation of the next set of tweaks. May clobber V4 and V5. .macro _vaes_4x enc, i -.if USE_AVX10 +.if USE_AVX512 _tweak_step (2*(\i-5)) _vaes \enc, KEY\i, V0 _vaes \enc, KEY\i, V1 @@ -574,7 +599,7 @@ .irp i, 5,6,7,8,9,10,11,12,13 _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp .endr -.if USE_AVX10 +.if USE_AVX512 vpxord KEY14\xmm_suffix, \tweak, \tmp .else .ifnb \xmm_suffix @@ -617,11 +642,11 @@ // This is the main loop, en/decrypting 4*VL bytes per iteration. // XOR each source block with its tweak and the zero-th round key. -.if USE_AVX10 - _vmovdqu 0*VL(SRC), V0 - _vmovdqu 1*VL(SRC), V1 - _vmovdqu 2*VL(SRC), V2 - _vmovdqu 3*VL(SRC), V3 +.if USE_AVX512 + vmovdqu8 0*VL(SRC), V0 + vmovdqu8 1*VL(SRC), V1 + vmovdqu8 2*VL(SRC), V2 + vmovdqu8 3*VL(SRC), V3 vpternlogd $0x96, TWEAK0, KEY0, V0 vpternlogd $0x96, TWEAK1, KEY0, V1 vpternlogd $0x96, TWEAK2, KEY0, V2 @@ -654,7 +679,7 @@ // Reduce latency by doing the XOR before the vaesenclast, utilizing the // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) // (and likewise for vaesdeclast). -.if USE_AVX10 +.if USE_AVX512 _tweak_step 18 _tweak_step 19 vpxord TWEAK0, KEY14, V4 @@ -762,7 +787,7 @@ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 .endif -.if USE_AVX10 +.if USE_AVX512 // Create a mask that has the first LEN bits set. mov $-1, %r9d bzhi LEN, %r9d, %r9d @@ -811,7 +836,7 @@ // u8 iv[AES_BLOCK_SIZE]); // // Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes -// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. +// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX512. SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) .set TWEAK_KEY, %rdi .set IV, %rsi @@ -853,7 +878,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv) // multiple of 16, then this function updates |tweak| to contain the next tweak. 
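The tweak arithmetic in the hunks above (the ".quad 0x87, 1" constant at .Lgf_poly, the _next_tweak macro, and the new shift-plus-vpclmulqdq sequences) implements multiplication of the 128-bit XTS tweak by powers of x in GF(2^128) with reduction polynomial x^128 + x^7 + x^2 + x + 1. Below is a scalar sketch of the single multiply-by-x step, which is what _next_tweak computes per block in the VL=16 path; the tweak is viewed as two little-endian 64-bit halves, and the function name is illustrative only.

#include <stdint.h>

/*
 * Multiply a 128-bit XTS tweak by x in GF(2^128) with reduction
 * polynomial x^128 + x^7 + x^2 + x + 1 (the 0x87 in .Lgf_poly).
 * t[0] holds bytes 0..7 of the tweak, t[1] bytes 8..15, matching how
 * the assembly treats each 128-bit lane as two little-endian quadwords.
 */
static void xts_next_tweak(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of x^127 */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift */
	t[0] = (t[0] << 1) ^ (carry * 0x87);	/* fold x^128 back in as x^7+x^2+x+1 */
}

The vectorized paths above do the same for several tweaks at once: shifting whole lanes by 2, 4, 6 (VL=32) or 4, 8, 12 (VL=64) bit positions multiplies by the corresponding powers of x, and the vpclmulqdq $0x01 against GF_POLY folds the displaced high bits back in with a single carry-less multiply by 0x87.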
.set VL, 16 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) _aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_aesni_avx) @@ -863,7 +888,7 @@ SYM_FUNC_END(aes_xts_decrypt_aesni_avx) #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) .set VL, 32 -.set USE_AVX10, 0 +.set USE_AVX512, 0 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) _aes_xts_crypt 1 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) @@ -871,21 +896,12 @@ SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) _aes_xts_crypt 0 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) -.set VL, 32 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) - _aes_xts_crypt 1 -SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) -SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) - _aes_xts_crypt 0 -SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) - .set VL, 64 -.set USE_AVX10, 1 -SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) +.set USE_AVX512, 1 +SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx512) _aes_xts_crypt 1 -SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) -SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) +SYM_FUNC_END(aes_xts_encrypt_vaes_avx512) +SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx512) _aes_xts_crypt 0 -SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) +SYM_FUNC_END(aes_xts_decrypt_vaes_avx512) #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index bc655d794a95..061b1ced93c5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -566,10 +566,9 @@ static struct crypto_alg aesni_cipher_alg = { static struct skcipher_alg aesni_skciphers[] = { { .base = { - .cra_name = "__ecb(aes)", - .cra_driver_name = "__ecb-aes-aesni", + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -581,10 +580,9 @@ static struct skcipher_alg aesni_skciphers[] = { .decrypt = ecb_decrypt, }, { .base = { - .cra_name = "__cbc(aes)", - .cra_driver_name = "__cbc-aes-aesni", + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -597,10 +595,9 @@ static struct skcipher_alg aesni_skciphers[] = { .decrypt = cbc_decrypt, }, { .base = { - .cra_name = "__cts(cbc(aes))", - .cra_driver_name = "__cts-cbc-aes-aesni", + .cra_name = "cts(cbc(aes))", + .cra_driver_name = "cts-cbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -615,10 +612,9 @@ static struct skcipher_alg aesni_skciphers[] = { #ifdef CONFIG_X86_64 }, { .base = { - .cra_name = "__ctr(aes)", - .cra_driver_name = "__ctr-aes-aesni", + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = CRYPTO_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -633,10 +629,9 @@ static struct skcipher_alg aesni_skciphers[] = { #endif }, { .base = { - .cra_name = "__xts(aes)", - .cra_driver_name = "__xts-aes-aesni", + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-aesni", .cra_priority = 401, - .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = XTS_AES_CTX_SIZE, .cra_module = THIS_MODULE, @@ -651,9 +646,6 @@ 
static struct skcipher_alg aesni_skciphers[] = { } }; -static -struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)]; - #ifdef CONFIG_X86_64 asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, u8 iv[AES_BLOCK_SIZE]); @@ -792,10 +784,9 @@ static int xctr_crypt_##suffix(struct skcipher_request *req) \ } \ \ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ - .base.cra_name = "__xts(aes)", \ - .base.cra_driver_name = "__xts-aes-" driver_name_suffix, \ + .base.cra_name = "xts(aes)", \ + .base.cra_driver_name = "xts-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ - .base.cra_flags = CRYPTO_ALG_INTERNAL, \ .base.cra_blocksize = AES_BLOCK_SIZE, \ .base.cra_ctxsize = XTS_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ @@ -807,10 +798,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .encrypt = xts_encrypt_##suffix, \ .decrypt = xts_decrypt_##suffix, \ }, { \ - .base.cra_name = "__ctr(aes)", \ - .base.cra_driver_name = "__ctr-aes-" driver_name_suffix, \ + .base.cra_name = "ctr(aes)", \ + .base.cra_driver_name = "ctr-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ - .base.cra_flags = CRYPTO_ALG_INTERNAL, \ .base.cra_blocksize = 1, \ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ @@ -822,10 +812,9 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .encrypt = ctr_crypt_##suffix, \ .decrypt = ctr_crypt_##suffix, \ }, { \ - .base.cra_name = "__xctr(aes)", \ - .base.cra_driver_name = "__xctr-aes-" driver_name_suffix, \ + .base.cra_name = "xctr(aes)", \ + .base.cra_driver_name = "xctr-aes-" driver_name_suffix, \ .base.cra_priority = priority, \ - .base.cra_flags = CRYPTO_ALG_INTERNAL, \ .base.cra_blocksize = 1, \ .base.cra_ctxsize = CRYPTO_AES_CTX_SIZE, \ .base.cra_module = THIS_MODULE, \ @@ -836,16 +825,12 @@ static struct skcipher_alg skcipher_algs_##suffix[] = {{ \ .setkey = aesni_skcipher_setkey, \ .encrypt = xctr_crypt_##suffix, \ .decrypt = xctr_crypt_##suffix, \ -}}; \ - \ -static struct simd_skcipher_alg * \ -simd_skcipher_algs_##suffix[ARRAY_SIZE(skcipher_algs_##suffix)] +}} DEFINE_AVX_SKCIPHER_ALGS(aesni_avx, "aesni-avx", 500); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) DEFINE_AVX_SKCIPHER_ALGS(vaes_avx2, "vaes-avx2", 600); -DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_256, "vaes-avx10_256", 700); -DEFINE_AVX_SKCIPHER_ALGS(vaes_avx10_512, "vaes-avx10_512", 800); +DEFINE_AVX_SKCIPHER_ALGS(vaes_avx512, "vaes-avx512", 800); #endif /* The common part of the x86_64 AES-GCM key struct */ @@ -1499,10 +1484,9 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ - .cra_name = "__gcm(aes)", \ - .cra_driver_name = "__" generic_driver_name, \ + .cra_name = "gcm(aes)", \ + .cra_driver_name = generic_driver_name, \ .cra_priority = (priority), \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ @@ -1516,17 +1500,14 @@ static struct aead_alg aes_gcm_algs_##suffix[] = { { \ .chunksize = AES_BLOCK_SIZE, \ .maxauthsize = 16, \ .base = { \ - .cra_name = "__rfc4106(gcm(aes))", \ - .cra_driver_name = "__" rfc_driver_name, \ + .cra_name = "rfc4106(gcm(aes))", \ + .cra_driver_name = rfc_driver_name, \ .cra_priority = (priority), \ - .cra_flags = CRYPTO_ALG_INTERNAL, \ .cra_blocksize = 1, \ .cra_ctxsize = (ctxsize), \ .cra_module = THIS_MODULE, \ }, \ -} }; \ - \ -static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ +} } /* 
aes_gcm_algs_aesni */ DEFINE_GCM_ALGS(aesni, /* no flags */ 0, @@ -1556,14 +1537,12 @@ static int __init register_avx_algs(void) if (!boot_cpu_has(X86_FEATURE_AVX)) return 0; - err = simd_register_skciphers_compat(skcipher_algs_aesni_avx, - ARRAY_SIZE(skcipher_algs_aesni_avx), - simd_skcipher_algs_aesni_avx); + err = crypto_register_skciphers(skcipher_algs_aesni_avx, + ARRAY_SIZE(skcipher_algs_aesni_avx)); if (err) return err; - err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, - ARRAY_SIZE(aes_gcm_algs_aesni_avx), - aes_gcm_simdalgs_aesni_avx); + err = crypto_register_aeads(aes_gcm_algs_aesni_avx, + ARRAY_SIZE(aes_gcm_algs_aesni_avx)); if (err) return err; /* @@ -1579,9 +1558,8 @@ static int __init register_avx_algs(void) !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return 0; - err = simd_register_skciphers_compat(skcipher_algs_vaes_avx2, - ARRAY_SIZE(skcipher_algs_vaes_avx2), - simd_skcipher_algs_vaes_avx2); + err = crypto_register_skciphers(skcipher_algs_vaes_avx2, + ARRAY_SIZE(skcipher_algs_vaes_avx2)); if (err) return err; @@ -1592,76 +1570,52 @@ static int __init register_avx_algs(void) XFEATURE_MASK_AVX512, NULL)) return 0; - err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_256, - ARRAY_SIZE(skcipher_algs_vaes_avx10_256), - simd_skcipher_algs_vaes_avx10_256); - if (err) - return err; - err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), - aes_gcm_simdalgs_vaes_avx10_256); + err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); if (err) return err; if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { int i; - for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx10_512); i++) - skcipher_algs_vaes_avx10_512[i].base.cra_priority = 1; + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) + skcipher_algs_vaes_avx512[i].base.cra_priority = 1; for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; } - err = simd_register_skciphers_compat(skcipher_algs_vaes_avx10_512, - ARRAY_SIZE(skcipher_algs_vaes_avx10_512), - simd_skcipher_algs_vaes_avx10_512); + err = crypto_register_skciphers(skcipher_algs_vaes_avx512, + ARRAY_SIZE(skcipher_algs_vaes_avx512)); if (err) return err; - err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), - aes_gcm_simdalgs_vaes_avx10_512); + err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, + ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); if (err) return err; #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ return 0; } +#define unregister_skciphers(A) \ + if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ + crypto_unregister_skciphers((A), ARRAY_SIZE(A)) +#define unregister_aeads(A) \ + if (refcount_read(&(A)[0].base.cra_refcnt) != 0) \ + crypto_unregister_aeads((A), ARRAY_SIZE(A)) + static void unregister_avx_algs(void) { - if (simd_skcipher_algs_aesni_avx[0]) - simd_unregister_skciphers(skcipher_algs_aesni_avx, - ARRAY_SIZE(skcipher_algs_aesni_avx), - simd_skcipher_algs_aesni_avx); - if (aes_gcm_simdalgs_aesni_avx[0]) - simd_unregister_aeads(aes_gcm_algs_aesni_avx, - ARRAY_SIZE(aes_gcm_algs_aesni_avx), - aes_gcm_simdalgs_aesni_avx); + unregister_skciphers(skcipher_algs_aesni_avx); + unregister_aeads(aes_gcm_algs_aesni_avx); #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) - if (simd_skcipher_algs_vaes_avx2[0]) - simd_unregister_skciphers(skcipher_algs_vaes_avx2, - 
ARRAY_SIZE(skcipher_algs_vaes_avx2), - simd_skcipher_algs_vaes_avx2); - if (simd_skcipher_algs_vaes_avx10_256[0]) - simd_unregister_skciphers(skcipher_algs_vaes_avx10_256, - ARRAY_SIZE(skcipher_algs_vaes_avx10_256), - simd_skcipher_algs_vaes_avx10_256); - if (aes_gcm_simdalgs_vaes_avx10_256[0]) - simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), - aes_gcm_simdalgs_vaes_avx10_256); - if (simd_skcipher_algs_vaes_avx10_512[0]) - simd_unregister_skciphers(skcipher_algs_vaes_avx10_512, - ARRAY_SIZE(skcipher_algs_vaes_avx10_512), - simd_skcipher_algs_vaes_avx10_512); - if (aes_gcm_simdalgs_vaes_avx10_512[0]) - simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, - ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), - aes_gcm_simdalgs_vaes_avx10_512); + unregister_skciphers(skcipher_algs_vaes_avx2); + unregister_skciphers(skcipher_algs_vaes_avx512); + unregister_aeads(aes_gcm_algs_vaes_avx10_256); + unregister_aeads(aes_gcm_algs_vaes_avx10_512); #endif } #else /* CONFIG_X86_64 */ static struct aead_alg aes_gcm_algs_aesni[0]; -static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; static int __init register_avx_algs(void) { @@ -1690,15 +1644,13 @@ static int __init aesni_init(void) if (err) return err; - err = simd_register_skciphers_compat(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + err = crypto_register_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); if (err) goto unregister_cipher; - err = simd_register_aeads_compat(aes_gcm_algs_aesni, - ARRAY_SIZE(aes_gcm_algs_aesni), - aes_gcm_simdalgs_aesni); + err = crypto_register_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); if (err) goto unregister_skciphers; @@ -1710,12 +1662,11 @@ static int __init aesni_init(void) unregister_avx: unregister_avx_algs(); - simd_unregister_aeads(aes_gcm_algs_aesni, - ARRAY_SIZE(aes_gcm_algs_aesni), - aes_gcm_simdalgs_aesni); + crypto_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); unregister_skciphers: - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); unregister_cipher: crypto_unregister_alg(&aesni_cipher_alg); return err; @@ -1723,11 +1674,10 @@ unregister_cipher: static void __exit aesni_exit(void) { - simd_unregister_aeads(aes_gcm_algs_aesni, - ARRAY_SIZE(aes_gcm_algs_aesni), - aes_gcm_simdalgs_aesni); - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), - aesni_simd_skciphers); + crypto_unregister_aeads(aes_gcm_algs_aesni, + ARRAY_SIZE(aes_gcm_algs_aesni)); + crypto_unregister_skciphers(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers)); crypto_unregister_alg(&aesni_cipher_alg); unregister_avx_algs(); } diff --git a/arch/x86/crypto/aria_aesni_avx2_glue.c b/arch/x86/crypto/aria_aesni_avx2_glue.c index 87a11804fc77..b4bddcd58457 100644 --- a/arch/x86/crypto/aria_aesni_avx2_glue.c +++ b/arch/x86/crypto/aria_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -165,10 +164,9 @@ static int aria_avx2_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx2", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, 
.base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx2_ecb_encrypt, .decrypt = aria_avx2_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx2", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx2_init(void) { const char *feature_name; @@ -233,15 +228,12 @@ static int __init aria_avx2_init(void) aria_ops.aria_ctr_crypt_32way = aria_aesni_avx2_ctr_crypt_32way; } - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx2_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx2_init); diff --git a/arch/x86/crypto/aria_aesni_avx_glue.c b/arch/x86/crypto/aria_aesni_avx_glue.c index 4e1516b76669..ab9b38d05332 100644 --- a/arch/x86/crypto/aria_aesni_avx_glue.c +++ b/arch/x86/crypto/aria_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -152,10 +151,9 @@ static int aria_avx_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -165,10 +163,9 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx_ecb_encrypt, .decrypt = aria_avx_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -184,8 +181,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx_init(void) { const char *feature_name; @@ -213,15 +208,12 @@ static int __init aria_avx_init(void) aria_ops.aria_ctr_crypt_16way = aria_aesni_avx_ctr_crypt_16way; } - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx_init); diff --git a/arch/x86/crypto/aria_gfni_avx512_glue.c b/arch/x86/crypto/aria_gfni_avx512_glue.c index f4a2208d2638..363cbf4399cc 100644 --- a/arch/x86/crypto/aria_gfni_avx512_glue.c +++ 
b/arch/x86/crypto/aria_gfni_avx512_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/aria.h> #include <linux/crypto.h> #include <linux/err.h> @@ -165,10 +164,9 @@ static int aria_avx512_init_tfm(struct crypto_skcipher *tfm) static struct skcipher_alg aria_algs[] = { { - .base.cra_name = "__ecb(aria)", - .base.cra_driver_name = "__ecb-aria-avx512", + .base.cra_name = "ecb(aria)", + .base.cra_driver_name = "ecb-aria-avx512", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = ARIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -178,11 +176,10 @@ static struct skcipher_alg aria_algs[] = { .encrypt = aria_avx512_ecb_encrypt, .decrypt = aria_avx512_ecb_decrypt, }, { - .base.cra_name = "__ctr(aria)", - .base.cra_driver_name = "__ctr-aria-avx512", + .base.cra_name = "ctr(aria)", + .base.cra_driver_name = "ctr-aria-avx512", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL | - CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, + .base.cra_flags = CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aria_ctx), .base.cra_module = THIS_MODULE, @@ -197,8 +194,6 @@ static struct skcipher_alg aria_algs[] = { } }; -static struct simd_skcipher_alg *aria_simd_algs[ARRAY_SIZE(aria_algs)]; - static int __init aria_avx512_init(void) { const char *feature_name; @@ -229,15 +224,12 @@ static int __init aria_avx512_init(void) aria_ops.aria_decrypt_64way = aria_gfni_avx512_decrypt_64way; aria_ops.aria_ctr_crypt_64way = aria_gfni_avx512_ctr_crypt_64way; - return simd_register_skciphers_compat(aria_algs, - ARRAY_SIZE(aria_algs), - aria_simd_algs); + return crypto_register_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } static void __exit aria_avx512_exit(void) { - simd_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs), - aria_simd_algs); + crypto_unregister_skciphers(aria_algs, ARRAY_SIZE(aria_algs)); } module_init(aria_avx512_init); diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S deleted file mode 100644 index b50b35ff1fdb..000000000000 --- a/arch/x86/crypto/blake2s-core.S +++ /dev/null @@ -1,256 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. 
- */ - -#include <linux/linkage.h> - -.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 -.align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 - .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 -.align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 -.align 64 -SIGMA: -.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 -.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 -.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 -.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 -.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 -.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 -.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 -.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 -.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -#ifdef CONFIG_AS_AVX512 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 -.align 64 -SIGMA2: -.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 -.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 -.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 -.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 -.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 -.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 -.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 -.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 -.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 -.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 -#endif /* CONFIG_AS_AVX512 */ - -.text -SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movq %rcx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 - jmp .Lbeginofloop - .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa IV(%rip),%xmm2 - movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx -.Lroundloop: - movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - punpckldq %xmm5,%xmm4 - punpckldq %xmm7,%xmm6 - punpcklqdq %xmm6,%xmm4 - paddd %xmm4,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - punpckldq %xmm6,%xmm5 - punpckldq %xmm4,%xmm7 - punpcklqdq %xmm7,%xmm5 - paddd %xmm5,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x93,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - 
punpckldq %xmm7,%xmm6 - punpckldq %xmm5,%xmm4 - punpcklqdq %xmm4,%xmm6 - paddd %xmm6,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm12,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 - por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - punpckldq %xmm4,%xmm7 - punpckldq %xmm6,%xmm5 - punpcklqdq %xmm5,%xmm7 - paddd %xmm7,%xmm0 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm13,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 - por %xmm8,%xmm1 - pshufd $0x39,%xmm0,%xmm0 - pshufd $0x4e,%xmm3,%xmm3 - pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx - cmpq %r8,%rcx - jnz .Lroundloop - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm1 - pxor %xmm10,%xmm0 - pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) -.Lendofloop: - RET -SYM_FUNC_END(blake2s_compress_ssse3) - -#ifdef CONFIG_AS_AVX512 -SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovq %rcx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi - leaq SIGMA2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: - addq $0x40,%rax - vmovdqa -0x40(%rax),%ymm8 - vmovdqa -0x20(%rax),%ymm9 - vpermi2d %ymm7,%ymm6,%ymm8 - vpermi2d %ymm7,%ymm6,%ymm9 - vmovdqa %ymm8,%ymm6 - vmovdqa %ymm9,%ymm7 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 - vpaddd %xmm8,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x93,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x39,%xmm2,%xmm2 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 - vpaddd %xmm9,%xmm0,%xmm0 - vpaddd %xmm1,%xmm0,%xmm0 - vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 - vpaddd %xmm3,%xmm2,%xmm2 - vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 - vpshufd $0x39,%xmm0,%xmm0 - vpshufd $0x4e,%xmm3,%xmm3 - vpshufd $0x93,%xmm2,%xmm2 - decb %cl - jne .Lblake2s_compress_avx512_roundloop - vpxor %xmm10,%xmm0,%xmm0 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm2,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) - vzeroupper - RET -SYM_FUNC_END(blake2s_compress_avx512) -#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c deleted file mode 100644 index 0313f9673f56..000000000000 --- a/arch/x86/crypto/blake2s-glue.c +++ /dev/null @@ -1,74 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. 
Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - */ - -#include <crypto/internal/blake2s.h> - -#include <linux/types.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/sizes.h> - -#include <asm/cpufeature.h> -#include <asm/fpu/api.h> -#include <asm/processor.h> -#include <asm/simd.h> - -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, - const u8 *block, const size_t nblocks, - const u32 inc); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); - -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, const u32 inc) -{ - /* SIMD disables preemption, so relax after processing each page. */ - BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); - - if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); - return; - } - - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2S_BLOCK_SIZE); - - kernel_fpu_begin(); - if (IS_ENABLED(CONFIG_AS_AVX512) && - static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); - else - blake2s_compress_ssse3(state, block, blocks, inc); - kernel_fpu_end(); - - nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; - } while (nblocks); -} -EXPORT_SYMBOL(blake2s_compress); - -static int __init blake2s_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - static_branch_enable(&blake2s_use_ssse3); - - if (IS_ENABLED(CONFIG_AS_AVX512) && - boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_AVX512F) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | - XFEATURE_MASK_AVX512, NULL)) - static_branch_enable(&blake2s_use_avx512); - - return 0; -} - -subsys_initcall(blake2s_mod_init); diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index e7e4d64e9577..2d2f4e16537c 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg camellia_algs[] = { { - .base.cra_name = "__ecb(camellia)", - .base.cra_driver_name = "__ecb-camellia-aesni-avx2", + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-aesni-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(camellia)", - .base.cra_driver_name = "__cbc-camellia-aesni-avx2", + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-aesni-avx2", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = { }, }; -static struct simd_skcipher_alg 
*camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; - static int __init camellia_aesni_init(void) { const char *feature_name; @@ -118,15 +113,13 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return simd_register_skciphers_compat(camellia_algs, - ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + return crypto_register_skciphers(camellia_algs, + ARRAY_SIZE(camellia_algs)); } static void __exit camellia_aesni_fini(void) { - simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs)); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index c7ccf63e741e..a7d162388142 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -6,7 +6,6 @@ */ #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -69,10 +68,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg camellia_algs[] = { { - .base.cra_name = "__ecb(camellia)", - .base.cra_driver_name = "__ecb-camellia-aesni", + .base.cra_name = "ecb(camellia)", + .base.cra_driver_name = "ecb-camellia-aesni", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -82,10 +80,9 @@ static struct skcipher_alg camellia_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(camellia)", - .base.cra_driver_name = "__cbc-camellia-aesni", + .base.cra_name = "cbc(camellia)", + .base.cra_driver_name = "cbc-camellia-aesni", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, @@ -98,8 +95,6 @@ static struct skcipher_alg camellia_algs[] = { } }; -static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; - static int __init camellia_aesni_init(void) { const char *feature_name; @@ -117,15 +112,13 @@ static int __init camellia_aesni_init(void) return -ENODEV; } - return simd_register_skciphers_compat(camellia_algs, - ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + return crypto_register_skciphers(camellia_algs, + ARRAY_SIZE(camellia_algs)); } static void __exit camellia_aesni_fini(void) { - simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), - camellia_simd_algs); + crypto_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs)); } module_init(camellia_aesni_init); diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 3976a87f92ad..3aca04d43b34 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -8,7 +8,6 @@ #include <crypto/algapi.h> #include <crypto/cast5.h> -#include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> @@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg cast5_algs[] = { { - .base.cra_name = "__ecb(cast5)", - .base.cra_driver_name = "__ecb-cast5-avx", + .base.cra_name = "ecb(cast5)", + .base.cra_driver_name = "ecb-cast5-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), 
.base.cra_module = THIS_MODULE, @@ -77,10 +75,9 @@ static struct skcipher_alg cast5_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(cast5)", - .base.cra_driver_name = "__cbc-cast5-avx", + .base.cra_name = "cbc(cast5)", + .base.cra_driver_name = "cbc-cast5-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST5_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast5_ctx), .base.cra_module = THIS_MODULE, @@ -93,8 +90,6 @@ static struct skcipher_alg cast5_algs[] = { } }; -static struct simd_skcipher_alg *cast5_simd_algs[ARRAY_SIZE(cast5_algs)]; - static int __init cast5_init(void) { const char *feature_name; @@ -105,15 +100,13 @@ static int __init cast5_init(void) return -ENODEV; } - return simd_register_skciphers_compat(cast5_algs, - ARRAY_SIZE(cast5_algs), - cast5_simd_algs); + return crypto_register_skciphers(cast5_algs, + ARRAY_SIZE(cast5_algs)); } static void __exit cast5_exit(void) { - simd_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs), - cast5_simd_algs); + crypto_unregister_skciphers(cast5_algs, ARRAY_SIZE(cast5_algs)); } module_init(cast5_init); diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 7e2aea372349..c4dd28c30303 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -14,7 +14,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/cast6.h> -#include <crypto/internal/simd.h> #include "ecb_cbc_helpers.h" @@ -64,10 +63,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg cast6_algs[] = { { - .base.cra_name = "__ecb(cast6)", - .base.cra_driver_name = "__ecb-cast6-avx", + .base.cra_name = "ecb(cast6)", + .base.cra_driver_name = "ecb-cast6-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, @@ -77,10 +75,9 @@ static struct skcipher_alg cast6_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(cast6)", - .base.cra_driver_name = "__cbc-cast6-avx", + .base.cra_name = "cbc(cast6)", + .base.cra_driver_name = "cbc-cast6-avx", .base.cra_priority = 200, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAST6_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct cast6_ctx), .base.cra_module = THIS_MODULE, @@ -93,8 +90,6 @@ static struct skcipher_alg cast6_algs[] = { }, }; -static struct simd_skcipher_alg *cast6_simd_algs[ARRAY_SIZE(cast6_algs)]; - static int __init cast6_init(void) { const char *feature_name; @@ -105,15 +100,12 @@ static int __init cast6_init(void) return -ENODEV; } - return simd_register_skciphers_compat(cast6_algs, - ARRAY_SIZE(cast6_algs), - cast6_simd_algs); + return crypto_register_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs)); } static void __exit cast6_exit(void) { - simd_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs), - cast6_simd_algs); + crypto_unregister_skciphers(cast6_algs, ARRAY_SIZE(cast6_algs)); } module_init(cast6_init); diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S deleted file mode 100644 index f3d8fc018249..000000000000 --- a/arch/x86/crypto/chacha-avx2-x86_64.S +++ /dev/null @@ -1,1021 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.ROT8, "aM", 
@progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. - - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - - vmovdqa ROT8(%rip),%ymm4 - vmovdqa ROT16(%rip),%ymm5 - - mov %rcx,%rax - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm5,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm6 - vpslld $12,%ymm6,%ymm6 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm6,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm4,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm7 - vpslld $7,%ymm7,%ymm7 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rax - jl .Lxorpart2 - vpxor 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp 
$0x20,%rax - jl .Lxorpart2 - vpxor 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rax - jl .Lxorpart2 - vpxor 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rax - jl .Lxorpart2 - vpxor 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rax - jl .Lxorpart2 - vpxor 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rax - jl .Lxorpart2 - vpxor 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rax - jl .Lxorpart2 - vpxor 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rax - jl .Lxorpart2 - vpxor 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone2 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm7,%xmm7 - vmovdqa %xmm7,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx2) - -SYM_FUNC_START(chacha_4block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. 
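(The "x0 += x1, x3 = rotl32(x3 ^ x0, 16)" comments in these functions describe the usual ChaCha double round applied to whole rows held in vector registers; the pshufd $0x39/$0x4e/$0x93 shuffles rotate rows 1-3 so the same column code also covers the diagonals. A minimal scalar sketch of that double round, plain C, not kernel code:

        #include <stdint.h>

        static inline uint32_t rotl32(uint32_t v, int n)
        {
                return (v << n) | (v >> (32 - n));
        }

        /* One ChaCha quarter round on four state words. */
        static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
        {
                *a += *b; *d = rotl32(*d ^ *a, 16);
                *c += *d; *b = rotl32(*b ^ *c, 12);
                *a += *b; *d = rotl32(*d ^ *a, 8);
                *c += *d; *b = rotl32(*b ^ *c, 7);
        }

        /* One double round: four column rounds, then four diagonal rounds.
         * In the assembly the diagonal half reuses the column code after
         * the pshufd row rotations. */
        static void double_round(uint32_t x[16])
        {
                quarter_round(&x[0], &x[4], &x[8],  &x[12]);
                quarter_round(&x[1], &x[5], &x[9],  &x[13]);
                quarter_round(&x[2], &x[6], &x[10], &x[14]);
                quarter_round(&x[3], &x[7], &x[11], &x[15]);
                quarter_round(&x[0], &x[5], &x[10], &x[15]);
                quarter_round(&x[1], &x[6], &x[11], &x[12]);
                quarter_round(&x[2], &x[7], &x[8],  &x[13]);
                quarter_round(&x[3], &x[4], &x[9],  &x[14]);
        }
)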
- - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - - vmovdqa ROT8(%rip),%ymm8 - vmovdqa ROT16(%rip),%ymm9 - - mov %rcx,%rax - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm9,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm9,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $12,%ymm10,%ymm10 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vpshufb %ymm8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxor %ymm4,%ymm7,%ymm7 - vpshufb %ymm8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vmovdqa %ymm1,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm10,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxor %ymm6,%ymm5,%ymm5 - vmovdqa %ymm5,%ymm10 - vpslld $7,%ymm10,%ymm10 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm10,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rax - jl .Lxorpart4 - vpxor 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - 
vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rax - jl .Lxorpart4 - vpxor 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rax - jl .Lxorpart4 - vpxor 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rax - jl .Lxorpart4 - vpxor 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rax - jl .Lxorpart4 - vpxor 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rax - jl .Lxorpart4 - vpxor 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rax - jl .Lxorpart4 - vpxor 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rax - jl .Lxorpart4 - vpxor 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rax - jl .Lxorpart4 - vpxor 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rax - jl .Lxorpart4 - vpxor 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rax - jl .Lxorpart4 - vpxor 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rax - jl .Lxorpart4 - vpxor 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rax - jl .Lxorpart4 - vpxor 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rax - jl .Lxorpart4 - vpxor 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rax - jl .Lxorpart4 - vpxor 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rax - jl .Lxorpart4 - vpxor 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%xmm10,%xmm10 - vmovdqa %xmm10,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx2) - -SYM_FUNC_START(chacha_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. 
For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - mov %rcx,%rax - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, 
x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd 
%ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa 0x00(%rsp),%ymm1 - vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 - cmp $0x0020,%rax - jl .Lxorpart8 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rax - jl .Lxorpart8 - vpxor 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vmovdqa 0x40(%rsp),%ymm1 - vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 - cmp $0x0060,%rax - jl .Lxorpart8 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rax - jl .Lxorpart8 - vpxor 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vmovdqa 0x20(%rsp),%ymm1 - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rax - jl .Lxorpart8 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rax - jl .Lxorpart8 
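(The interleave steps above exist because, in the 8-block path, ymm register j holds word j of all eight blocks after the rounds. The 32-, 64- and 128-bit unpacks amount to the transpose modelled below, so that each block's 16 words become contiguous before they are XORed with the input. Plain C model of the data movement; array names are illustrative:

        #include <stdint.h>

        /* x[word][block] holds word "word" of block "block" (one ymm
         * register per state word).  The punpck/vperm2i128 sequence is,
         * in effect, this transpose into block-major order. */
        static void unstripe(const uint32_t x[16][8], uint32_t out[8][16])
        {
                for (int block = 0; block < 8; block++)
                        for (int word = 0; word < 16; word++)
                                out[block][word] = x[word][block];
        }
)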
- vpxor 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vmovdqa 0x60(%rsp),%ymm1 - vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 - cmp $0x00e0,%rax - jl .Lxorpart8 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rax - jl .Lxorpart8 - vpxor 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa %ymm4,%ymm0 - cmp $0x0120,%rax - jl .Lxorpart8 - vpxor 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0100(%rsi) - - vmovdqa %ymm12,%ymm0 - cmp $0x0140,%rax - jl .Lxorpart8 - vpxor 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0120(%rsi) - - vmovdqa %ymm6,%ymm0 - cmp $0x0160,%rax - jl .Lxorpart8 - vpxor 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0140(%rsi) - - vmovdqa %ymm14,%ymm0 - cmp $0x0180,%rax - jl .Lxorpart8 - vpxor 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0160(%rsi) - - vmovdqa %ymm5,%ymm0 - cmp $0x01a0,%rax - jl .Lxorpart8 - vpxor 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0180(%rsi) - - vmovdqa %ymm13,%ymm0 - cmp $0x01c0,%rax - jl .Lxorpart8 - vpxor 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01a0(%rsi) - - vmovdqa %ymm7,%ymm0 - cmp $0x01e0,%rax - jl .Lxorpart8 - vpxor 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01c0(%rsi) - - vmovdqa %ymm15,%ymm0 - cmp $0x0200,%rax - jl .Lxorpart8 - vpxor 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - lea -8(%r10),%rsp - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x1f,%r9 - jz .Ldone8 - and $~0x1f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - vpxor 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S deleted file mode 100644 index 259383e1ad44..000000000000 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ /dev/null @@ -1,836 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions - * - * Copyright (C) 2018 Martin Willi - */ - -#include <linux/linkage.h> - -.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 -.align 32 -CTR2BL: .octa 0x00000000000000000000000000000000 - .octa 0x00000000000000000000000000000001 - -.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 -.align 32 -CTR4BL: .octa 0x00000000000000000000000000000002 - .octa 0x00000000000000000000000000000003 - -.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 -.align 32 -CTR8BL: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -SYM_FUNC_START(chacha_2block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 2 data blocks output, o - # %rdx: up to 2 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts two ChaCha blocks by loading the state - # matrix twice across four AVX registers. It performs matrix operations - # on four words in each matrix in parallel, but requires shuffling to - # rearrange the words after each round. 
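(Before the AVX-512VL version below: the .Lxorpart tails in the AVX2 code above handle a trailing partial block by bouncing it through an aligned stack buffer. The leftover len % 16 (or len % 32) input bytes are copied in with rep movsb, XORed against the last keystream register at full width, and the same number of bytes copied back out. A rough C model of that idea; buffer and names are illustrative, not the kernel's:

        #include <stdint.h>
        #include <string.h>

        static void xor_tail(uint8_t *dst, const uint8_t *src,
                             const uint8_t *keystream, size_t done, size_t rem)
        {
                uint8_t buf[32] = { 0 };        /* stands in for the aligned stack slot */

                memcpy(buf, src + done, rem);           /* rep movsb, in   */
                for (size_t i = 0; i < sizeof(buf); i++)
                        buf[i] ^= keystream[i];         /* full-width vpxor */
                memcpy(dst + done, buf, rem);           /* rep movsb, out  */
        }
)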
- - vzeroupper - - # x0..3[0-2] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - - vmovdqa %ymm0,%ymm8 - vmovdqa %ymm1,%ymm9 - vmovdqa %ymm2,%ymm10 - vmovdqa %ymm3,%ymm11 - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - - sub $2,%r8d - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - vpaddd %ymm8,%ymm0,%ymm7 - cmp $0x10,%rcx - jl .Lxorpart2 - vpxord 0x00(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x00(%rsi) - vextracti128 $1,%ymm7,%xmm0 - # o1 = i1 ^ (x1 + s1) - vpaddd %ymm9,%ymm1,%ymm7 - cmp $0x20,%rcx - jl .Lxorpart2 - vpxord 0x10(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x10(%rsi) - vextracti128 $1,%ymm7,%xmm1 - # o2 = i2 ^ (x2 + s2) - vpaddd %ymm10,%ymm2,%ymm7 - cmp $0x30,%rcx - jl .Lxorpart2 - vpxord 0x20(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x20(%rsi) - vextracti128 $1,%ymm7,%xmm2 - # o3 = i3 ^ (x3 + s3) - vpaddd %ymm11,%ymm3,%ymm7 - cmp $0x40,%rcx - jl .Lxorpart2 - vpxord 0x30(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x30(%rsi) - vextracti128 $1,%ymm7,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm7 - cmp $0x50,%rcx - jl .Lxorpart2 - vpxord 0x40(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x40(%rsi) - - vmovdqa %xmm1,%xmm7 - cmp $0x60,%rcx - jl .Lxorpart2 - vpxord 0x50(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x50(%rsi) - - vmovdqa %xmm2,%xmm7 - cmp $0x70,%rcx - jl .Lxorpart2 - vpxord 0x60(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x60(%rsi) - - vmovdqa %xmm3,%xmm7 - cmp $0x80,%rcx - jl .Lxorpart2 - vpxord 0x70(%rdx),%xmm7,%xmm6 - vmovdqu %xmm6,0x70(%rsi) - -.Ldone2: - vzeroupper - RET - -.Lxorpart2: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone2 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm7,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone2 - -SYM_FUNC_END(chacha_2block_xor_avx512vl) - -SYM_FUNC_START(chacha_4block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This 
function encrypts four ChaCha blocks by loading the state - # matrix four times across eight AVX registers. It performs matrix - # operations on four words in two matrices in parallel, sequentially - # to the operations on the four words of the other two matrices. The - # required word shuffling has a rather high latency, we can do the - # arithmetic on two matrix-pairs without much slowdown. - - vzeroupper - - # x0..3[0-4] = s0..3 - vbroadcasti128 0x00(%rdi),%ymm0 - vbroadcasti128 0x10(%rdi),%ymm1 - vbroadcasti128 0x20(%rdi),%ymm2 - vbroadcasti128 0x30(%rdi),%ymm3 - - vmovdqa %ymm0,%ymm4 - vmovdqa %ymm1,%ymm5 - vmovdqa %ymm2,%ymm6 - vmovdqa %ymm3,%ymm7 - - vpaddd CTR2BL(%rip),%ymm3,%ymm3 - vpaddd CTR4BL(%rip),%ymm7,%ymm7 - - vmovdqa %ymm0,%ymm11 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm3,%ymm14 - vmovdqa %ymm7,%ymm15 - -.Ldoubleround4: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm1,%ymm1 - vpshufd $0x39,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm3,%ymm3 - vpshufd $0x93,%ymm7,%ymm7 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $16,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vpaddd %ymm1,%ymm0,%ymm0 - vpxord %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - - vpaddd %ymm5,%ymm4,%ymm4 - vpxord %ymm4,%ymm7,%ymm7 - vprold $8,%ymm7,%ymm7 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vpaddd %ymm3,%ymm2,%ymm2 - vpxord %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - - vpaddd %ymm7,%ymm6,%ymm6 - vpxord %ymm6,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vpshufd $0x93,%ymm1,%ymm1 - vpshufd $0x93,%ymm5,%ymm5 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vpshufd $0x4e,%ymm2,%ymm2 - vpshufd $0x4e,%ymm6,%ymm6 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vpshufd $0x39,%ymm3,%ymm3 - vpshufd $0x39,%ymm7,%ymm7 - - sub $2,%r8d - jnz .Ldoubleround4 - - # o0 = i0 ^ (x0 + s0), first block - vpaddd %ymm11,%ymm0,%ymm10 - cmp $0x10,%rcx - jl .Lxorpart4 - vpxord 0x00(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x00(%rsi) - vextracti128 $1,%ymm10,%xmm0 - # o1 = i1 ^ (x1 + s1), first block - vpaddd %ymm12,%ymm1,%ymm10 - cmp $0x20,%rcx - jl .Lxorpart4 - vpxord 0x10(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x10(%rsi) - vextracti128 $1,%ymm10,%xmm1 - # o2 = i2 ^ (x2 + s2), first block - vpaddd %ymm13,%ymm2,%ymm10 - cmp $0x30,%rcx - 
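(Besides the single-instruction vprold rotates noted in the comment above, the AVX-512VL paths drop the stack bounce for partial blocks: the .Lxorpart sequences in this file build a byte mask with the low "remaining bytes" bits set (the mov $1 / shld %cl / sub $1 / kmovq sequence) and use it for a zeroing masked load and a masked store. A scalar model of that mask logic; names are illustrative:

        #include <stdint.h>

        /* rem is the length modulo the register width (1..15 for the xmm
         * tails, 1..31 for the ymm tail); each set mask bit selects one byte. */
        static void xor_tail_masked(uint8_t *dst, const uint8_t *src,
                                    const uint8_t *keystream, unsigned int rem)
        {
                uint32_t mask = (1u << rem) - 1;

                for (unsigned int i = 0; i < 32; i++) {
                        if (!(mask & (1u << i)))
                                continue;       /* byte masked off: neither loaded nor stored */
                        dst[i] = src[i] ^ keystream[i];
                }
        }
)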
jl .Lxorpart4 - vpxord 0x20(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x20(%rsi) - vextracti128 $1,%ymm10,%xmm2 - # o3 = i3 ^ (x3 + s3), first block - vpaddd %ymm14,%ymm3,%ymm10 - cmp $0x40,%rcx - jl .Lxorpart4 - vpxord 0x30(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x30(%rsi) - vextracti128 $1,%ymm10,%xmm3 - - # xor and write second block - vmovdqa %xmm0,%xmm10 - cmp $0x50,%rcx - jl .Lxorpart4 - vpxord 0x40(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x40(%rsi) - - vmovdqa %xmm1,%xmm10 - cmp $0x60,%rcx - jl .Lxorpart4 - vpxord 0x50(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x50(%rsi) - - vmovdqa %xmm2,%xmm10 - cmp $0x70,%rcx - jl .Lxorpart4 - vpxord 0x60(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x60(%rsi) - - vmovdqa %xmm3,%xmm10 - cmp $0x80,%rcx - jl .Lxorpart4 - vpxord 0x70(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x70(%rsi) - - # o0 = i0 ^ (x0 + s0), third block - vpaddd %ymm11,%ymm4,%ymm10 - cmp $0x90,%rcx - jl .Lxorpart4 - vpxord 0x80(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x80(%rsi) - vextracti128 $1,%ymm10,%xmm4 - # o1 = i1 ^ (x1 + s1), third block - vpaddd %ymm12,%ymm5,%ymm10 - cmp $0xa0,%rcx - jl .Lxorpart4 - vpxord 0x90(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0x90(%rsi) - vextracti128 $1,%ymm10,%xmm5 - # o2 = i2 ^ (x2 + s2), third block - vpaddd %ymm13,%ymm6,%ymm10 - cmp $0xb0,%rcx - jl .Lxorpart4 - vpxord 0xa0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xa0(%rsi) - vextracti128 $1,%ymm10,%xmm6 - # o3 = i3 ^ (x3 + s3), third block - vpaddd %ymm15,%ymm7,%ymm10 - cmp $0xc0,%rcx - jl .Lxorpart4 - vpxord 0xb0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xb0(%rsi) - vextracti128 $1,%ymm10,%xmm7 - - # xor and write fourth block - vmovdqa %xmm4,%xmm10 - cmp $0xd0,%rcx - jl .Lxorpart4 - vpxord 0xc0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xc0(%rsi) - - vmovdqa %xmm5,%xmm10 - cmp $0xe0,%rcx - jl .Lxorpart4 - vpxord 0xd0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xd0(%rsi) - - vmovdqa %xmm6,%xmm10 - cmp $0xf0,%rcx - jl .Lxorpart4 - vpxord 0xe0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xe0(%rsi) - - vmovdqa %xmm7,%xmm10 - cmp $0x100,%rcx - jl .Lxorpart4 - vpxord 0xf0(%rdx),%xmm10,%xmm9 - vmovdqu %xmm9,0xf0(%rsi) - -.Ldone4: - vzeroupper - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0xf,%rcx - jz .Ldone4 - mov %rax,%r9 - and $~0xf,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} - vpxord %xmm10,%xmm1,%xmm1 - vmovdqu8 %xmm1,(%rsi,%r9){%k1} - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_avx512vl) - -SYM_FUNC_START(chacha_8block_xor_avx512vl) - # %rdi: Input state matrix, s - # %rsi: up to 8 data blocks output, o - # %rdx: up to 8 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts eight consecutive ChaCha blocks by loading - # the state matrix in AVX registers eight times. Compared to AVX2, this - # mostly benefits from the new rotate instructions in VL and the - # additional registers. 
- - vzeroupper - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - - # x12 += counter values 0-3 - vpaddd CTR8BL(%rip),%ymm12,%ymm12 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd %ymm0,%ymm4,%ymm0 - vpxord %ymm0,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd %ymm1,%ymm5,%ymm1 - vpxord %ymm1,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd %ymm2,%ymm6,%ymm2 - vpxord %ymm2,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd %ymm3,%ymm7,%ymm3 - vpxord %ymm3,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxord %ymm8,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxord %ymm9,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxord %ymm10,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxord %ymm11,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $16,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $16,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $16,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $16,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ 
x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $12,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $12,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $12,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $12,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd %ymm0,%ymm5,%ymm0 - vpxord %ymm0,%ymm15,%ymm15 - vprold $8,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd %ymm1,%ymm6,%ymm1 - vpxord %ymm1,%ymm12,%ymm12 - vprold $8,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd %ymm2,%ymm7,%ymm2 - vpxord %ymm2,%ymm13,%ymm13 - vprold $8,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd %ymm3,%ymm4,%ymm3 - vpxord %ymm3,%ymm14,%ymm14 - vprold $8,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxord %ymm10,%ymm5,%ymm5 - vprold $7,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxord %ymm11,%ymm6,%ymm6 - vprold $7,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxord %ymm8,%ymm7,%ymm7 - vprold $7,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxord %ymm9,%ymm4,%ymm4 - vprold $7,%ymm4,%ymm4 - - sub $2,%r8d - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - # interleave 32-bit words in state n, n+1 - vpunpckldq %ymm1,%ymm0,%ymm16 - vpunpckhdq %ymm1,%ymm0,%ymm17 - vpunpckldq %ymm3,%ymm2,%ymm18 - vpunpckhdq %ymm3,%ymm2,%ymm19 - vpunpckldq %ymm5,%ymm4,%ymm20 - vpunpckhdq %ymm5,%ymm4,%ymm21 - vpunpckldq %ymm7,%ymm6,%ymm22 - vpunpckhdq %ymm7,%ymm6,%ymm23 - vpunpckldq %ymm9,%ymm8,%ymm24 - vpunpckhdq %ymm9,%ymm8,%ymm25 - vpunpckldq %ymm11,%ymm10,%ymm26 - vpunpckhdq %ymm11,%ymm10,%ymm27 - vpunpckldq %ymm13,%ymm12,%ymm28 - vpunpckhdq %ymm13,%ymm12,%ymm29 - vpunpckldq %ymm15,%ymm14,%ymm30 - vpunpckhdq %ymm15,%ymm14,%ymm31 - - # interleave 64-bit words in state n, n+2 - vpunpcklqdq %ymm18,%ymm16,%ymm0 - vpunpcklqdq %ymm19,%ymm17,%ymm1 - vpunpckhqdq %ymm18,%ymm16,%ymm2 - vpunpckhqdq %ymm19,%ymm17,%ymm3 - vpunpcklqdq %ymm22,%ymm20,%ymm4 - vpunpcklqdq %ymm23,%ymm21,%ymm5 - vpunpckhqdq %ymm22,%ymm20,%ymm6 - vpunpckhqdq %ymm23,%ymm21,%ymm7 - vpunpcklqdq %ymm26,%ymm24,%ymm8 - vpunpcklqdq %ymm27,%ymm25,%ymm9 - vpunpckhqdq %ymm26,%ymm24,%ymm10 - vpunpckhqdq %ymm27,%ymm25,%ymm11 - vpunpcklqdq %ymm30,%ymm28,%ymm12 - vpunpcklqdq %ymm31,%ymm29,%ymm13 - vpunpckhqdq %ymm30,%ymm28,%ymm14 - vpunpckhqdq %ymm31,%ymm29,%ymm15 - - # interleave 128-bit words in state n, n+4 - # xor/write first four blocks - vmovdqa64 %ymm0,%ymm16 - vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 - cmp $0x0020,%rcx - jl .Lxorpart8 - vpxord 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0000(%rsi) - vmovdqa64 %ymm16,%ymm0 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - cmp $0x0040,%rcx - jl .Lxorpart8 - vpxord 0x0020(%rdx),%ymm0,%ymm0 - vmovdqu64 
%ymm0,0x0020(%rsi) - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - - vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 - cmp $0x0060,%rcx - jl .Lxorpart8 - vpxord 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0040(%rsi) - vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 - - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - cmp $0x0080,%rcx - jl .Lxorpart8 - vpxord 0x0060(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0060(%rsi) - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - - vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 - cmp $0x00a0,%rcx - jl .Lxorpart8 - vpxord 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0080(%rsi) - vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 - - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - cmp $0x00c0,%rcx - jl .Lxorpart8 - vpxord 0x00a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00a0(%rsi) - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - - vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 - cmp $0x00e0,%rcx - jl .Lxorpart8 - vpxord 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00c0(%rsi) - vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 - - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - cmp $0x0100,%rcx - jl .Lxorpart8 - vpxord 0x00e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x00e0(%rsi) - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - - # xor remaining blocks, write to output - vmovdqa64 %ymm4,%ymm0 - cmp $0x0120,%rcx - jl .Lxorpart8 - vpxord 0x0100(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0100(%rsi) - - vmovdqa64 %ymm12,%ymm0 - cmp $0x0140,%rcx - jl .Lxorpart8 - vpxord 0x0120(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0120(%rsi) - - vmovdqa64 %ymm6,%ymm0 - cmp $0x0160,%rcx - jl .Lxorpart8 - vpxord 0x0140(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0140(%rsi) - - vmovdqa64 %ymm14,%ymm0 - cmp $0x0180,%rcx - jl .Lxorpart8 - vpxord 0x0160(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0160(%rsi) - - vmovdqa64 %ymm5,%ymm0 - cmp $0x01a0,%rcx - jl .Lxorpart8 - vpxord 0x0180(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x0180(%rsi) - - vmovdqa64 %ymm13,%ymm0 - cmp $0x01c0,%rcx - jl .Lxorpart8 - vpxord 0x01a0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01a0(%rsi) - - vmovdqa64 %ymm7,%ymm0 - cmp $0x01e0,%rcx - jl .Lxorpart8 - vpxord 0x01c0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01c0(%rsi) - - vmovdqa64 %ymm15,%ymm0 - cmp $0x0200,%rcx - jl .Lxorpart8 - vpxord 0x01e0(%rdx),%ymm0,%ymm0 - vmovdqu64 %ymm0,0x01e0(%rsi) - -.Ldone8: - vzeroupper - RET - -.Lxorpart8: - # xor remaining bytes from partial register into output - mov %rcx,%rax - and $0x1f,%rcx - jz .Ldone8 - mov %rax,%r9 - and $~0x1f,%r9 - - mov $1,%rax - shld %cl,%rax,%rax - sub $1,%rax - kmovq %rax,%k1 - - vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} - vpxord %ymm0,%ymm1,%ymm1 - vmovdqu8 %ymm1,(%rsi,%r9){%k1} - - jmp .Ldone8 - -SYM_FUNC_END(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S deleted file mode 100644 index 7111949cd5b9..000000000000 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ /dev/null @@ -1,791 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - */ - -#include <linux/linkage.h> -#include <asm/frame.h> - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. 
This - * function performs matrix operations on four words in parallel, but requires - * shuffling to rearrange the words after each round. 8/16-bit word rotation is - * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word - * rotation uses traditional shift+OR. - * - * The round count is given in %r8d. - * - * Clobbers: %r8d, %xmm4-%xmm7 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - -.Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - sub $2,%r8d - jnz .Ldoubleround - - RET -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - FRAME_BEGIN - - # x0..3 = s0..3 - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - mov %rcx,%rax - call chacha_permute - - # o0 = i0 ^ (x0 + s0) - paddd %xmm8,%xmm0 - cmp $0x10,%rax - jl .Lxorpart - movdqu 0x00(%rdx),%xmm4 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - paddd %xmm9,%xmm1 - movdqa %xmm1,%xmm0 - cmp $0x20,%rax - jl .Lxorpart - movdqu 0x10(%rdx),%xmm0 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - paddd %xmm10,%xmm2 - movdqa %xmm2,%xmm0 - cmp $0x30,%rax - jl .Lxorpart - movdqu 0x20(%rdx),%xmm0 - pxor %xmm2,%xmm0 - movdqu %xmm0,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - paddd %xmm11,%xmm3 - movdqa %xmm3,%xmm0 - cmp $0x40,%rax - jl .Lxorpart - movdqu 0x30(%rdx),%xmm0 - pxor %xmm3,%xmm0 - movdqu %xmm0,0x30(%rsi) - -.Ldone: - FRAME_END - RET - -.Lxorpart: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone - and $~0x0f,%rax - - mov %rsi,%r11 - - lea 8(%rsp),%r10 - sub $0x10,%rsp - and $~31,%rsp - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - lea -8(%r10),%rsp 
- jmp .Ldone - -SYM_FUNC_END(chacha_block_xor_ssse3) - -SYM_FUNC_START(hchacha_block_ssse3) - # %rdi: Input state matrix, s - # %rsi: output (8 32-bit words) - # %edx: nrounds - FRAME_BEGIN - - movdqu 0x00(%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 - movdqu 0x20(%rdi),%xmm2 - movdqu 0x30(%rdi),%xmm3 - - mov %edx,%r8d - call chacha_permute - - movdqu %xmm0,0x00(%rsi) - movdqu %xmm3,0x10(%rsi) - - FRAME_END - RET -SYM_FUNC_END(hchacha_block_ssse3) - -SYM_FUNC_START(chacha_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 4 data blocks output, o - # %rdx: up to 4 data blocks input, i - # %rcx: input/output length in bytes - # %r8d: nrounds - - # This function encrypts four consecutive ChaCha blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - mov %rcx,%rax - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - 
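The pshufb-based 8/16-bit rotations and the pslld/psrld/por 7/12-bit rotations above are all the same 32-bit rotate that defines the ChaCha quarter-round. As a point of reference only, here is a minimal C sketch of that quarter-round (illustrative, not code from this patch):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/*
 * One ChaCha quarter-round; the assembly runs four of these per column
 * pass and four more per diagonal pass, which together form one of the
 * "double rounds" counted down in %r8d.
 */
static void chacha_quarterround(uint32_t *a, uint32_t *b,
				uint32_t *c, uint32_t *d)
{
	*a += *b; *d = rotl32(*d ^ *a, 16);
	*c += *d; *b = rotl32(*b ^ *c, 12);
	*a += *b; *d = rotl32(*d ^ *a, 8);
	*c += *d; *b = rotl32(*b ^ *c, 7);
}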
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld 
$25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - sub $2,%r8d - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 - paddd %xmm3,%xmm11 - - # x12[0-3] += s3[0] - # x13[0-3] += s3[1] - movq 0x30(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm12 - paddd %xmm3,%xmm13 - # x14[0-3] += s3[2] - # x15[0-3] += s3[3] - movq 0x38(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm14 - paddd %xmm3,%xmm15 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - # interleave 32-bit words in state n, n+1 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x10(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x10(%rsp) - movdqa 0x20(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpckldq %xmm1,%xmm2 - punpckhdq %xmm1,%xmm0 - movdqa %xmm2,0x20(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpckldq %xmm5,%xmm4 - punpckhdq %xmm5,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm6,%xmm0 - punpckldq %xmm7,%xmm6 - punpckhdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm9,%xmm0 - movdqa %xmm0,%xmm9 - movdqa %xmm10,%xmm0 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpckldq %xmm13,%xmm12 - punpckhdq %xmm13,%xmm0 - movdqa %xmm0,%xmm13 - movdqa %xmm14,%xmm0 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # interleave 64-bit words in state n, n+2 - movdqa 0x00(%rsp),%xmm0 - movdqa 0x20(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x00(%rsp) - movdqa %xmm0,0x20(%rsp) - movdqa 0x10(%rsp),%xmm0 - movdqa 0x30(%rsp),%xmm1 - movdqa %xmm0,%xmm2 - punpcklqdq %xmm1,%xmm2 - punpckhqdq %xmm1,%xmm0 - movdqa %xmm2,0x10(%rsp) - movdqa %xmm0,0x30(%rsp) - movdqa %xmm4,%xmm0 - punpcklqdq %xmm6,%xmm4 - punpckhqdq %xmm6,%xmm0 - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - punpcklqdq %xmm7,%xmm5 - punpckhqdq %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - movdqa %xmm8,%xmm0 - punpcklqdq %xmm10,%xmm8 - punpckhqdq %xmm10,%xmm0 - movdqa %xmm0,%xmm10 - movdqa %xmm9,%xmm0 - punpcklqdq %xmm11,%xmm9 - punpckhqdq %xmm11,%xmm0 - movdqa %xmm0,%xmm11 - movdqa %xmm12,%xmm0 - punpcklqdq %xmm14,%xmm12 - punpckhqdq %xmm14,%xmm0 - movdqa %xmm0,%xmm14 - movdqa %xmm13,%xmm0 - punpcklqdq 
%xmm15,%xmm13 - punpckhqdq %xmm15,%xmm0 - movdqa %xmm0,%xmm15 - - # xor with corresponding input, write to output - movdqa 0x00(%rsp),%xmm0 - cmp $0x10,%rax - jl .Lxorpart4 - movdqu 0x00(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x00(%rsi) - - movdqu %xmm4,%xmm0 - cmp $0x20,%rax - jl .Lxorpart4 - movdqu 0x10(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x10(%rsi) - - movdqu %xmm8,%xmm0 - cmp $0x30,%rax - jl .Lxorpart4 - movdqu 0x20(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x20(%rsi) - - movdqu %xmm12,%xmm0 - cmp $0x40,%rax - jl .Lxorpart4 - movdqu 0x30(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x30(%rsi) - - movdqa 0x20(%rsp),%xmm0 - cmp $0x50,%rax - jl .Lxorpart4 - movdqu 0x40(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x40(%rsi) - - movdqu %xmm6,%xmm0 - cmp $0x60,%rax - jl .Lxorpart4 - movdqu 0x50(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x50(%rsi) - - movdqu %xmm10,%xmm0 - cmp $0x70,%rax - jl .Lxorpart4 - movdqu 0x60(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x60(%rsi) - - movdqu %xmm14,%xmm0 - cmp $0x80,%rax - jl .Lxorpart4 - movdqu 0x70(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x70(%rsi) - - movdqa 0x10(%rsp),%xmm0 - cmp $0x90,%rax - jl .Lxorpart4 - movdqu 0x80(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x80(%rsi) - - movdqu %xmm5,%xmm0 - cmp $0xa0,%rax - jl .Lxorpart4 - movdqu 0x90(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0x90(%rsi) - - movdqu %xmm9,%xmm0 - cmp $0xb0,%rax - jl .Lxorpart4 - movdqu 0xa0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xa0(%rsi) - - movdqu %xmm13,%xmm0 - cmp $0xc0,%rax - jl .Lxorpart4 - movdqu 0xb0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xb0(%rsi) - - movdqa 0x30(%rsp),%xmm0 - cmp $0xd0,%rax - jl .Lxorpart4 - movdqu 0xc0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xc0(%rsi) - - movdqu %xmm7,%xmm0 - cmp $0xe0,%rax - jl .Lxorpart4 - movdqu 0xd0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xd0(%rsi) - - movdqu %xmm11,%xmm0 - cmp $0xf0,%rax - jl .Lxorpart4 - movdqu 0xe0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xe0(%rsi) - - movdqu %xmm15,%xmm0 - cmp $0x100,%rax - jl .Lxorpart4 - movdqu 0xf0(%rdx),%xmm1 - pxor %xmm1,%xmm0 - movdqu %xmm0,0xf0(%rsi) - -.Ldone4: - lea -8(%r10),%rsp - RET - -.Lxorpart4: - # xor remaining bytes from partial register into output - mov %rax,%r9 - and $0x0f,%r9 - jz .Ldone4 - and $~0x0f,%rax - - mov %rsi,%r11 - - lea (%rdx,%rax),%rsi - mov %rsp,%rdi - mov %r9,%rcx - rep movsb - - pxor 0x00(%rsp),%xmm0 - movdqa %xmm0,0x00(%rsp) - - mov %rsp,%rsi - lea (%r11,%rax),%rdi - mov %r9,%rcx - rep movsb - - jmp .Ldone4 - -SYM_FUNC_END(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c deleted file mode 100644 index 8bb74a272879..000000000000 --- a/arch/x86/crypto/chacha_glue.c +++ /dev/null @@ -1,311 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * x64 SIMD accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2015 Martin Willi - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); - 
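chacha_4block_xor_ssse3, declared just above, is the routine whose removed body appears earlier in this diff: it keeps word w of four consecutive blocks in the four lanes of one SSE register, so every vector instruction advances all four blocks and only the block counter (state word 12) differs per lane; the punpck* sequences at the end of that routine transpose the lanes back into contiguous 64-byte blocks before the XOR. A rough C model of that interleaved layout (hypothetical helper names, for illustration only):

#include <stdint.h>

#define CHACHA_STATE_WORDS 16

/* lanes[w][i] holds state word w of block i (i = 0..3). */
struct chacha4_lanes {
	uint32_t lanes[CHACHA_STATE_WORDS][4];
};

static void chacha4_splat(struct chacha4_lanes *x,
			  const uint32_t state[CHACHA_STATE_WORDS])
{
	for (int w = 0; w < CHACHA_STATE_WORDS; w++)
		for (int i = 0; i < 4; i++)
			x->lanes[w][i] = state[w];

	/* Per-block counters; the assembly adds the CTRINC constant instead. */
	for (int i = 0; i < 4; i++)
		x->lanes[12][i] += (uint32_t)i;
}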
-asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); -asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, - unsigned int len, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); - -static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) -{ - len = min(len, maxblocks * CHACHA_BLOCK_SIZE); - return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE; -} - -static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes, int nrounds) -{ - if (IS_ENABLED(CONFIG_AS_AVX512) && - static_branch_likely(&chacha_use_avx512vl)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_2block_xor_avx512vl(state, dst, src, bytes, - nrounds); - state[12] += chacha_advance(bytes, 2); - return; - } - } - - if (static_branch_likely(&chacha_use_avx2)) { - while (bytes >= CHACHA_BLOCK_SIZE * 8) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 8; - src += CHACHA_BLOCK_SIZE * 8; - dst += CHACHA_BLOCK_SIZE * 8; - state[12] += 8; - } - if (bytes > CHACHA_BLOCK_SIZE * 4) { - chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 8); - return; - } - if (bytes > CHACHA_BLOCK_SIZE * 2) { - chacha_4block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_2block_xor_avx2(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 2); - return; - } - } - - while (bytes >= CHACHA_BLOCK_SIZE * 4) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - bytes -= CHACHA_BLOCK_SIZE * 4; - src += CHACHA_BLOCK_SIZE * 4; - dst += CHACHA_BLOCK_SIZE * 4; - state[12] += 4; - } - if (bytes > CHACHA_BLOCK_SIZE) { - chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); - state[12] += chacha_advance(bytes, 4); - return; - } - if (bytes) { - chacha_block_xor_ssse3(state, dst, src, bytes, nrounds); - state[12]++; - } -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { - hchacha_block_generic(state, stream, nrounds); - } else { - kernel_fpu_begin(); - hchacha_block_ssse3(state, stream, nrounds); - kernel_fpu_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, 
unsigned int bytes, - int nrounds) -{ - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || - bytes <= CHACHA_BLOCK_SIZE) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_fpu_begin(); - chacha_dosimd(state, dst, src, todo, nrounds); - kernel_fpu_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_simd_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - u32 state[CHACHA_STATE_WORDS] __aligned(8); - struct skcipher_walk walk; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - chacha_init(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - if (!static_branch_likely(&chacha_use_simd) || - !crypto_simd_usable()) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - kernel_fpu_begin(); - chacha_dosimd(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - kernel_fpu_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_simd_stream_xor(req, ctx, req->iv); -} - -static int xchacha_simd(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 state[CHACHA_STATE_WORDS] __aligned(8); - struct chacha_ctx subctx; - u8 real_iv[16]; - - chacha_init(state, ctx->key, req->iv); - - if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { - kernel_fpu_begin(); - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); - kernel_fpu_end(); - } else { - hchacha_block_generic(state, subctx.key, ctx->nrounds); - } - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_simd_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_simd, - .decrypt = chacha_simd, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_simd, - .decrypt = xchacha_simd, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-simd", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - 
.encrypt = xchacha_simd, - .decrypt = xchacha_simd, - }, -}; - -static int __init chacha_simd_mod_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return 0; - - static_branch_enable(&chacha_use_simd); - - if (boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - static_branch_enable(&chacha_use_avx2); - - if (IS_ENABLED(CONFIG_AS_AVX512) && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ - static_branch_enable(&chacha_use_avx512vl); - } - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? - crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; -} - -static void __exit chacha_simd_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-simd"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-simd"); diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 99cb983ded9e..c4fbaa82ed7a 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -103,8 +103,8 @@ SYM_FUNC_START(clmul_ghash_mul) SYM_FUNC_END(clmul_ghash_mul) /* - * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const le128 *shash); + * int clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const le128 *shash); */ SYM_FUNC_START(clmul_ghash_update) FRAME_BEGIN @@ -127,6 +127,7 @@ SYM_FUNC_START(clmul_ghash_update) pshufb BSWAP, DATA movups DATA, (%rdi) .Lupdate_just_ret: + mov %rdx, %rax FRAME_END RET SYM_FUNC_END(clmul_ghash_update) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index c759ec808bf1..aea5d4d06be7 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -7,41 +7,27 @@ * Author: Huang Ying <ying.huang@intel.com> */ -#include <linux/err.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/crypto.h> -#include <crypto/algapi.h> -#include <crypto/cryptd.h> -#include <crypto/gf128mul.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> +#include <crypto/b128ops.h> +#include <crypto/ghash.h> +#include <crypto/internal/hash.h> +#include <crypto/utils.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #include <linux/unaligned.h> -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 +asmlinkage void clmul_ghash_mul(char *dst, const le128 *shash); -void clmul_ghash_mul(char *dst, const le128 *shash); +asmlinkage int clmul_ghash_update(char *dst, const char *src, + unsigned int srclen, const le128 *shash); -void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const le128 *shash); - -struct ghash_async_ctx { - struct cryptd_ahash *cryptd_tfm; -}; - -struct ghash_ctx { +struct x86_ghash_ctx { le128 shash; }; -struct ghash_desc_ctx { - u8 buffer[GHASH_BLOCK_SIZE]; - u32 
bytes; -}; - static int ghash_init(struct shash_desc *desc) { struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); @@ -54,7 +40,7 @@ static int ghash_init(struct shash_desc *desc) static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { - struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + struct x86_ghash_ctx *ctx = crypto_shash_ctx(tfm); u64 a, b; if (keylen != GHASH_BLOCK_SIZE) @@ -95,64 +81,38 @@ static int ghash_setkey(struct crypto_shash *tfm, static int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { + struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; + int remain; kernel_fpu_begin(); - if (dctx->bytes) { - int n = min(srclen, dctx->bytes); - u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); - - dctx->bytes -= n; - srclen -= n; - - while (n--) - *pos++ ^= *src++; - - if (!dctx->bytes) - clmul_ghash_mul(dst, &ctx->shash); - } - - clmul_ghash_update(dst, src, srclen, &ctx->shash); + remain = clmul_ghash_update(dst, src, srclen, &ctx->shash); kernel_fpu_end(); - - if (srclen & 0xf) { - src += srclen - (srclen & 0xf); - srclen &= 0xf; - dctx->bytes = GHASH_BLOCK_SIZE - srclen; - while (srclen--) - *dst++ ^= *src++; - } - - return 0; + return remain; } -static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +static void ghash_flush(struct x86_ghash_ctx *ctx, struct ghash_desc_ctx *dctx, + const u8 *src, unsigned int len) { u8 *dst = dctx->buffer; - if (dctx->bytes) { - u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); - - while (dctx->bytes--) - *tmp++ ^= 0; - - kernel_fpu_begin(); + kernel_fpu_begin(); + if (len) { + crypto_xor(dst, src, len); clmul_ghash_mul(dst, &ctx->shash); - kernel_fpu_end(); } - - dctx->bytes = 0; + kernel_fpu_end(); } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { + struct x86_ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *buf = dctx->buffer; - ghash_flush(ctx, dctx); + ghash_flush(ctx, dctx, src, len); memcpy(dst, buf, GHASH_BLOCK_SIZE); return 0; @@ -162,186 +122,20 @@ static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, .descsize = sizeof(struct ghash_desc_ctx), .base = { - .cra_name = "__ghash", - .cra_driver_name = "__ghash-pclmulqdqni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_name = "ghash", + .cra_driver_name = "ghash-pclmulqdqni", + .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_ctxsize = sizeof(struct x86_ghash_ctx), .cra_module = THIS_MODULE, }, }; -static int ghash_async_init(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - return crypto_shash_init(desc); -} - -static void ghash_init_cryptd_req(struct ahash_request *req) -{ - 
struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); - ahash_request_set_callback(cryptd_req, req->base.flags, - req->base.complete, req->base.data); - ahash_request_set_crypt(cryptd_req, req->src, req->result, - req->nbytes); -} - -static int ghash_async_update(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - ghash_init_cryptd_req(req); - return crypto_ahash_update(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - return shash_ahash_update(req, desc); - } -} - -static int ghash_async_final(struct ahash_request *req) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - ghash_init_cryptd_req(req); - return crypto_ahash_final(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - return crypto_shash_final(desc, req->result); - } -} - -static int ghash_async_import(struct ahash_request *req, const void *in) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - - ghash_async_init(req); - memcpy(dctx, in, sizeof(*dctx)); - return 0; - -} - -static int ghash_async_export(struct ahash_request *req, void *out) -{ - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); - - memcpy(out, dctx, sizeof(*dctx)); - return 0; - -} - -static int ghash_async_digest(struct ahash_request *req) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct ahash_request *cryptd_req = ahash_request_ctx(req); - struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - - if (!crypto_simd_usable() || - (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { - ghash_init_cryptd_req(req); - return crypto_ahash_digest(cryptd_req); - } else { - struct shash_desc *desc = cryptd_shash_desc(cryptd_req); - struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); - - desc->tfm = child; - return shash_ahash_digest(req, desc); - } -} - -static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, - unsigned int keylen) -{ - struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); - struct crypto_ahash *child = &ctx->cryptd_tfm->base; - - crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - return crypto_ahash_setkey(child, key, keylen); -} - -static int ghash_async_init_tfm(struct crypto_tfm *tfm) -{ - struct cryptd_ahash *cryptd_tfm; - struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if 
(IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ctx->cryptd_tfm = cryptd_tfm; - crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), - sizeof(struct ahash_request) + - crypto_ahash_reqsize(&cryptd_tfm->base)); - - return 0; -} - -static void ghash_async_exit_tfm(struct crypto_tfm *tfm) -{ - struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - - cryptd_free_ahash(ctx->cryptd_tfm); -} - -static struct ahash_alg ghash_async_alg = { - .init = ghash_async_init, - .update = ghash_async_update, - .final = ghash_async_final, - .setkey = ghash_async_setkey, - .digest = ghash_async_digest, - .export = ghash_async_export, - .import = ghash_async_import, - .halg = { - .digestsize = GHASH_DIGEST_SIZE, - .statesize = sizeof(struct ghash_desc_ctx), - .base = { - .cra_name = "ghash", - .cra_driver_name = "ghash-clmulni", - .cra_priority = 400, - .cra_ctxsize = sizeof(struct ghash_async_ctx), - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = GHASH_BLOCK_SIZE, - .cra_module = THIS_MODULE, - .cra_init = ghash_async_init_tfm, - .cra_exit = ghash_async_exit_tfm, - }, - }, -}; - static const struct x86_cpu_id pcmul_cpu_id[] = { X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), /* Pickle-Mickle-Duck */ {} @@ -350,29 +144,14 @@ MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); static int __init ghash_pclmulqdqni_mod_init(void) { - int err; - if (!x86_match_cpu(pcmul_cpu_id)) return -ENODEV; - err = crypto_register_shash(&ghash_alg); - if (err) - goto err_out; - err = crypto_register_ahash(&ghash_async_alg); - if (err) - goto err_shash; - - return 0; - -err_shash: - crypto_unregister_shash(&ghash_alg); -err_out: - return err; + return crypto_register_shash(&ghash_alg); } static void __exit ghash_pclmulqdqni_mod_exit(void) { - crypto_unregister_ahash(&ghash_async_alg); crypto_unregister_shash(&ghash_alg); } diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl deleted file mode 100644 index b9abcd79c1f4..000000000000 --- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl +++ /dev/null @@ -1,4248 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause -# -# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. -# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. -# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved. -# -# This code is taken from the OpenSSL project but the author, Andy Polyakov, -# has relicensed it under the licenses specified in the SPDX header above. -# The original headers, including the original license headers, are -# included below for completeness. -# -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# This module implements Poly1305 hash for x86_64. -# -# March 2015 -# -# Initial release. -# -# December 2016 -# -# Add AVX512F+VL+BW code path. -# -# November 2017 -# -# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be -# executed even on Knights Landing. Trigger for modification was -# observation that AVX512 code paths can negatively affect overall -# Skylake-X system performance. 
Since we are likely to suppress -# AVX512F capability flag [at least on Skylake-X], conversion serves -# as kind of "investment protection". Note that next *lake processor, -# Cannonlake, has AVX512IFMA code path to execute... -# -# Numbers are cycles per processed byte with poly1305_blocks alone, -# measured with rdtsc at fixed clock frequency. -# -# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 -# P4 4.46/+120% - -# Core 2 2.41/+90% - -# Westmere 1.88/+120% - -# Sandy Bridge 1.39/+140% 1.10 -# Haswell 1.14/+175% 1.11 0.65 -# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] -# Silvermont 2.83/+95% - -# Knights L 3.60/? 1.65 1.10 0.41(***) -# Goldmont 1.70/+180% - -# VIA Nano 1.82/+150% - -# Sledgehammer 1.38/+160% - -# Bulldozer 2.30/+130% 0.97 -# Ryzen 1.15/+200% 1.08 1.18 -# -# (*) improvement coefficients relative to clang are more modest and -# are ~50% on most processors, in both cases we are comparing to -# __int128 code; -# (**) SSE2 implementation was attempted, but among non-AVX processors -# it was faster than integer-only code only on older Intel P4 and -# Core processors, 50-30%, less newer processor is, but slower on -# contemporary ones, for example almost 2x slower on Atom, and as -# former are naturally disappearing, SSE2 is deemed unnecessary; -# (***) strangely enough performance seems to vary from core to core, -# listed result is best case; - -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } - -$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -$kernel=0; $kernel=1 if (!$flavour && !$output); - -if (!$kernel) { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or - die "can't locate x86_64-xlate.pl"; - - open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; - *STDOUT=*OUT; - - if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/) { - $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); - } - - if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { - $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); - $avx += 1 if ($1==2.11 && $2>=8); - } - - if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./) { - $avx = ($1>=10) + ($1>=11); - } - - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { - $avx = ($2>=3.0) + ($2>3.0); - } -} else { - $avx = 4; # The kernel uses ifdefs for this. 
-} - -sub declare_function() { - my ($name, $align, $nargs) = @_; - if($kernel) { - $code .= "SYM_FUNC_START($name)\n"; - $code .= ".L$name:\n"; - } else { - $code .= ".globl $name\n"; - $code .= ".type $name,\@function,$nargs\n"; - $code .= ".align $align\n"; - $code .= "$name:\n"; - } -} - -sub end_function() { - my ($name) = @_; - if($kernel) { - $code .= "SYM_FUNC_END($name)\n"; - } else { - $code .= ".size $name,.-$name\n"; - } -} - -$code.=<<___ if $kernel; -#include <linux/linkage.h> -___ - -if ($avx) { -$code.=<<___ if $kernel; -.section .rodata -___ -$code.=<<___; -.align 64 -.Lconst: -.Lmask24: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.L129: -.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 -.Lmask26: -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.Lpermd_avx2: -.long 2,2,2,3,2,0,2,1 -.Lpermd_avx512: -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.L2_44_inp_permd: -.long 0,1,1,2,2,3,7,7 -.L2_44_inp_shift: -.quad 0,12,24,64 -.L2_44_mask: -.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff -.L2_44_shift_rgt: -.quad 44,44,42,64 -.L2_44_shift_lft: -.quad 8,8,10,64 - -.align 64 -.Lx_mask44: -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff -.Lx_mask42: -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff -___ -} -$code.=<<___ if (!$kernel); -.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>" -.align 16 -___ - -my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); -my ($mac,$nonce)=($inp,$len); # *_emit arguments -my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13"); -my ($h0,$h1,$h2)=("%r14","%rbx","%r10"); - -sub poly1305_iteration { -# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 -# output: $h0-$h2 *= $r0-$r1 -$code.=<<___; - mulq $h0 # h0*r1 - mov %rax,$d2 - mov $r0,%rax - mov %rdx,$d3 - - mulq $h0 # h0*r0 - mov %rax,$h0 # future $h0 - mov $r0,%rax - mov %rdx,$d1 - - mulq $h1 # h1*r0 - add %rax,$d2 - mov $s1,%rax - adc %rdx,$d3 - - mulq $h1 # h1*s1 - mov $h2,$h1 # borrow $h1 - add %rax,$h0 - adc %rdx,$d1 - - imulq $s1,$h1 # h2*s1 - add $h1,$d2 - mov $d1,$h1 - adc \$0,$d3 - - imulq $r0,$h2 # h2*r0 - add $d2,$h1 - mov \$-4,%rax # mask value - adc $h2,$d3 - - and $d3,%rax # last reduction step - mov $d3,$h2 - shr \$2,$d3 - and \$3,$h2 - add $d3,%rax - add %rax,$h0 - adc \$0,$h1 - adc \$0,$h2 -___ -} - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int64 h[3]; # current hash value base 2^64 -# unsigned __int64 r[2]; # key value base 2^64 - -$code.=<<___; -.text -___ -$code.=<<___ if (!$kernel); -.extern OPENSSL_ia32cap_P - -.globl poly1305_init_x86_64 -.hidden poly1305_init_x86_64 -.globl poly1305_blocks_x86_64 -.hidden poly1305_blocks_x86_64 -.globl poly1305_emit_x86_64 -.hidden poly1305_emit_x86_64 -___ -&declare_function("poly1305_init_x86_64", 32, 3); -$code.=<<___; - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - - test $inp,$inp - je .Lno_key -___ -$code.=<<___ if (!$kernel); - lea poly1305_blocks_x86_64(%rip),%r10 - lea poly1305_emit_x86_64(%rip),%r11 -___ -$code.=<<___ if (!$kernel && $avx); - mov OPENSSL_ia32cap_P+4(%rip),%r9 - lea poly1305_blocks_avx(%rip),%rax - lea poly1305_emit_avx(%rip),%rcx - bt \$`60-32`,%r9 # AVX? 
- cmovc %rax,%r10 - cmovc %rcx,%r11 -___ -$code.=<<___ if (!$kernel && $avx>1); - lea poly1305_blocks_avx2(%rip),%rax - bt \$`5+32`,%r9 # AVX2? - cmovc %rax,%r10 -___ -$code.=<<___ if (!$kernel && $avx>3); - mov \$`(1<<31|1<<21|1<<16)`,%rax - shr \$32,%r9 - and %rax,%r9 - cmp %rax,%r9 - je .Linit_base2_44 -___ -$code.=<<___; - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - and 8($inp),%rcx - mov %rax,24($ctx) - mov %rcx,32($ctx) -___ -$code.=<<___ if (!$kernel && $flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if (!$kernel && $flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax -.Lno_key: - RET -___ -&end_function("poly1305_init_x86_64"); - -&declare_function("poly1305_blocks_x86_64", 32, 4); -$code.=<<___; -.cfi_startproc -.Lblocks: - shr \$4,$len - jz .Lno_data # too short - - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - push $ctx -.cfi_push $ctx -.Lblocks_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2 - - mov $s1,$r1 - shr \$2,$s1 - mov $r1,%rax - add $r1,$s1 # s1 = r1 + (r1 >> 2) - jmp .Loop - -.align 32 -.Loop: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 -___ - - &poly1305_iteration(); - -$code.=<<___; - mov $r1,%rax - dec %r15 # len-=16 - jnz .Loop - - mov 0(%rsp),$ctx -.cfi_restore $ctx - - mov $h0,0($ctx) # store hash value - mov $h1,8($ctx) - mov $h2,16($ctx) - - mov 8(%rsp),%r15 -.cfi_restore %r15 - mov 16(%rsp),%r14 -.cfi_restore %r14 - mov 24(%rsp),%r13 -.cfi_restore %r13 - mov 32(%rsp),%r12 -.cfi_restore %r12 - mov 40(%rsp),%rbx -.cfi_restore %rbx - lea 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lno_data: -.Lblocks_epilogue: - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_x86_64"); - -&declare_function("poly1305_emit_x86_64", 32, 3); -$code.=<<___; -.Lemit: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_x86_64"); -if ($avx) { - -######################################################################## -# Layout of opaque area is following. -# -# unsigned __int32 h[5]; # current hash value base 2^26 -# unsigned __int32 is_base2_26; -# unsigned __int64 r[2]; # key value base 2^64 -# unsigned __int64 pad; -# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; -# -# where r^n are base 2^26 digits of degrees of multiplier key. There are -# 5 digits, but last four are interleaved with multiples of 5, totalling -# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 
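The 5*r1 ... 5*r4 entries in that table come from the shape of the prime: 2^130 ≡ 5 (mod 2^130 - 5), so any radix-2^26 partial product whose digit position reaches 5 or higher folds back into the low end with a factor of 5, and pre-scaling the upper digits of r lets the vector loops use plain multiply-accumulate. A hedged C sketch of the radix-2^26 split this layout assumes (illustrative only, not taken from this file):

#include <stdint.h>

/*
 * Split an accumulator held as three 64-bit words (h64[2] carries the
 * bits above 2^128) into five radix-2^26 digits, mirroring the
 * base 2^64 -> base 2^26 conversion performed by the assembly.
 */
static void poly1305_to_base26(uint32_t h26[5], const uint64_t h64[3])
{
	h26[0] = (uint32_t)(h64[0] & 0x3ffffff);
	h26[1] = (uint32_t)((h64[0] >> 26) & 0x3ffffff);
	h26[2] = (uint32_t)(((h64[0] >> 52) | (h64[1] << 12)) & 0x3ffffff);
	h26[3] = (uint32_t)((h64[1] >> 14) & 0x3ffffff);
	h26[4] = (uint32_t)((h64[1] >> 40) | (h64[2] << 24));
}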
- -my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = - map("%xmm$_",(0..15)); - -$code.=<<___; -.type __poly1305_block,\@abi-omnipotent -.align 32 -__poly1305_block: - push $ctx -___ - &poly1305_iteration(); -$code.=<<___; - pop $ctx - RET -.size __poly1305_block,.-__poly1305_block - -.type __poly1305_init_avx,\@abi-omnipotent -.align 32 -__poly1305_init_avx: - push %rbp - mov %rsp,%rbp - mov $r0,$h0 - mov $r1,$h1 - xor $h2,$h2 - - lea 48+64($ctx),$ctx # size optimization - - mov $r1,%rax - call __poly1305_block # r^2 - - mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 - mov \$0x3ffffff,%edx - mov $h0,$d1 - and $h0#d,%eax - mov $r0,$d2 - and $r0#d,%edx - mov %eax,`16*0+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*0+4-64`($ctx) - shr \$26,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*1+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*1+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*2+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*2+4-64`($ctx) - shr \$26,$d2 - - mov $h1,%rax - mov $r1,%rdx - shl \$12,%rax - shl \$12,%rdx - or $d1,%rax - or $d2,%rdx - and \$0x3ffffff,%eax - and \$0x3ffffff,%edx - mov %eax,`16*3+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*3+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*4+0-64`($ctx) - mov $h1,$d1 - mov %edx,`16*4+4-64`($ctx) - mov $r1,$d2 - - mov \$0x3ffffff,%eax - mov \$0x3ffffff,%edx - shr \$14,$d1 - shr \$14,$d2 - and $d1#d,%eax - and $d2#d,%edx - mov %eax,`16*5+0-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov %edx,`16*5+4-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - mov %eax,`16*6+0-64`($ctx) - shr \$26,$d1 - mov %edx,`16*6+4-64`($ctx) - shr \$26,$d2 - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+0-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d2#d,`16*7+4-64`($ctx) - lea ($d2,$d2,4),$d2 # *5 - mov $d1#d,`16*8+0-64`($ctx) - mov $d2#d,`16*8+4-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^3 - - mov \$0x3ffffff,%eax # save r^3 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+12-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+12-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+12-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+12-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+12-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+12-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+12-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+12-64`($ctx) - - mov $r1,%rax - call __poly1305_block # r^4 - - mov \$0x3ffffff,%eax # save r^4 base 2^26 - mov $h0,$d1 - and $h0#d,%eax - shr \$26,$d1 - mov %eax,`16*0+8-64`($ctx) - - mov \$0x3ffffff,%edx - and $d1#d,%edx - mov %edx,`16*1+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*2+8-64`($ctx) - - mov $h1,%rax - shl \$12,%rax - or $d1,%rax - and \$0x3ffffff,%eax - mov %eax,`16*3+8-64`($ctx) - lea (%rax,%rax,4),%eax # *5 - mov $h1,$d1 - mov %eax,`16*4+8-64`($ctx) - - mov \$0x3ffffff,%edx - shr \$14,$d1 - and $d1#d,%edx - mov %edx,`16*5+8-64`($ctx) - lea (%rdx,%rdx,4),%edx # *5 - shr \$26,$d1 - mov %edx,`16*6+8-64`($ctx) - - mov $h2,%rax - shl \$24,%rax - or %rax,$d1 - mov $d1#d,`16*7+8-64`($ctx) - lea ($d1,$d1,4),$d1 # *5 - mov $d1#d,`16*8+8-64`($ctx) - - lea -48-64($ctx),$ctx # 
size [de-]optimization - pop %rbp - RET -.size __poly1305_init_avx,.-__poly1305_init_avx -___ - -&declare_function("poly1305_blocks_avx", 32, 4); -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - and \$-16,$len - jz .Lno_data_avx - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx - - test \$31,$len - jz .Leven_avx - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx_body: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - - call __poly1305_block - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - sub \$16,%r15 - jz .Lstore_base2_26_avx - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx: -.Lblocks_avx_epilogue: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx_body: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$31,$len - jz .Linit_avx - - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - -.Linit_avx: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and 
\$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,$H0 - vmovd %rdx#d,$H1 - vmovd $h0#d,$H2 - vmovd $h1#d,$H3 - vmovd $h2#d,$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx: - mov %r15,$len - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx -.cfi_endproc - -.align 32 -.Leven_avx: -.cfi_startproc - vmovd 4*0($ctx),$H0 # load hash value - vmovd 4*1($ctx),$H1 - vmovd 4*2($ctx),$H2 - vmovd 4*3($ctx),$H3 - vmovd 4*4($ctx),$H4 - -.Ldo_avx: -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - and \$-32,%rsp - sub \$-8,%rsp - lea -0x58(%rsp),%r11 - sub \$0x178,%rsp -___ -$code.=<<___ if ($win64); - lea -0xf8(%rsp),%r11 - sub \$0x218,%rsp - vmovdqa %xmm6,0x50(%r11) - vmovdqa %xmm7,0x60(%r11) - vmovdqa %xmm8,0x70(%r11) - vmovdqa %xmm9,0x80(%r11) - vmovdqa %xmm10,0x90(%r11) - vmovdqa %xmm11,0xa0(%r11) - vmovdqa %xmm12,0xb0(%r11) - vmovdqa %xmm13,0xc0(%r11) - vmovdqa %xmm14,0xd0(%r11) - vmovdqa %xmm15,0xe0(%r11) -.Ldo_avx_body: -___ -$code.=<<___; - sub \$64,$len - lea -32($inp),%rax - cmovc %rax,$inp - - vmovdqu `16*3`($ctx),$D4 # preload r0^2 - lea `16*3+64`($ctx),$ctx # size optimization - lea .Lconst(%rip),%rcx - - ################################################################ - # load input - vmovdqu 16*2($inp),$T0 - vmovdqu 16*3($inp),$T1 - vmovdqa 64(%rcx),$MASK # .Lmask26 - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - vpsrlq \$40,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - jbe .Lskip_loop_avx - - # expand and copy pre-calculated table to stack - vmovdqu `16*1-64`($ctx),$D1 - vmovdqu `16*2-64`($ctx),$D2 - vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 - vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 - vmovdqa $D3,-0x90(%r11) - vmovdqa $D0,0x00(%rsp) - vpshufd \$0xEE,$D1,$D4 - vmovdqu `16*3-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x80(%r11) - vmovdqa $D1,0x10(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqu `16*4-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x70(%r11) - vmovdqa $D2,0x20(%rsp) - vpshufd \$0xEE,$D0,$D4 - vmovdqu `16*5-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D4,-0x60(%r11) - vmovdqa $D0,0x30(%rsp) - vpshufd \$0xEE,$D1,$D3 - vmovdqu `16*6-64`($ctx),$D0 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D3,-0x50(%r11) - vmovdqa $D1,0x40(%rsp) - vpshufd \$0xEE,$D2,$D4 - vmovdqu `16*7-64`($ctx),$D1 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D4,-0x40(%r11) - vmovdqa $D2,0x50(%rsp) - vpshufd \$0xEE,$D0,$D3 - vmovdqu `16*8-64`($ctx),$D2 - vpshufd \$0x44,$D0,$D0 - vmovdqa $D3,-0x30(%r11) - vmovdqa $D0,0x60(%rsp) - vpshufd \$0xEE,$D1,$D4 - vpshufd \$0x44,$D1,$D1 - vmovdqa $D4,-0x20(%r11) - vmovdqa $D1,0x70(%rsp) - vpshufd \$0xEE,$D2,$D3 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpshufd \$0x44,$D2,$D2 - vmovdqa $D3,-0x10(%r11) - vmovdqa $D2,0x80(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - ################################################################ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - 
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - # \___________________/ - # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - # \___________________/ \____________________/ - # - # Note that we start with inp[2:3]*r^2. This is because it - # doesn't depend on reduction in previous iteration. - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # though note that $Tx and $Hx are "reversed" in this section, - # and $D4 is preloaded with r0^2... - - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vmovdqa $H2,0x20(%r11) # offload hash - vpmuludq $T2,$D4,$D2 # d3 = h2*r0 - vmovdqa 0x10(%rsp),$H2 # r1^2 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vmovdqa $H0,0x00(%r11) # - vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 - vmovdqa $H1,0x10(%r11) # - vpmuludq $T3,$H2,$H1 # h3*r1 - vpaddq $H0,$D0,$D0 # d0 += h4*s1 - vpaddq $H1,$D4,$D4 # d4 += h3*r1 - vmovdqa $H3,0x30(%r11) # - vpmuludq $T2,$H2,$H0 # h2*r1 - vpmuludq $T1,$H2,$H1 # h1*r1 - vpaddq $H0,$D3,$D3 # d3 += h2*r1 - vmovdqa 0x30(%rsp),$H3 # r2^2 - vpaddq $H1,$D2,$D2 # d2 += h1*r1 - vmovdqa $H4,0x40(%r11) # - vpmuludq $T0,$H2,$H2 # h0*r1 - vpmuludq $T2,$H3,$H0 # h2*r2 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - - vmovdqa 0x40(%rsp),$H4 # s2^2 - vpaddq $H0,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H3,$H1 # h1*r2 - vpmuludq $T0,$H3,$H3 # h0*r2 - vpaddq $H1,$D3,$D3 # d3 += h1*r2 - vmovdqa 0x50(%rsp),$H2 # r3^2 - vpaddq $H3,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H4,$H0 # h4*s2 - vpmuludq $T3,$H4,$H4 # h3*s2 - vpaddq $H0,$D1,$D1 # d1 += h4*s2 - vmovdqa 0x60(%rsp),$H3 # s3^2 - vpaddq $H4,$D0,$D0 # d0 += h3*s2 - - vmovdqa 0x80(%rsp),$H4 # s4^2 - vpmuludq $T1,$H2,$H1 # h1*r3 - vpmuludq $T0,$H2,$H2 # h0*r3 - vpaddq $H1,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $T4,$H3,$H0 # h4*s3 - vpmuludq $T3,$H3,$H1 # h3*s3 - vpaddq $H0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*0($inp),$H0 # load input - vpaddq $H1,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H3,$H3 # h2*s3 - vpmuludq $T2,$H4,$T2 # h2*s4 - vpaddq $H3,$D0,$D0 # d0 += h2*s3 - - vmovdqu 16*1($inp),$H1 # - vpaddq $T2,$D1,$D1 # d1 += h2*s4 - vpmuludq $T3,$H4,$T3 # h3*s4 - vpmuludq $T4,$H4,$T4 # h4*s4 - vpsrldq \$6,$H0,$H2 # splat input - vpaddq $T3,$D2,$D2 # d2 += h3*s4 - vpaddq $T4,$D3,$D3 # d3 += h4*s4 - vpsrldq \$6,$H1,$H3 # - vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 - vpmuludq $T1,$H4,$T0 # h1*s4 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpaddq $T4,$D4,$D4 # d4 += h0*r4 - vmovdqa -0x90(%r11),$T4 # r0^4 - vpaddq $T0,$D0,$D0 # d0 += h1*s4 - - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - #vpsrlq \$40,$H4,$H4 # 4 - vpsrldq \$`40/8`,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpand 0(%rcx),$H4,$H4 # .Lmask24 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpaddq 0x00(%r11),$H0,$H0 # add hash value - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - lea 16*2($inp),%rax - lea 16*4($inp),$inp - sub \$64,$len - cmovc %rax,$inp - - ################################################################ - # Now we accumulate 
(inp[0:1]+hash)*r^4 - ################################################################ - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vmovdqa -0x80(%r11),$T2 # r1^4 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T0,$D2,$D2 - vpaddq $T1,$D3,$D3 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 - vpaddq $T4,$D4,$D4 - - vpaddq $T0,$D0,$D0 # d0 += h4*s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vmovdqa -0x60(%r11),$T3 # r2^4 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpmuludq $H1,$T2,$T1 # h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T1,$D2,$D2 # d2 += h1*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - - vmovdqa -0x50(%r11),$T4 # s2^4 - vpmuludq $H2,$T3,$T0 # h2*r2 - vpmuludq $H1,$T3,$T1 # h1*r2 - vpaddq $T0,$D4,$D4 # d4 += h2*r2 - vpaddq $T1,$D3,$D3 # d3 += h1*r2 - vmovdqa -0x40(%r11),$T2 # r3^4 - vpmuludq $H0,$T3,$T3 # h0*r2 - vpmuludq $H4,$T4,$T0 # h4*s2 - vpaddq $T3,$D2,$D2 # d2 += h0*r2 - vpaddq $T0,$D1,$D1 # d1 += h4*s2 - vmovdqa -0x30(%r11),$T3 # s3^4 - vpmuludq $H3,$T4,$T4 # h3*s2 - vpmuludq $H1,$T2,$T1 # h1*r3 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - - vmovdqa -0x10(%r11),$T4 # s4^4 - vpaddq $T1,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T2,$T2 # h0*r3 - vpmuludq $H4,$T3,$T0 # h4*s3 - vpaddq $T2,$D3,$D3 # d3 += h0*r3 - vpaddq $T0,$D2,$D2 # d2 += h4*s3 - vmovdqu 16*2($inp),$T0 # load input - vpmuludq $H3,$T3,$T2 # h3*s3 - vpmuludq $H2,$T3,$T3 # h2*s3 - vpaddq $T2,$D1,$D1 # d1 += h3*s3 - vmovdqu 16*3($inp),$T1 # - vpaddq $T3,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H2,$T4,$H2 # h2*s4 - vpmuludq $H3,$T4,$H3 # h3*s4 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $H2,$D1,$D1 # d1 += h2*s4 - vpmuludq $H4,$T4,$H4 # h4*s4 - vpsrldq \$6,$T1,$T3 # - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 - vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 - vpmuludq $H1,$T4,$H0 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - - #vpsrlq \$40,$T4,$T4 # 4 - vpsrldq \$`40/8`,$T4,$T4 # 4 - vpsrlq \$26,$T0,$T1 - vmovdqa 0x00(%rsp),$D4 # preload r0^2 - vpand $MASK,$T0,$T0 # 0 - vpsrlq \$4,$T3,$T2 - vpand $MASK,$T1,$T1 # 1 - vpand 0(%rcx),$T4,$T4 # .Lmask24 - vpsrlq \$30,$T3,$T3 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - ################################################################ - # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - # and P. 
Schwabe - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D0 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D0,$H0,$H0 - vpsllq \$2,$D0,$D0 - vpaddq $D0,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - ja .Loop_avx - -.Lskip_loop_avx: - ################################################################ - # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 - add \$32,$len - jnz .Long_tail_avx - - vpaddq $H2,$T2,$T2 - vpaddq $H0,$T0,$T0 - vpaddq $H1,$T1,$T1 - vpaddq $H3,$T3,$T3 - vpaddq $H4,$T4,$T4 - -.Long_tail_avx: - vmovdqa $H2,0x20(%r11) - vmovdqa $H0,0x00(%r11) - vmovdqa $H1,0x10(%r11) - vmovdqa $H3,0x30(%r11) - vmovdqa $H4,0x40(%r11) - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - - vpmuludq $T2,$D4,$D2 # d2 = h2*r0 - vpmuludq $T0,$D4,$D0 # d0 = h0*r0 - vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n - vpmuludq $T1,$D4,$D1 # d1 = h1*r0 - vpmuludq $T3,$D4,$D3 # d3 = h3*r0 - vpmuludq $T4,$D4,$D4 # d4 = h4*r0 - - vpmuludq $T3,$H2,$H0 # h3*r1 - vpaddq $H0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n - vpmuludq $T2,$H2,$H1 # h2*r1 - vpaddq $H1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n - vpmuludq $T1,$H2,$H0 # h1*r1 - vpaddq $H0,$D2,$D2 # d2 += h1*r1 - vpmuludq $T0,$H2,$H2 # h0*r1 - vpaddq $H2,$D1,$D1 # d1 += h0*r1 - vpmuludq $T4,$H3,$H3 # h4*s1 - vpaddq $H3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n - vpmuludq $T2,$H4,$H1 # h2*r2 - vpaddq $H1,$D4,$D4 # d4 += h2*r2 - vpmuludq $T1,$H4,$H0 # h1*r2 - vpaddq $H0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n - vpmuludq $T0,$H4,$H4 # h0*r2 - vpaddq $H4,$D2,$D2 # d2 += h0*r2 - vpmuludq $T4,$H2,$H1 # h4*s2 - vpaddq $H1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n - vpmuludq $T3,$H2,$H2 # h3*s2 - vpaddq $H2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $T1,$H3,$H0 # h1*r3 - vpaddq $H0,$D4,$D4 # d4 += h1*r3 - vpmuludq $T0,$H3,$H3 # h0*r3 - vpaddq $H3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n - vpmuludq $T4,$H4,$H1 # h4*s3 - vpaddq $H1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n - vpmuludq $T3,$H4,$H0 # h3*s3 - vpaddq $H0,$D1,$D1 # d1 += h3*s3 - vpmuludq $T2,$H4,$H4 # h2*s3 - vpaddq $H4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $T0,$H2,$H2 # h0*r4 - vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 - vpmuludq $T4,$H3,$H1 # h4*s4 - vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 - vpmuludq $T3,$H3,$H0 # h3*s4 - vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 - vpmuludq $T2,$H3,$H1 # h2*s4 - vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 - vpmuludq $T1,$H3,$H3 # h1*s4 - vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 - - jz .Lshort_tail_avx - - vmovdqu 16*0($inp),$H0 # load input - vmovdqu 16*1($inp),$H1 - - vpsrldq \$6,$H0,$H2 # splat input - vpsrldq \$6,$H1,$H3 - vpunpckhqdq $H1,$H0,$H4 # 4 - vpunpcklqdq $H1,$H0,$H0 # 0:1 - vpunpcklqdq $H3,$H2,$H3 # 2:3 - - vpsrlq \$40,$H4,$H4 # 4 - vpsrlq \$26,$H0,$H1 - vpand $MASK,$H0,$H0 # 0 - 
vpsrlq \$4,$H3,$H2 - vpand $MASK,$H1,$H1 # 1 - vpsrlq \$30,$H3,$H3 - vpand $MASK,$H2,$H2 # 2 - vpand $MASK,$H3,$H3 # 3 - vpor 32(%rcx),$H4,$H4 # padbit, yes, always - - vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 - vpaddq 0x00(%r11),$H0,$H0 - vpaddq 0x10(%r11),$H1,$H1 - vpaddq 0x20(%r11),$H2,$H2 - vpaddq 0x30(%r11),$H3,$H3 - vpaddq 0x40(%r11),$H4,$H4 - - ################################################################ - # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate - - vpmuludq $H0,$T4,$T0 # h0*r0 - vpaddq $T0,$D0,$D0 # d0 += h0*r0 - vpmuludq $H1,$T4,$T1 # h1*r0 - vpaddq $T1,$D1,$D1 # d1 += h1*r0 - vpmuludq $H2,$T4,$T0 # h2*r0 - vpaddq $T0,$D2,$D2 # d2 += h2*r0 - vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n - vpmuludq $H3,$T4,$T1 # h3*r0 - vpaddq $T1,$D3,$D3 # d3 += h3*r0 - vpmuludq $H4,$T4,$T4 # h4*r0 - vpaddq $T4,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T2,$T0 # h3*r1 - vpaddq $T0,$D4,$D4 # d4 += h3*r1 - vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 - vpmuludq $H2,$T2,$T1 # h2*r1 - vpaddq $T1,$D3,$D3 # d3 += h2*r1 - vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 - vpmuludq $H1,$T2,$T0 # h1*r1 - vpaddq $T0,$D2,$D2 # d2 += h1*r1 - vpmuludq $H0,$T2,$T2 # h0*r1 - vpaddq $T2,$D1,$D1 # d1 += h0*r1 - vpmuludq $H4,$T3,$T3 # h4*s1 - vpaddq $T3,$D0,$D0 # d0 += h4*s1 - - vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 - vpmuludq $H2,$T4,$T1 # h2*r2 - vpaddq $T1,$D4,$D4 # d4 += h2*r2 - vpmuludq $H1,$T4,$T0 # h1*r2 - vpaddq $T0,$D3,$D3 # d3 += h1*r2 - vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 - vpmuludq $H0,$T4,$T4 # h0*r2 - vpaddq $T4,$D2,$D2 # d2 += h0*r2 - vpmuludq $H4,$T2,$T1 # h4*s2 - vpaddq $T1,$D1,$D1 # d1 += h4*s2 - vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 - vpmuludq $H3,$T2,$T2 # h3*s2 - vpaddq $T2,$D0,$D0 # d0 += h3*s2 - - vpmuludq $H1,$T3,$T0 # h1*r3 - vpaddq $T0,$D4,$D4 # d4 += h1*r3 - vpmuludq $H0,$T3,$T3 # h0*r3 - vpaddq $T3,$D3,$D3 # d3 += h0*r3 - vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 - vpmuludq $H4,$T4,$T1 # h4*s3 - vpaddq $T1,$D2,$D2 # d2 += h4*s3 - vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 - vpmuludq $H3,$T4,$T0 # h3*s3 - vpaddq $T0,$D1,$D1 # d1 += h3*s3 - vpmuludq $H2,$T4,$T4 # h2*s3 - vpaddq $T4,$D0,$D0 # d0 += h2*s3 - - vpmuludq $H0,$T2,$T2 # h0*r4 - vpaddq $T2,$D4,$D4 # d4 += h0*r4 - vpmuludq $H4,$T3,$T1 # h4*s4 - vpaddq $T1,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$T3,$T0 # h3*s4 - vpaddq $T0,$D2,$D2 # d2 += h3*s4 - vpmuludq $H2,$T3,$T1 # h2*s4 - vpaddq $T1,$D1,$D1 # d1 += h2*s4 - vpmuludq $H1,$T3,$T3 # h1*s4 - vpaddq $T3,$D0,$D0 # d0 += h1*s4 - -.Lshort_tail_avx: - ################################################################ - # horizontal addition - - vpsrldq \$8,$D4,$T4 - vpsrldq \$8,$D3,$T3 - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$D0,$T0 - vpsrldq \$8,$D2,$T2 - vpaddq $T3,$D3,$D3 - vpaddq $T4,$D4,$D4 - vpaddq $T0,$D0,$D0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$D2,$D2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D4,$H4 - vpand $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$H1 - vpand $MASK,$D1,$D1 - vpaddq $H1,$D2,$D2 # h1 -> h2 - - vpaddq $H4,$D0,$D0 - vpsllq \$2,$H4,$H4 - vpaddq $H4,$D0,$D0 # h4 -> h0 - - vpsrlq \$26,$D2,$H2 - vpand $MASK,$D2,$D2 - vpaddq $H2,$D3,$D3 # h2 -> h3 - - vpsrlq \$26,$D0,$H0 - vpand $MASK,$D0,$D0 - vpaddq $H0,$D1,$D1 # h0 -> h1 - - vpsrlq \$26,$D3,$H3 - vpand $MASK,$D3,$D3 - vpaddq $H3,$D4,$D4 # h3 -> h4 - - vmovd $D0,`4*0-48-64`($ctx) # save partially reduced - 
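
The short-tail path just above ends with the same lazy-reduction carry chain that recurs throughout this file: each 26-bit limb passes its carry to the next, and the carry out of limb 4 (weight 2^130) re-enters at limb 0 multiplied by 5, which the code does with "add, shift left by 2, add". A minimal Python sketch of that chain, assuming five unreduced accumulators d[0..4] in radix 2^26 (the function name is illustrative, not from this file):

    MASK26 = (1 << 26) - 1

    def lazy_reduce_26(d):
        # One pass of the radix-2^26 carry chain, in the same order as the
        # vector code above: 2^130 == 5 (mod 2^130 - 5), so the top carry
        # folds back into limb 0 as carry + 4*carry.
        d = list(d)
        c = d[3] >> 26; d[3] &= MASK26; d[4] += c      # h3 -> h4
        c = d[0] >> 26; d[0] &= MASK26; d[1] += c      # h0 -> h1
        c4 = d[4] >> 26; d[4] &= MASK26                # carry out of the top limb
        c = d[1] >> 26; d[1] &= MASK26; d[2] += c      # h1 -> h2
        d[0] += c4 + (c4 << 2)                         # h4 -> h0: add + add*4 = *5
        c = d[2] >> 26; d[2] &= MASK26; d[3] += c      # h2 -> h3
        c = d[0] >> 26; d[0] &= MASK26; d[1] += c      # h0 -> h1, second time
        c = d[3] >> 26; d[3] &= MASK26; d[4] += c      # h3 -> h4, second time
        return d

The chain does not produce a canonical residue; it only keeps every limb close to 26 bits, which is all the next round of 32x32->64 multiplies needs, while sum(l << (26*i)) stays unchanged modulo 2^130 - 5. Full reduction is deferred to the emit routine.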
vmovd $D1,`4*1-48-64`($ctx) - vmovd $D2,`4*2-48-64`($ctx) - vmovd $D3,`4*3-48-64`($ctx) - vmovd $D4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa 0x50(%r11),%xmm6 - vmovdqa 0x60(%r11),%xmm7 - vmovdqa 0x70(%r11),%xmm8 - vmovdqa 0x80(%r11),%xmm9 - vmovdqa 0x90(%r11),%xmm10 - vmovdqa 0xa0(%r11),%xmm11 - vmovdqa 0xb0(%r11),%xmm12 - vmovdqa 0xc0(%r11),%xmm13 - vmovdqa 0xd0(%r11),%xmm14 - vmovdqa 0xe0(%r11),%xmm15 - lea 0xf8(%r11),%rsp -.Ldo_avx_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -&end_function("poly1305_blocks_avx"); - -&declare_function("poly1305_emit_avx", 32, 3); -$code.=<<___; - cmpl \$0,20($ctx) # is_base2_26? - je .Lemit - - mov 0($ctx),%eax # load hash value base 2^26 - mov 4($ctx),%ecx - mov 8($ctx),%r8d - mov 12($ctx),%r11d - mov 16($ctx),%r10d - - shl \$26,%rcx # base 2^26 -> base 2^64 - mov %r8,%r9 - shl \$52,%r8 - add %rcx,%rax - shr \$12,%r9 - add %rax,%r8 # h0 - adc \$0,%r9 - - shl \$14,%r11 - mov %r10,%rax - shr \$24,%r10 - add %r11,%r9 - shl \$40,%rax - add %rax,%r9 # h1 - adc \$0,%r10 # h2 - - mov %r10,%rax # could be partially reduced, so reduce - mov %r10,%rcx - and \$3,%r10 - shr \$2,%rax - and \$-4,%rcx - add %rcx,%rax - add %rax,%r8 - adc \$0,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? - cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -___ -&end_function("poly1305_emit_avx"); - -if ($avx>1) { - -my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = - map("%ymm$_",(0..15)); -my $S4=$MASK; - -sub poly1305_blocks_avxN { - my ($avx512) = @_; - my $suffix = $avx512 ? "_avx512" : ""; -$code.=<<___; -.cfi_startproc - mov 20($ctx),%r8d # is_base2_26 - cmp \$128,$len - jae .Lblocks_avx2$suffix - test %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2$suffix: - and \$-16,$len - jz .Lno_data_avx2$suffix - - vzeroupper - - test %r8d,%r8d - jz .Lbase2_64_avx2$suffix - - test \$63,$len - jz .Leven_avx2$suffix - - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lblocks_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 0($ctx),$d1 # load hash value - mov 8($ctx),$d2 - mov 16($ctx),$h2#d - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - ################################# base 2^26 -> base 2^64 - mov $d1#d,$h0#d - and \$`-1*(1<<31)`,$d1 - mov $d2,$r1 # borrow $r1 - mov $d2#d,$h1#d - and \$`-1*(1<<31)`,$d2 - - shr \$6,$d1 - shl \$52,$r1 - add $d1,$h0 - shr \$12,$h1 - shr \$18,$d2 - add $r1,$h0 - adc $d2,$h1 - - mov $h2,$d1 - shl \$40,$d1 - shr \$24,$h2 - add $d1,$h1 - adc \$0,$h2 # can be partially reduced... - - mov \$-4,$d2 # ... 
so reduce - mov $h2,$d1 - and $h2,$d2 - shr \$2,$d1 - and \$3,$h2 - add $d2,$d1 # =*5 - add $d1,$h0 - adc \$0,$h1 - adc \$0,$h2 - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - -.Lbase2_26_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_26_pre_avx2$suffix - - test $padbit,$padbit # if $padbit is zero, - jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format - - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$r0 - mov $h1,$r1 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$r0 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $r0,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$r1 - and \$0x3ffffff,$h1 # h[3] - or $r1,$h2 # h[4] - - test %r15,%r15 - jz .Lstore_base2_26_avx2$suffix - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - jmp .Lproceed_avx2$suffix - -.align 32 -.Lstore_base2_64_avx2$suffix: - mov $h0,0($ctx) - mov $h1,8($ctx) - mov $h2,16($ctx) # note that is_base2_26 is zeroed - jmp .Ldone_avx2$suffix - -.align 16 -.Lstore_base2_26_avx2$suffix: - mov %rax#d,0($ctx) # store hash value base 2^26 - mov %rdx#d,4($ctx) - mov $h0#d,8($ctx) - mov $h1#d,12($ctx) - mov $h2#d,16($ctx) -.align 16 -.Ldone_avx2$suffix: - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lno_data_avx2$suffix: -.Lblocks_avx2_epilogue$suffix: - RET -.cfi_endproc - -.align 32 -.Lbase2_64_avx2$suffix: -.cfi_startproc - push %rbp -.cfi_push %rbp - mov %rsp,%rbp - push %rbx -.cfi_push %rbx - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 -.Lbase2_64_avx2_body$suffix: - - mov $len,%r15 # reassign $len - - mov 24($ctx),$r0 # load r - mov 32($ctx),$s1 - - mov 0($ctx),$h0 # load hash value - mov 8($ctx),$h1 - mov 16($ctx),$h2#d - - mov $s1,$r1 - mov $s1,%rax - shr \$2,$s1 - add $r1,$s1 # s1 = r1 + (r1 >> 2) - - test \$63,$len - jz .Linit_avx2$suffix - -.Lbase2_64_pre_avx2$suffix: - add 0($inp),$h0 # accumulate input - adc 8($inp),$h1 - lea 16($inp),$inp - adc $padbit,$h2 - sub \$16,%r15 - - call __poly1305_block - mov $r1,%rax - - test \$63,%r15 - jnz .Lbase2_64_pre_avx2$suffix - -.Linit_avx2$suffix: - ################################# base 2^64 -> base 2^26 - mov $h0,%rax - mov $h0,%rdx - shr \$52,$h0 - mov $h1,$d1 - mov $h1,$d2 - shr \$26,%rdx - and \$0x3ffffff,%rax # h[0] - shl \$12,$d1 - and \$0x3ffffff,%rdx # h[1] - shr \$14,$h1 - or $d1,$h0 - shl \$24,$h2 - and \$0x3ffffff,$h0 # h[2] - shr \$40,$d2 - and \$0x3ffffff,$h1 # h[3] - or $d2,$h2 # h[4] - - vmovd %rax#d,%x#$H0 - vmovd %rdx#d,%x#$H1 - vmovd $h0#d,%x#$H2 - vmovd $h1#d,%x#$H3 - vmovd $h2#d,%x#$H4 - movl \$1,20($ctx) # set is_base2_26 - - call __poly1305_init_avx - -.Lproceed_avx2$suffix: - mov %r15,$len # restore $len -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d - mov \$`(1<<31|1<<30|1<<16)`,%r11d -___ -$code.=<<___; - pop %r15 -.cfi_restore %r15 - pop %r14 -.cfi_restore %r14 - pop %r13 -.cfi_restore %r13 - pop %r12 -.cfi_restore %r12 - pop %rbx -.cfi_restore %rbx - pop %rbp -.cfi_restore %rbp -.Lbase2_64_avx2_epilogue$suffix: - jmp .Ldo_avx2$suffix -.cfi_endproc - -.align 32 -.Leven_avx2$suffix: 
-.cfi_startproc -___ -$code.=<<___ if (!$kernel); - mov OPENSSL_ia32cap_P+8(%rip),%r9d -___ -$code.=<<___; - vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 - vmovd 4*1($ctx),%x#$H1 - vmovd 4*2($ctx),%x#$H2 - vmovd 4*3($ctx),%x#$H3 - vmovd 4*4($ctx),%x#$H4 - -.Ldo_avx2$suffix: -___ -$code.=<<___ if (!$kernel && $avx>2); - cmp \$512,$len - jb .Lskip_avx512 - and %r11d,%r9d - test \$`1<<16`,%r9d # check for AVX512F - jnz .Lblocks_avx512 -.Lskip_avx512$suffix: -___ -$code.=<<___ if ($avx > 2 && $avx512 && $kernel); - cmp \$512,$len - jae .Lblocks_avx512 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx2_body$suffix: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 - - # expand and copy pre-calculated table to stack - vmovdqu `16*0-64`($ctx),%x#$T2 - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$T3 - vmovdqu `16*2-64`($ctx),%x#$T4 - vmovdqu `16*3-64`($ctx),%x#$D0 - vmovdqu `16*4-64`($ctx),%x#$D1 - vmovdqu `16*5-64`($ctx),%x#$D2 - lea 0x90(%rsp),%rax # size optimization - vmovdqu `16*6-64`($ctx),%x#$D3 - vpermd $T2,$T0,$T2 # 00003412 -> 14243444 - vmovdqu `16*7-64`($ctx),%x#$D4 - vpermd $T3,$T0,$T3 - vmovdqu `16*8-64`($ctx),%x#$MASK - vpermd $T4,$T0,$T4 - vmovdqa $T2,0x00(%rsp) - vpermd $D0,$T0,$D0 - vmovdqa $T3,0x20-0x90(%rax) - vpermd $D1,$T0,$D1 - vmovdqa $T4,0x40-0x90(%rax) - vpermd $D2,$T0,$D2 - vmovdqa $D0,0x60-0x90(%rax) - vpermd $D3,$T0,$D3 - vmovdqa $D1,0x80-0x90(%rax) - vpermd $D4,$T0,$D4 - vmovdqa $D2,0xa0-0x90(%rax) - vpermd $MASK,$T0,$MASK - vmovdqa $D3,0xc0-0x90(%rax) - vmovdqa $D4,0xe0-0x90(%rax) - vmovdqa $MASK,0x100-0x90(%rax) - vmovdqa 64(%rcx),$MASK # .Lmask26 - - ################################################################ - # load input - vmovdqu 16*0($inp),%x#$T0 - vmovdqu 16*1($inp),%x#$T1 - vinserti128 \$1,16*2($inp),$T0,$T0 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpunpcklqdq $T3,$T2,$T2 # 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$64,$len - jz .Ltail_avx2$suffix - jmp .Loop_avx2$suffix - -.align 32 -.Loop_avx2$suffix: - ################################################################ - # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 - # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 - # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 - # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 - # \________/\__________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqa `32*0`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqa `32*1`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqa `32*3`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 - vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # 
d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 - # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 - # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - vmovdqa `32*4-0x90`(%rax),$T1 # s2 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vmovdqu 16*0($inp),%x#$T0 # load input - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - vinserti128 \$1,16*2($inp),$T0,$T0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vmovdqu 16*1($inp),%x#$T1 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqa `32*5-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - vinserti128 \$1,16*3($inp),$T1,$T1 - lea 16*4($inp),$inp - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpsrldq \$6,$T0,$T2 # splat input - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpsrldq \$6,$T1,$T3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - vpunpckhqdq $T1,$T0,$T4 # 4 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpunpcklqdq $T3,$T2,$T3 # 2:3 - vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # lazy reduction (interleaved with tail of input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$4,$T3,$T2 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpand $MASK,$T2,$T2 # 2 - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$30,$T3,$T3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpand $MASK,$T0,$T0 # 0 - vpand $MASK,$T1,$T1 # 1 - vpand $MASK,$T3,$T3 # 3 - vpor 
32(%rcx),$T4,$T4 # padbit, yes, always - - sub \$64,$len - jnz .Loop_avx2$suffix - - .byte 0x66,0x90 -.Ltail_avx2$suffix: - ################################################################ - # while above multiplications were by r^4 in all lanes, in last - # iteration we multiply least significant lane by r^4 and most - # significant one by r, so copy of above except that references - # to the precomputed table are displaced by 4... - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - vmovdqu `32*0+4`(%rsp),$T0 # r0^4 - vpaddq $H1,$T1,$H1 - vmovdqu `32*1+4`(%rsp),$T1 # r1^4 - vpaddq $H3,$T3,$H3 - vmovdqu `32*3+4`(%rsp),$T2 # r2^4 - vpaddq $H4,$T4,$H4 - vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 - vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 - - vpmuludq $H2,$T0,$D2 # d2 = h2*r0 - vpmuludq $H2,$T1,$D3 # d3 = h2*r1 - vpmuludq $H2,$T2,$D4 # d4 = h2*r2 - vpmuludq $H2,$T3,$D0 # d0 = h2*s3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - - vpmuludq $H0,$T1,$T4 # h0*r1 - vpmuludq $H1,$T1,$H2 # h1*r1 - vpaddq $T4,$D1,$D1 # d1 += h0*r1 - vpaddq $H2,$D2,$D2 # d2 += h1*r1 - vpmuludq $H3,$T1,$T4 # h3*r1 - vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 - vpaddq $T4,$D4,$D4 # d4 += h3*r1 - vpaddq $H2,$D0,$D0 # d0 += h4*s1 - - vpmuludq $H0,$T0,$T4 # h0*r0 - vpmuludq $H1,$T0,$H2 # h1*r0 - vpaddq $T4,$D0,$D0 # d0 += h0*r0 - vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 - vpaddq $H2,$D1,$D1 # d1 += h1*r0 - vpmuludq $H3,$T0,$T4 # h3*r0 - vpmuludq $H4,$T0,$H2 # h4*r0 - vpaddq $T4,$D3,$D3 # d3 += h3*r0 - vpaddq $H2,$D4,$D4 # d4 += h4*r0 - - vpmuludq $H3,$T1,$T4 # h3*s2 - vpmuludq $H4,$T1,$H2 # h4*s2 - vpaddq $T4,$D0,$D0 # d0 += h3*s2 - vpaddq $H2,$D1,$D1 # d1 += h4*s2 - vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 - vpmuludq $H1,$T2,$T4 # h1*r2 - vpmuludq $H0,$T2,$T2 # h0*r2 - vpaddq $T4,$D3,$D3 # d3 += h1*r2 - vpaddq $T2,$D2,$D2 # d2 += h0*r2 - - vpmuludq $H1,$H2,$T4 # h1*r3 - vpmuludq $H0,$H2,$H2 # h0*r3 - vpaddq $T4,$D4,$D4 # d4 += h1*r3 - vpaddq $H2,$D3,$D3 # d3 += h0*r3 - vpmuludq $H3,$T3,$T4 # h3*s3 - vpmuludq $H4,$T3,$H2 # h4*s3 - vpaddq $T4,$D1,$D1 # d1 += h3*s3 - vpaddq $H2,$D2,$D2 # d2 += h4*s3 - - vpmuludq $H3,$S4,$H3 # h3*s4 - vpmuludq $H4,$S4,$H4 # h4*s4 - vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 - vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 - vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 - vpmuludq $H1,$S4,$H0 # h1*s4 - vmovdqa 64(%rcx),$MASK # .Lmask26 - vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 - vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 - - ################################################################ - # horizontal addition - - vpsrldq \$8,$D1,$T1 - vpsrldq \$8,$H2,$T2 - vpsrldq \$8,$H3,$T3 - vpsrldq \$8,$H4,$T4 - vpsrldq \$8,$H0,$T0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - - vpermq \$0x2,$H3,$T3 - vpermq \$0x2,$H4,$T4 - vpermq \$0x2,$H0,$T0 - vpermq \$0x2,$D1,$T1 - vpermq \$0x2,$H2,$T2 - vpaddq $T3,$H3,$H3 - vpaddq $T4,$H4,$H4 - vpaddq $T0,$H0,$H0 - vpaddq $T1,$D1,$D1 - vpaddq $T2,$H2,$H2 - - ################################################################ - # lazy reduction - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$D1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 
- - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) -___ -$code.=<<___ if ($win64); - vmovdqa -0xb0(%r10),%xmm6 - vmovdqa -0xa0(%r10),%xmm7 - vmovdqa -0x90(%r10),%xmm8 - vmovdqa -0x80(%r10),%xmm9 - vmovdqa -0x70(%r10),%xmm10 - vmovdqa -0x60(%r10),%xmm11 - vmovdqa -0x50(%r10),%xmm12 - vmovdqa -0x40(%r10),%xmm13 - vmovdqa -0x30(%r10),%xmm14 - vmovdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx2_epilogue$suffix: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - vzeroupper - RET -.cfi_endproc -___ -if($avx > 2 && $avx512) { -my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); -my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); -my $PADBIT="%zmm30"; - -map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain -map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); -map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); -map(s/%y/%z/,($MASK)); - -$code.=<<___; -.cfi_startproc -.Lblocks_avx512: - mov \$15,%eax - kmovw %eax,%k2 -___ -$code.=<<___ if (!$win64); - lea 8(%rsp),%r10 -.cfi_def_cfa_register %r10 - sub \$0x128,%rsp -___ -$code.=<<___ if ($win64); - lea 8(%rsp),%r10 - sub \$0x1c8,%rsp - vmovdqa %xmm6,-0xb0(%r10) - vmovdqa %xmm7,-0xa0(%r10) - vmovdqa %xmm8,-0x90(%r10) - vmovdqa %xmm9,-0x80(%r10) - vmovdqa %xmm10,-0x70(%r10) - vmovdqa %xmm11,-0x60(%r10) - vmovdqa %xmm12,-0x50(%r10) - vmovdqa %xmm13,-0x40(%r10) - vmovdqa %xmm14,-0x30(%r10) - vmovdqa %xmm15,-0x20(%r10) -.Ldo_avx512_body: -___ -$code.=<<___; - lea .Lconst(%rip),%rcx - lea 48+64($ctx),$ctx # size optimization - vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 - - # expand pre-calculated table - vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} - and \$-512,%rsp - vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} - mov \$0x20,%rax - vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} - vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} - vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} - vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} - vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} - vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} - vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} - vpermd $D0,$T2,$R0 # 00003412 -> 14243444 - vpbroadcastq 64(%rcx),$MASK # .Lmask26 - vpermd $D1,$T2,$R1 - vpermd $T0,$T2,$S1 - vpermd $D2,$T2,$R2 - vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 - vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 - vpermd $T1,$T2,$S2 - vmovdqu64 $R1,0x00(%rsp,%rax){%k2} - vpsrlq \$32,$R1,$T1 - vpermd $D3,$T2,$R3 - vmovdqa64 $S1,0x40(%rsp){%k2} - vpermd $T3,$T2,$S3 - vpermd $D4,$T2,$R4 - vmovdqu64 $R2,0x40(%rsp,%rax){%k2} - vpermd $T4,$T2,$S4 - vmovdqa64 $S2,0x80(%rsp){%k2} - vmovdqu64 $R3,0x80(%rsp,%rax){%k2} - vmovdqa64 $S3,0xc0(%rsp){%k2} - vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} - vmovdqa64 $S4,0x100(%rsp){%k2} - - ################################################################ - # calculate 5th through 8th powers of the key - # - # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 - # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 - # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 - # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 - # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 - - vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 - vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 - vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 - vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 - vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 - vpsrlq \$32,$R2,$T2 - - vpmuludq $T1,$S4,$M0 - vpmuludq $T1,$R0,$M1 - vpmuludq $T1,$R1,$M2 - vpmuludq $T1,$R2,$M3 - vpmuludq $T1,$R3,$M4 - vpsrlq \$32,$R3,$T3 - vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 - vpaddq $M1,$D1,$D1 # d1 += r1'*r0 - vpaddq $M2,$D2,$D2 # d2 += r1'*r1 - vpaddq $M3,$D3,$D3 # d3 += r1'*r2 - vpaddq $M4,$D4,$D4 # d4 += r1'*r3 - - vpmuludq $T2,$S3,$M0 - vpmuludq $T2,$S4,$M1 - vpmuludq $T2,$R1,$M3 - vpmuludq $T2,$R2,$M4 - vpmuludq $T2,$R0,$M2 - vpsrlq \$32,$R4,$T4 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 - vpaddq $M3,$D3,$D3 # d3 += r2'*r1 - vpaddq $M4,$D4,$D4 # d4 += r2'*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*r0 - - vpmuludq $T3,$S2,$M0 - vpmuludq $T3,$R0,$M3 - vpmuludq $T3,$R1,$M4 - vpmuludq $T3,$S3,$M1 - vpmuludq $T3,$S4,$M2 - vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 - vpaddq $M3,$D3,$D3 # d3 += r3'*r0 - vpaddq $M4,$D4,$D4 # d4 += r3'*r1 - vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 - vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 - - vpmuludq $T4,$S4,$M3 - vpmuludq $T4,$R0,$M4 - vpmuludq $T4,$S1,$M0 - vpmuludq $T4,$S2,$M1 - vpmuludq $T4,$S3,$M2 - vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 - vpaddq $M4,$D4,$D4 # d4 += r2'*r0 - vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 - vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 - vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 - - ################################################################ - # load input - vmovdqu64 16*0($inp),%z#$T3 - vmovdqu64 16*4($inp),%z#$T4 - lea 16*8($inp),$inp - - ################################################################ - # lazy reduction - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D4,$M4 - vpandq $MASK,$D4,$D4 - - vpsrlq \$26,$D1,$M1 - vpandq $MASK,$D1,$D1 - vpaddq $M1,$D2,$D2 # d1 -> d2 - - vpaddq $M4,$D0,$D0 - vpsllq \$2,$M4,$M4 - vpaddq $M4,$D0,$D0 # d4 -> d0 - - vpsrlq \$26,$D2,$M2 - vpandq $MASK,$D2,$D2 - vpaddq $M2,$D3,$D3 # d2 -> d3 - - vpsrlq \$26,$D0,$M0 - vpandq $MASK,$D0,$D0 - vpaddq $M0,$D1,$D1 # d0 -> d1 - - vpsrlq \$26,$D3,$M3 - vpandq $MASK,$D3,$D3 - vpaddq $M3,$D4,$D4 # d3 -> d4 - - ################################################################ - # at this point we have 14243444 in $R0-$S4 and 05060708 in - # $D0-$D4, ... 
- - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - # ... since input 64-bit lanes are ordered as 73625140, we could - # "vperm" it to 76543210 (here and in each loop iteration), *or* - # we could just flow along, hence the goal for $R0-$S4 is - # 1858286838784888 ... - - vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: - mov \$0x7777,%eax - kmovw %eax,%k1 - - vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- - vpermd $R1,$M0,$R1 - vpermd $R2,$M0,$R2 - vpermd $R3,$M0,$R3 - vpermd $R4,$M0,$R4 - - vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 - vpermd $D1,$M0,${R1}{%k1} - vpermd $D2,$M0,${R2}{%k1} - vpermd $D3,$M0,${R3}{%k1} - vpermd $D4,$M0,${R4}{%k1} - - vpslld \$2,$R1,$S1 # *5 - vpslld \$2,$R2,$S2 - vpslld \$2,$R3,$S3 - vpslld \$2,$R4,$S4 - vpaddd $R1,$S1,$S1 - vpaddd $R2,$S2,$S2 - vpaddd $R3,$S3,$S3 - vpaddd $R4,$S4,$S4 - - vpbroadcastq 32(%rcx),$PADBIT # .L129 - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - vporq $T3,$T2,$T2 - vpsrlq \$26,$T0,$T1 - vpsrlq \$14,$T4,$T3 - vpsrlq \$40,$T4,$T4 # 4 - vpandq $MASK,$T2,$T2 # 2 - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - vpaddq $H2,$T2,$H2 # accumulate input - sub \$192,$len - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - ################################################################ - # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 - # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 - # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 - # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 - # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 - # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 - # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 - # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 - # \________/\___________/ - ################################################################ - #vpaddq $H2,$T2,$H2 # accumulate input - - # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 - # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 - # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 - # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 - # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 - # - # however, as h2 is "chronologically" first one available pull - # corresponding operations up, so it's - # - # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 - # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 - # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 - # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 - # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpaddq $H0,$T0,$H0 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu64 16*0($inp),$T3 # load input - vmovdqu64 16*4($inp),$T4 - lea 16*8($inp),$inp - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vpunpcklqdq $T4,$T3,$T0 # transpose input - vpunpckhqdq $T4,$T3,$T4 - - vpmuludq $H3,$R0,$M3 - 
vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpaddq $M3,$D3,$D3 # d3 += h4*s4 - vpmuludq $H3,$S4,$M2 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$52,$T0,$T2 # splat input - vpsllq \$12,$T4,$T3 - - vpsrlq \$26,$D3,$H3 - vpandq $MASK,$D3,$D3 - vpaddq $H3,$D4,$H4 # h3 -> h4 - - vporq $T3,$T2,$T2 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpandq $MASK,$T2,$T2 # 2 - - vpsrlq \$26,$H4,$D4 - vpandq $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpandq $MASK,$H1,$H1 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpaddq $T2,$H2,$H2 # modulo-scheduled - vpsrlq \$26,$T0,$T1 - - vpsrlq \$26,$H2,$D2 - vpandq $MASK,$H2,$H2 - vpaddq $D2,$D3,$H3 # h2 -> h3 - - vpsrlq \$14,$T4,$T3 - - vpsrlq \$26,$H0,$D0 - vpandq $MASK,$H0,$H0 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$40,$T4,$T4 # 4 - - vpsrlq \$26,$H3,$D3 - vpandq $MASK,$H3,$H3 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpandq $MASK,$T0,$T0 # 0 - #vpandq $MASK,$T1,$T1 # 1 - #vpandq $MASK,$T3,$T3 # 3 - #vporq $PADBIT,$T4,$T4 # padbit, yes, always - - sub \$128,$len - ja .Loop_avx512 - -.Ltail_avx512: - ################################################################ - # while above multiplications were by r^8 in all lanes, in last - # iteration we multiply least significant lane by r^8 and most - # significant one by r, that's why table gets shifted... 
- - vpsrlq \$32,$R0,$R0 # 0105020603070408 - vpsrlq \$32,$R1,$R1 - vpsrlq \$32,$R2,$R2 - vpsrlq \$32,$S3,$S3 - vpsrlq \$32,$S4,$S4 - vpsrlq \$32,$R3,$R3 - vpsrlq \$32,$R4,$R4 - vpsrlq \$32,$S1,$S1 - vpsrlq \$32,$S2,$S2 - - ################################################################ - # load either next or last 64 byte of input - lea ($inp,$len),$inp - - #vpaddq $H2,$T2,$H2 # accumulate input - vpaddq $H0,$T0,$H0 - - vpmuludq $H2,$R1,$D3 # d3 = h2*r1 - vpmuludq $H2,$R2,$D4 # d4 = h2*r2 - vpmuludq $H2,$S3,$D0 # d0 = h2*s3 - vpandq $MASK,$T1,$T1 # 1 - vpmuludq $H2,$S4,$D1 # d1 = h2*s4 - vpandq $MASK,$T3,$T3 # 3 - vpmuludq $H2,$R0,$D2 # d2 = h2*r0 - vporq $PADBIT,$T4,$T4 # padbit, yes, always - vpaddq $H1,$T1,$H1 # accumulate input - vpaddq $H3,$T3,$H3 - vpaddq $H4,$T4,$H4 - - vmovdqu 16*0($inp),%x#$T0 - vpmuludq $H0,$R3,$M3 - vpmuludq $H0,$R4,$M4 - vpmuludq $H0,$R0,$M0 - vpmuludq $H0,$R1,$M1 - vpaddq $M3,$D3,$D3 # d3 += h0*r3 - vpaddq $M4,$D4,$D4 # d4 += h0*r4 - vpaddq $M0,$D0,$D0 # d0 += h0*r0 - vpaddq $M1,$D1,$D1 # d1 += h0*r1 - - vmovdqu 16*1($inp),%x#$T1 - vpmuludq $H1,$R2,$M3 - vpmuludq $H1,$R3,$M4 - vpmuludq $H1,$S4,$M0 - vpmuludq $H0,$R2,$M2 - vpaddq $M3,$D3,$D3 # d3 += h1*r2 - vpaddq $M4,$D4,$D4 # d4 += h1*r3 - vpaddq $M0,$D0,$D0 # d0 += h1*s4 - vpaddq $M2,$D2,$D2 # d2 += h0*r2 - - vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 - vpmuludq $H3,$R0,$M3 - vpmuludq $H3,$R1,$M4 - vpmuludq $H1,$R0,$M1 - vpmuludq $H1,$R1,$M2 - vpaddq $M3,$D3,$D3 # d3 += h3*r0 - vpaddq $M4,$D4,$D4 # d4 += h3*r1 - vpaddq $M1,$D1,$D1 # d1 += h1*r0 - vpaddq $M2,$D2,$D2 # d2 += h1*r1 - - vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 - vpmuludq $H4,$S4,$M3 - vpmuludq $H4,$R0,$M4 - vpmuludq $H3,$S2,$M0 - vpmuludq $H3,$S3,$M1 - vpmuludq $H3,$S4,$M2 - vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 - vpaddq $M4,$D4,$D4 # d4 += h4*r0 - vpaddq $M0,$D0,$D0 # d0 += h3*s2 - vpaddq $M1,$D1,$D1 # d1 += h3*s3 - vpaddq $M2,$D2,$D2 # d2 += h3*s4 - - vpmuludq $H4,$S1,$M0 - vpmuludq $H4,$S2,$M1 - vpmuludq $H4,$S3,$M2 - vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 - vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 - vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 - - ################################################################ - # horizontal addition - - mov \$1,%eax - vpermq \$0xb1,$H3,$D3 - vpermq \$0xb1,$D4,$H4 - vpermq \$0xb1,$H0,$D0 - vpermq \$0xb1,$H1,$D1 - vpermq \$0xb1,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - kmovw %eax,%k3 - vpermq \$0x2,$H3,$D3 - vpermq \$0x2,$H4,$D4 - vpermq \$0x2,$H0,$D0 - vpermq \$0x2,$H1,$D1 - vpermq \$0x2,$H2,$D2 - vpaddq $D3,$H3,$H3 - vpaddq $D4,$H4,$H4 - vpaddq $D0,$H0,$H0 - vpaddq $D1,$H1,$H1 - vpaddq $D2,$H2,$H2 - - vextracti64x4 \$0x1,$H3,%y#$D3 - vextracti64x4 \$0x1,$H4,%y#$D4 - vextracti64x4 \$0x1,$H0,%y#$D0 - vextracti64x4 \$0x1,$H1,%y#$D1 - vextracti64x4 \$0x1,$H2,%y#$D2 - vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case - vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 - vpaddq $D0,$H0,${H0}{%k3}{z} - vpaddq $D1,$H1,${H1}{%k3}{z} - vpaddq $D2,$H2,${H2}{%k3}{z} -___ -map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); -map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); -$code.=<<___; - ################################################################ - # lazy reduction (interleaved with input splat) - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpsrldq \$6,$T0,$T2 # splat input - vpsrldq \$6,$T1,$T3 - vpunpckhqdq $T1,$T0,$T4 # 4 - vpaddq $D3,$H4,$H4 # h3 -> h4 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpunpcklqdq $T3,$T2,$T2 
# 2:3 - vpunpcklqdq $T1,$T0,$T0 # 0:1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H4,$D4 - vpand $MASK,$H4,$H4 - - vpsrlq \$26,$H1,$D1 - vpand $MASK,$H1,$H1 - vpsrlq \$30,$T2,$T3 - vpsrlq \$4,$T2,$T2 - vpaddq $D1,$H2,$H2 # h1 -> h2 - - vpaddq $D4,$H0,$H0 - vpsllq \$2,$D4,$D4 - vpsrlq \$26,$T0,$T1 - vpsrlq \$40,$T4,$T4 # 4 - vpaddq $D4,$H0,$H0 # h4 -> h0 - - vpsrlq \$26,$H2,$D2 - vpand $MASK,$H2,$H2 - vpand $MASK,$T2,$T2 # 2 - vpand $MASK,$T0,$T0 # 0 - vpaddq $D2,$H3,$H3 # h2 -> h3 - - vpsrlq \$26,$H0,$D0 - vpand $MASK,$H0,$H0 - vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 - vpand $MASK,$T1,$T1 # 1 - vpaddq $D0,$H1,$H1 # h0 -> h1 - - vpsrlq \$26,$H3,$D3 - vpand $MASK,$H3,$H3 - vpand $MASK,$T3,$T3 # 3 - vpor 32(%rcx),$T4,$T4 # padbit, yes, always - vpaddq $D3,$H4,$H4 # h3 -> h4 - - lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 - add \$64,$len - jnz .Ltail_avx2$suffix - - vpsubq $T2,$H2,$H2 # undo input accumulation - vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced - vmovd %x#$H1,`4*1-48-64`($ctx) - vmovd %x#$H2,`4*2-48-64`($ctx) - vmovd %x#$H3,`4*3-48-64`($ctx) - vmovd %x#$H4,`4*4-48-64`($ctx) - vzeroall -___ -$code.=<<___ if ($win64); - movdqa -0xb0(%r10),%xmm6 - movdqa -0xa0(%r10),%xmm7 - movdqa -0x90(%r10),%xmm8 - movdqa -0x80(%r10),%xmm9 - movdqa -0x70(%r10),%xmm10 - movdqa -0x60(%r10),%xmm11 - movdqa -0x50(%r10),%xmm12 - movdqa -0x40(%r10),%xmm13 - movdqa -0x30(%r10),%xmm14 - movdqa -0x20(%r10),%xmm15 - lea -8(%r10),%rsp -.Ldo_avx512_epilogue: -___ -$code.=<<___ if (!$win64); - lea -8(%r10),%rsp -.cfi_def_cfa_register %rsp -___ -$code.=<<___; - RET -.cfi_endproc -___ - -} - -} - -&declare_function("poly1305_blocks_avx2", 32, 4); -poly1305_blocks_avxN(0); -&end_function("poly1305_blocks_avx2"); - -####################################################################### -if ($avx>2) { -# On entry we have input length divisible by 64. But since inner loop -# processes 128 bytes per iteration, cases when length is not divisible -# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this -# reason stack layout is kept identical to poly1305_blocks_avx2. If not -# for this tail, we wouldn't have to even allocate stack frame... - -if($kernel) { - $code .= "#ifdef CONFIG_AS_AVX512\n"; -} - -&declare_function("poly1305_blocks_avx512", 32, 4); -poly1305_blocks_avxN(1); -&end_function("poly1305_blocks_avx512"); - -if ($kernel) { - $code .= "#endif\n"; -} - -if (!$kernel && $avx>3) { -######################################################################## -# VPMADD52 version using 2^44 radix. -# -# One can argue that base 2^52 would be more natural. Well, even though -# some operations would be more natural, one has to recognize couple of -# things. Base 2^52 doesn't provide advantage over base 2^44 if you look -# at amount of multiply-n-accumulate operations. Secondly, it makes it -# impossible to pre-compute multiples of 5 [referred to as s[]/sN in -# reference implementations], which means that more such operations -# would have to be performed in inner loop, which in turn makes critical -# path longer. In other words, even though base 2^44 reduction might -# look less elegant, overall critical path is actually shorter... - -######################################################################## -# Layout of opaque area is following. 
-# -# unsigned __int64 h[3]; # current hash value base 2^44 -# unsigned __int64 s[2]; # key value*20 base 2^44 -# unsigned __int64 r[3]; # key value base 2^44 -# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; -# # r^n positions reflect -# # placement in register, not -# # memory, R[3] is R[1]*20 - -$code.=<<___; -.type poly1305_init_base2_44,\@function,3 -.align 32 -poly1305_init_base2_44: - xor %eax,%eax - mov %rax,0($ctx) # initialize hash value - mov %rax,8($ctx) - mov %rax,16($ctx) - -.Linit_base2_44: - lea poly1305_blocks_vpmadd52(%rip),%r10 - lea poly1305_emit_base2_44(%rip),%r11 - - mov \$0x0ffffffc0fffffff,%rax - mov \$0x0ffffffc0ffffffc,%rcx - and 0($inp),%rax - mov \$0x00000fffffffffff,%r8 - and 8($inp),%rcx - mov \$0x00000fffffffffff,%r9 - and %rax,%r8 - shrd \$44,%rcx,%rax - mov %r8,40($ctx) # r0 - and %r9,%rax - shr \$24,%rcx - mov %rax,48($ctx) # r1 - lea (%rax,%rax,4),%rax # *5 - mov %rcx,56($ctx) # r2 - shl \$2,%rax # magic <<2 - lea (%rcx,%rcx,4),%rcx # *5 - shl \$2,%rcx # magic <<2 - mov %rax,24($ctx) # s1 - mov %rcx,32($ctx) # s2 - movq \$-1,64($ctx) # write impossible value -___ -$code.=<<___ if ($flavour !~ /elf32/); - mov %r10,0(%rdx) - mov %r11,8(%rdx) -___ -$code.=<<___ if ($flavour =~ /elf32/); - mov %r10d,0(%rdx) - mov %r11d,4(%rdx) -___ -$code.=<<___; - mov \$1,%eax - RET -.size poly1305_init_base2_44,.-poly1305_init_base2_44 -___ -{ -my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); -my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); -my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52,\@function,4 -.align 32 -poly1305_blocks_vpmadd52: - shr \$4,$len - jz .Lno_data_vpmadd52 # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - # if powers of the key are not calculated yet, process up to 3 - # blocks with this single-block subroutine, otherwise ensure that - # length is divisible by 2 blocks and pass the rest down to next - # subroutine... - - mov \$3,%rax - mov \$1,%r10 - cmp \$4,$len # is input long - cmovae %r10,%rax - test %r8,%r8 # is power value impossible? - cmovns %r10,%rax - - and $len,%rax # is input of favourable length? 
- jz .Lblocks_vpmadd52_4x - - sub %rax,$len - mov \$7,%r10d - mov \$1,%r11d - kmovw %r10d,%k7 - lea .L2_44_inp_permd(%rip),%r10 - kmovw %r11d,%k1 - - vmovq $padbit,%x#$PAD - vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd - vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift - vpermq \$0xcf,$PAD,$PAD - vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask - - vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value - vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys - vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} - vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} - - vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt - vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft - - jmp .Loop_vpmadd52 - -.align 32 -.Loop_vpmadd52: - vmovdqu32 0($inp),%x#$T0 # load input as ----3210 - lea 16($inp),$inp - - vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 - vpsrlvq $inp_shift,$T0,$T0 - vpandq $reduc_mask,$T0,$T0 - vporq $PAD,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo # accumulate input - - vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value - vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} - vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} - - vpxord $Dlo,$Dlo,$Dlo - vpxord $Dhi,$Dhi,$Dhi - - vpmadd52luq $r2r1r0,$H0,$Dlo - vpmadd52huq $r2r1r0,$H0,$Dhi - - vpmadd52luq $r1r0s2,$H1,$Dlo - vpmadd52huq $r1r0s2,$H1,$Dhi - - vpmadd52luq $r0s2s1,$H2,$Dlo - vpmadd52huq $r0s2s1,$H2,$Dhi - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword - vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword - vpandq $reduc_mask,$Dlo,$Dlo - - vpaddq $T0,$Dhi,$Dhi - - vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword - - vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) - - vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word - vpandq $reduc_mask,$Dlo,$Dlo - - vpermq \$0b10010011,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} - - vpaddq $T0,$Dlo,$Dlo - vpsllq \$2,$T0,$T0 - - vpaddq $T0,$Dlo,$Dlo - - dec %rax # len-=16 - jnz .Loop_vpmadd52 - - vmovdqu64 $Dlo,0($ctx){%k7} # store hash value - - test $len,$len - jnz .Lblocks_vpmadd52_4x - -.Lno_data_vpmadd52: - RET -.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 -___ -} -{ -######################################################################## -# As implied by its name 4x subroutine processes 4 blocks in parallel -# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power -# and is handled in 256-bit %ymm registers. - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_4x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_4x: - shr \$4,$len - jz .Lno_data_vpmadd52_4x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - -.Lblocks_vpmadd52_4x: - vpbroadcastq $padbit,$PAD - - vmovdqa64 .Lx_mask44(%rip),$mask44 - mov \$5,%eax - vmovdqa64 .Lx_mask42(%rip),$mask42 - kmovw %eax,%k1 # used in 2x path - - test %r8,%r8 # is power value impossible? - js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? - jnz .Lblocks_vpmadd52_2x_do - -.Lblocks_vpmadd52_4x_do: - vpbroadcastq 64($ctx),$R0 # load 4th power of the key - vpbroadcastq 96($ctx),$R1 - vpbroadcastq 128($ctx),$R2 - vpbroadcastq 160($ctx),$S1 - -.Lblocks_vpmadd52_4x_key_loaded: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - test \$7,$len # is len 8*n? 
- jz .Lblocks_vpmadd52_8x - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 3-1-2-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$4,$len - jz .Ltail_vpmadd52_4x - jmp .Loop_vpmadd52_4x - ud2 - -.align 32 -.Linit_vpmadd52: - vmovq 24($ctx),%x#$S1 # load key - vmovq 56($ctx),%x#$H2 - vmovq 32($ctx),%x#$S2 - vmovq 40($ctx),%x#$R0 - vmovq 48($ctx),%x#$R1 - - vmovdqa $R0,$H0 - vmovdqa $R1,$H1 - vmovdqa $H2,$R2 - - mov \$2,%eax - -.Lmul_init_vpmadd52: - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - dec %eax - jz .Ldone_init_vpmadd52 - - vpunpcklqdq $R1,$H1,$R1 # 1,2 - vpbroadcastq %x#$H1,%x#$H1 # 2,2 - vpunpcklqdq $R2,$H2,$R2 - vpbroadcastq %x#$H2,%x#$H2 - vpunpcklqdq $R0,$H0,$R0 - vpbroadcastq %x#$H0,%x#$H0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R1,$S1,$S1 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S1,$S1 - vpsllq \$2,$S2,$S2 - - jmp .Lmul_init_vpmadd52 - ud2 - -.align 32 -.Ldone_init_vpmadd52: - vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 - vinserti128 \$1,%x#$R2,$H2,$R2 - vinserti128 \$1,%x#$R0,$H0,$R0 - - vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 - vpermq \$0b11011000,$R2,$R2 - vpermq \$0b11011000,$R0,$R0 - - vpsllq \$2,$R1,$S1 # S1 = R1*5*4 - vpaddq $R1,$S1,$S1 - vpsllq \$2,$S1,$S1 - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - - test \$3,$len # is length 4*n+2? 
- jnz .Ldone_init_vpmadd52_2x - - vmovdqu64 $R0,64($ctx) # save key powers - vpbroadcastq %x#$R0,$R0 # broadcast 4th power - vmovdqu64 $R1,96($ctx) - vpbroadcastq %x#$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpbroadcastq %x#$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpbroadcastq %x#$S1,$S1 - - jmp .Lblocks_vpmadd52_4x_key_loaded - ud2 - -.align 32 -.Ldone_init_vpmadd52_2x: - vmovdqu64 $R0,64($ctx) # save key powers - vpsrldq \$8,$R0,$R0 # 0-1-0-2 - vmovdqu64 $R1,96($ctx) - vpsrldq \$8,$R1,$R1 - vmovdqu64 $R2,128($ctx) - vpsrldq \$8,$R2,$R2 - vmovdqu64 $S1,160($ctx) - vpsrldq \$8,$S1,$S1 - jmp .Lblocks_vpmadd52_2x_key_loaded - ud2 - -.align 32 -.Lblocks_vpmadd52_2x_do: - vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers - vmovdqu64 160+8($ctx),${S1}{%k1}{z} - vmovdqu64 64+8($ctx),${R0}{%k1}{z} - vmovdqu64 96+8($ctx),${R1}{%k1}{z} - -.Lblocks_vpmadd52_2x_key_loaded: - vmovdqu64 16*0($inp),$T2 # load data - vpxorq $T3,$T3,$T3 - lea 16*2($inp),$inp - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as x-1-x-0 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - jmp .Ltail_vpmadd52_2x - ud2 - -.align 32 -.Loop_vpmadd52_4x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*2($inp),$T3 - lea 16*4($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$4,$len # len-=64 - jnz .Loop_vpmadd52_4x - -.Ltail_vpmadd52_4x: - vmovdqu64 128($ctx),$R2 # load all key powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - -.Ltail_vpmadd52_2x: - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - 
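Every "partial reduction" block above follows the same carry-propagation pattern. The vector code keeps the low and high 52-bit halves of each product in separate registers (the Dlo/Dhi pairs), which is why the high halves are realigned with a left shift of 8 (52 - 44) or 10 (52 - 42) before they join the carry chain; the sketch below collapses each limb's sum into one wide integer to show just the arithmetic. An illustrative scalar rendering (unsigned __int128 is an assumption), not the kernel's code:

    #include <stdint.h>

    #define MASK44 ((1ULL << 44) - 1)
    #define MASK42 ((1ULL << 42) - 1)

    /*
     * d0/d1/d2 are the unreduced per-limb sums.  The carry out of the top
     * limb has weight 2^130 == 5 (mod 2^130 - 5), which is what the
     * "add, then add the same value shifted left by 2" sequence exploits.
     */
    static void partial_reduce_2_44(uint64_t h[3], unsigned __int128 d0,
                                    unsigned __int128 d1, unsigned __int128 d2)
    {
        uint64_t c;

        c = (uint64_t)(d0 >> 44);  h[0] = (uint64_t)d0 & MASK44;  d1 += c;
        c = (uint64_t)(d1 >> 44);  h[1] = (uint64_t)d1 & MASK44;  d2 += c;
        c = (uint64_t)(d2 >> 42);  h[2] = (uint64_t)d2 & MASK42;

        h[0] += c + (c << 2);      /* fold the carry back in: c * 5      */
        c     = h[0] >> 44;        /* the "additional step" in the asm   */
        h[0] &= MASK44;
        h[1] += c;
    }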
vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - # at this point $len is - # either 4*n+2 or 0... - sub \$2,$len # len-=32 - ja .Lblocks_vpmadd52_4x_do - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_4x: - RET -.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x -___ -} -{ -######################################################################## -# As implied by its name 8x subroutine processes 8 blocks in parallel... -# This is intermediate version, as it's used only in cases when input -# length is either 8*n, 8*n+1 or 8*n+2... - -my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); -my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); -my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); -my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); - -$code.=<<___; -.type poly1305_blocks_vpmadd52_8x,\@function,4 -.align 32 -poly1305_blocks_vpmadd52_8x: - shr \$4,$len - jz .Lno_data_vpmadd52_8x # too short - - shl \$40,$padbit - mov 64($ctx),%r8 # peek on power of the key - - vmovdqa64 .Lx_mask44(%rip),$mask44 - vmovdqa64 .Lx_mask42(%rip),$mask42 - - test %r8,%r8 # is power value impossible? 
- js .Linit_vpmadd52 # if it is, then init R[4] - - vmovq 0($ctx),%x#$H0 # load current hash value - vmovq 8($ctx),%x#$H1 - vmovq 16($ctx),%x#$H2 - -.Lblocks_vpmadd52_8x: - ################################################################ - # fist we calculate more key powers - - vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers - vmovdqu64 160($ctx),$S1 - vmovdqu64 64($ctx),$R0 - vmovdqu64 96($ctx),$R1 - - vpsllq \$2,$R2,$S2 # S2 = R2*5*4 - vpaddq $R2,$S2,$S2 - vpsllq \$2,$S2,$S2 - - vpbroadcastq %x#$R2,$RR2 # broadcast 4th power - vpbroadcastq %x#$R0,$RR0 - vpbroadcastq %x#$R1,$RR1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $RR2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $RR2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $RR2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $RR2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $RR2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $RR2,$R0,$D2hi - - vpmadd52luq $RR0,$R0,$D0lo - vpmadd52huq $RR0,$R0,$D0hi - vpmadd52luq $RR0,$R1,$D1lo - vpmadd52huq $RR0,$R1,$D1hi - vpmadd52luq $RR0,$R2,$D2lo - vpmadd52huq $RR0,$R2,$D2hi - - vpmadd52luq $RR1,$S2,$D0lo - vpmadd52huq $RR1,$S2,$D0hi - vpmadd52luq $RR1,$R0,$D1lo - vpmadd52huq $RR1,$R0,$D1hi - vpmadd52luq $RR1,$R1,$D2lo - vpmadd52huq $RR1,$R1,$D2hi - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$RR0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$RR1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$RR2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$RR0,$RR0 - - vpsrlq \$44,$RR0,$tmp # additional step - vpandq $mask44,$RR0,$RR0 - - vpaddq $tmp,$RR1,$RR1 - - ################################################################ - # At this point Rx holds 1324 powers, RRx - 5768, and the goal - # is 15263748, which reflects how data is loaded... 
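The key powers computed and stored above exist so that several blocks can be folded in per multiplication: Horner's rule over four blocks, h = ((((h+m1)r + m2)r + m3)r + m4)r, regroups into (h+m1)r^4 + m2*r^3 + m3*r^2 + m4*r, so each vector lane multiplies by one fixed power of r and the lanes are only combined at the end (the 8x path does the same with r^8..r). A toy, self-contained check of that regrouping, using a small Mersenne prime in place of 2^130 - 5 (the helper names and the modulus are mine, purely for illustration):

    #include <assert.h>
    #include <stdint.h>

    #define TOY_P ((1ULL << 61) - 1)    /* stand-in modulus, not 2^130 - 5 */

    static uint64_t mul_p(uint64_t a, uint64_t b)
    {
        return (uint64_t)(((unsigned __int128)a * b) % TOY_P);
    }

    static uint64_t add_p(uint64_t a, uint64_t b)
    {
        return (a + b) % TOY_P;
    }

    int main(void)
    {
        uint64_t h = 12345, r = 0x0123456789abcdefULL % TOY_P;
        uint64_t m[4] = { 1, 2, 3, 4 };
        uint64_t serial = h, i;

        /* serial Horner: (((h+m1)r + m2)r + m3)r + m4, all times r */
        for (i = 0; i < 4; i++)
            serial = mul_p(add_p(serial, m[i]), r);

        /* per-lane form using fixed key powers r^4..r */
        uint64_t r2 = mul_p(r, r), r3 = mul_p(r2, r), r4 = mul_p(r3, r);
        uint64_t lanes = add_p(add_p(mul_p(add_p(h, m[0]), r4),
                                     mul_p(m[1], r3)),
                               add_p(mul_p(m[2], r2), mul_p(m[3], r)));

        assert(serial == lanes);
        return 0;
    }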
- - vpunpcklqdq $R2,$RR2,$T2 # 3748 - vpunpckhqdq $R2,$RR2,$R2 # 1526 - vpunpcklqdq $R0,$RR0,$T0 - vpunpckhqdq $R0,$RR0,$R0 - vpunpcklqdq $R1,$RR1,$T1 - vpunpckhqdq $R1,$RR1,$R1 -___ -######## switch to %zmm -map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); -map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); - -$code.=<<___; - vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 - vshufi64x2 \$0x44,$R0,$T0,$RR0 - vshufi64x2 \$0x44,$R1,$T1,$RR1 - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - - vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 - vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 - vpaddq $RR2,$SS2,$SS2 - vpaddq $RR1,$SS1,$SS1 - vpsllq \$2,$SS2,$SS2 - vpsllq \$2,$SS1,$SS1 - - vpbroadcastq $padbit,$PAD - vpbroadcastq %x#$mask44,$mask44 - vpbroadcastq %x#$mask42,$mask42 - - vpbroadcastq %x#$SS1,$S1 # broadcast 8th power - vpbroadcastq %x#$SS2,$S2 - vpbroadcastq %x#$RR0,$R0 - vpbroadcastq %x#$RR1,$R1 - vpbroadcastq %x#$RR2,$R2 - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - - # at this point 64-bit lanes are ordered as 73625140 - - vpsrlq \$24,$T3,$T2 # splat the data - vporq $PAD,$T2,$T2 - vpaddq $T2,$H2,$H2 # accumulate input - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - sub \$8,$len - jz .Ltail_vpmadd52_8x - jmp .Loop_vpmadd52_8x - -.align 32 -.Loop_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$S1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$S1,$D0hi - vpxorq $D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$S2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$S2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$R0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$R0,$D2hi - - vmovdqu64 16*0($inp),$T2 # load data - vmovdqu64 16*4($inp),$T3 - lea 16*8($inp),$inp - vpmadd52luq $H0,$R0,$D0lo - vpmadd52huq $H0,$R0,$D0hi - vpmadd52luq $H0,$R1,$D1lo - vpmadd52huq $H0,$R1,$D1hi - vpmadd52luq $H0,$R2,$D2lo - vpmadd52huq $H0,$R2,$D2hi - - vpunpcklqdq $T3,$T2,$T1 # transpose data - vpunpckhqdq $T3,$T2,$T3 - vpmadd52luq $H1,$S2,$D0lo - vpmadd52huq $H1,$S2,$D0hi - vpmadd52luq $H1,$R0,$D1lo - vpmadd52huq $H1,$R0,$D1hi - vpmadd52luq $H1,$R1,$D2lo - vpmadd52huq $H1,$R1,$D2hi - - ################################################################ - # partial reduction (interleaved with data splat) - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpsrlq \$24,$T3,$T2 - vporq $PAD,$T2,$T2 - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpandq $mask44,$T1,$T0 - vpsrlq \$44,$T1,$T1 - vpsllq \$20,$T3,$T3 - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vporq $T3,$T1,$T1 - vpandq $mask44,$T1,$T1 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - sub \$8,$len # len-=128 - jnz .Loop_vpmadd52_8x - -.Ltail_vpmadd52_8x: - #vpaddq $T2,$H2,$H2 # accumulate input - vpaddq $T0,$H0,$H0 - vpaddq $T1,$H1,$H1 - - vpxorq $D0lo,$D0lo,$D0lo - vpmadd52luq $H2,$SS1,$D0lo - vpxorq $D0hi,$D0hi,$D0hi - vpmadd52huq $H2,$SS1,$D0hi - vpxorq 
$D1lo,$D1lo,$D1lo - vpmadd52luq $H2,$SS2,$D1lo - vpxorq $D1hi,$D1hi,$D1hi - vpmadd52huq $H2,$SS2,$D1hi - vpxorq $D2lo,$D2lo,$D2lo - vpmadd52luq $H2,$RR0,$D2lo - vpxorq $D2hi,$D2hi,$D2hi - vpmadd52huq $H2,$RR0,$D2hi - - vpmadd52luq $H0,$RR0,$D0lo - vpmadd52huq $H0,$RR0,$D0hi - vpmadd52luq $H0,$RR1,$D1lo - vpmadd52huq $H0,$RR1,$D1hi - vpmadd52luq $H0,$RR2,$D2lo - vpmadd52huq $H0,$RR2,$D2hi - - vpmadd52luq $H1,$SS2,$D0lo - vpmadd52huq $H1,$SS2,$D0hi - vpmadd52luq $H1,$RR0,$D1lo - vpmadd52huq $H1,$RR0,$D1hi - vpmadd52luq $H1,$RR1,$D2lo - vpmadd52huq $H1,$RR1,$D2hi - - ################################################################ - # horizontal addition - - mov \$1,%eax - kmovw %eax,%k1 - vpsrldq \$8,$D0lo,$T0 - vpsrldq \$8,$D0hi,$H0 - vpsrldq \$8,$D1lo,$T1 - vpsrldq \$8,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpsrldq \$8,$D2lo,$T2 - vpsrldq \$8,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vpermq \$0x2,$D0lo,$T0 - vpermq \$0x2,$D0hi,$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vpermq \$0x2,$D1lo,$T1 - vpermq \$0x2,$D1hi,$H1 - vpaddq $T0,$D0lo,$D0lo - vpaddq $H0,$D0hi,$D0hi - vpermq \$0x2,$D2lo,$T2 - vpermq \$0x2,$D2hi,$H2 - vpaddq $T1,$D1lo,$D1lo - vpaddq $H1,$D1hi,$D1hi - vextracti64x4 \$1,$D0lo,%y#$T0 - vextracti64x4 \$1,$D0hi,%y#$H0 - vpaddq $T2,$D2lo,$D2lo - vpaddq $H2,$D2hi,$D2hi - - vextracti64x4 \$1,$D1lo,%y#$T1 - vextracti64x4 \$1,$D1hi,%y#$H1 - vextracti64x4 \$1,$D2lo,%y#$T2 - vextracti64x4 \$1,$D2hi,%y#$H2 -___ -######## switch back to %ymm -map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); -map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); -map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); - -$code.=<<___; - vpaddq $T0,$D0lo,${D0lo}{%k1}{z} - vpaddq $H0,$D0hi,${D0hi}{%k1}{z} - vpaddq $T1,$D1lo,${D1lo}{%k1}{z} - vpaddq $H1,$D1hi,${D1hi}{%k1}{z} - vpaddq $T2,$D2lo,${D2lo}{%k1}{z} - vpaddq $H2,$D2hi,${D2hi}{%k1}{z} - - ################################################################ - # partial reduction - vpsrlq \$44,$D0lo,$tmp - vpsllq \$8,$D0hi,$D0hi - vpandq $mask44,$D0lo,$H0 - vpaddq $tmp,$D0hi,$D0hi - - vpaddq $D0hi,$D1lo,$D1lo - - vpsrlq \$44,$D1lo,$tmp - vpsllq \$8,$D1hi,$D1hi - vpandq $mask44,$D1lo,$H1 - vpaddq $tmp,$D1hi,$D1hi - - vpaddq $D1hi,$D2lo,$D2lo - - vpsrlq \$42,$D2lo,$tmp - vpsllq \$10,$D2hi,$D2hi - vpandq $mask42,$D2lo,$H2 - vpaddq $tmp,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - vpsllq \$2,$D2hi,$D2hi - - vpaddq $D2hi,$H0,$H0 - - vpsrlq \$44,$H0,$tmp # additional step - vpandq $mask44,$H0,$H0 - - vpaddq $tmp,$H1,$H1 - - ################################################################ - - vmovq %x#$H0,0($ctx) - vmovq %x#$H1,8($ctx) - vmovq %x#$H2,16($ctx) - vzeroall - -.Lno_data_vpmadd52_8x: - RET -.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x -___ -} -$code.=<<___; -.type poly1305_emit_base2_44,\@function,3 -.align 32 -poly1305_emit_base2_44: - mov 0($ctx),%r8 # load hash value - mov 8($ctx),%r9 - mov 16($ctx),%r10 - - mov %r9,%rax - shr \$20,%r9 - shl \$44,%rax - mov %r10,%rcx - shr \$40,%r10 - shl \$24,%rcx - - add %rax,%r8 - adc %rcx,%r9 - adc \$0,%r10 - - mov %r8,%rax - add \$5,%r8 # compare to modulus - mov %r9,%rcx - adc \$0,%r9 - adc \$0,%r10 - shr \$2,%r10 # did 130-bit value overflow? 
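poly1305_emit_base2_44 above is the final step for the base-2^44 code: it packs the three limbs back into a 130-bit value spread over three registers, conditionally reduces modulo p = 2^130 - 5 by adding 5 and keeping the sum only if it carried past bit 129, and then adds the 128-bit "s" value ($nonce). A C sketch of the same computation (the prototype and the uint64_t nonce/mac layout are illustrative; it is not the kernel's code):

    #include <stdint.h>

    static void emit_base2_44_sketch(const uint64_t h44[3],
                                     const uint64_t nonce[2], uint64_t mac[2])
    {
        unsigned __int128 t;
        uint64_t d0, d1, d2, g0, g1, g2;

        /* 44+44+42-bit limbs -> 64+64+2-bit words */
        t  = (unsigned __int128)h44[0] + ((unsigned __int128)h44[1] << 44);
        d0 = (uint64_t)t;
        t  = (t >> 64) + ((unsigned __int128)h44[2] << 24);  /* bit 88 = bit 24 of word 1 */
        d1 = (uint64_t)t;
        d2 = (uint64_t)(t >> 64);

        /* compare against p by adding 5 and checking the carry past bit 129 */
        t  = (unsigned __int128)d0 + 5;
        g0 = (uint64_t)t;
        t  = (t >> 64) + d1;
        g1 = (uint64_t)t;
        g2 = d2 + (uint64_t)(t >> 64);

        if (g2 >> 2) {          /* h >= p: keep h + 5 - 2^130 (mod 2^128) */
            d0 = g0;
            d1 = g1;
        }

        /* add the 128-bit nonce/"s" value and emit the tag */
        t      = (unsigned __int128)d0 + nonce[0];
        mac[0] = (uint64_t)t;
        mac[1] = d1 + nonce[1] + (uint64_t)(t >> 64);
    }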
- cmovnz %r8,%rax - cmovnz %r9,%rcx - - add 0($nonce),%rax # accumulate nonce - adc 8($nonce),%rcx - mov %rax,0($mac) # write result - mov %rcx,8($mac) - - RET -.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 -___ -} } } -} - -if (!$kernel) -{ # chacha20-poly1305 helpers -my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order - ("%rdi","%rsi","%rdx","%rcx"); # Unix order -$code.=<<___; -.globl xor128_encrypt_n_pad -.type xor128_encrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_encrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_enc - nop -.Loop_enc_xmm: - movdqu ($inp,$otp),%xmm0 - pxor ($otp),%xmm0 - movdqu %xmm0,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_enc_xmm - - and \$15,%r10 # len % 16 - jz .Ldone_enc - -.Ltail_enc: - mov \$16,$len - sub %r10,$len - xor %eax,%eax -.Loop_enc_byte: - mov ($inp,$otp),%al - xor ($otp),%al - mov %al,($out,$otp) - mov %al,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_enc_byte - - xor %eax,%eax -.Loop_enc_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_enc_pad - -.Ldone_enc: - mov $otp,%rax - RET -.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad - -.globl xor128_decrypt_n_pad -.type xor128_decrypt_n_pad,\@abi-omnipotent -.align 16 -xor128_decrypt_n_pad: - sub $otp,$inp - sub $otp,$out - mov $len,%r10 # put len aside - shr \$4,$len # len / 16 - jz .Ltail_dec - nop -.Loop_dec_xmm: - movdqu ($inp,$otp),%xmm0 - movdqa ($otp),%xmm1 - pxor %xmm0,%xmm1 - movdqu %xmm1,($out,$otp) - movdqa %xmm0,($otp) - lea 16($otp),$otp - dec $len - jnz .Loop_dec_xmm - - pxor %xmm1,%xmm1 - and \$15,%r10 # len % 16 - jz .Ldone_dec - -.Ltail_dec: - mov \$16,$len - sub %r10,$len - xor %eax,%eax - xor %r11d,%r11d -.Loop_dec_byte: - mov ($inp,$otp),%r11b - mov ($otp),%al - xor %r11b,%al - mov %al,($out,$otp) - mov %r11b,($otp) - lea 1($otp),$otp - dec %r10 - jnz .Loop_dec_byte - - xor %eax,%eax -.Loop_dec_pad: - mov %al,($otp) - lea 1($otp),$otp - dec $len - jnz .Loop_dec_pad - -.Ldone_dec: - mov $otp,%rax - RET -.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad -___ -} - -# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, -# CONTEXT *context,DISPATCHER_CONTEXT *disp) -if ($win64) { -$rec="%rcx"; -$frame="%rdx"; -$context="%r8"; -$disp="%r9"; - -$code.=<<___; -.extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent -.align 16 -se_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lcommon_seh_tail - - lea 48(%rax),%rax - - mov -8(%rax),%rbx - mov -16(%rax),%rbp - mov -24(%rax),%r12 - mov -32(%rax),%r13 - mov -40(%rax),%r14 - mov -48(%rax),%r15 - mov %rbx,144($context) # restore context->Rbx - mov %rbp,160($context) # restore context->Rbp - mov %r12,216($context) # restore context->R12 - mov %r13,224($context) # restore context->R13 - mov %r14,232($context) # restore context->R14 - mov %r15,240($context) # restore context->R14 - - jmp 
.Lcommon_seh_tail -.size se_handler,.-se_handler - -.type avx_handler,\@abi-omnipotent -.align 16 -avx_handler: - push %rsi - push %rdi - push %rbx - push %rbp - push %r12 - push %r13 - push %r14 - push %r15 - pushfq - sub \$64,%rsp - - mov 120($context),%rax # pull context->Rax - mov 248($context),%rbx # pull context->Rip - - mov 8($disp),%rsi # disp->ImageBase - mov 56($disp),%r11 # disp->HandlerData - - mov 0(%r11),%r10d # HandlerData[0] - lea (%rsi,%r10),%r10 # prologue label - cmp %r10,%rbx # context->Rip<prologue label - jb .Lcommon_seh_tail - - mov 152($context),%rax # pull context->Rsp - - mov 4(%r11),%r10d # HandlerData[1] - lea (%rsi,%r10),%r10 # epilogue label - cmp %r10,%rbx # context->Rip>=epilogue label - jae .Lcommon_seh_tail - - mov 208($context),%rax # pull context->R11 - - lea 0x50(%rax),%rsi - lea 0xf8(%rax),%rax - lea 512($context),%rdi # &context.Xmm6 - mov \$20,%ecx - .long 0xa548f3fc # cld; rep movsq - -.Lcommon_seh_tail: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi - - mov 40($disp),%rdi # disp->ContextRecord - mov $context,%rsi # context - mov \$154,%ecx # sizeof(CONTEXT) - .long 0xa548f3fc # cld; rep movsq - - mov $disp,%rsi - xor %ecx,%ecx # arg1, UNW_FLAG_NHANDLER - mov 8(%rsi),%rdx # arg2, disp->ImageBase - mov 0(%rsi),%r8 # arg3, disp->ControlPc - mov 16(%rsi),%r9 # arg4, disp->FunctionEntry - mov 40(%rsi),%r10 # disp->ContextRecord - lea 56(%rsi),%r11 # &disp->HandlerData - lea 24(%rsi),%r12 # &disp->EstablisherFrame - mov %r10,32(%rsp) # arg5 - mov %r11,40(%rsp) # arg6 - mov %r12,48(%rsp) # arg7 - mov %rcx,56(%rsp) # arg8, (NULL) - call *__imp_RtlVirtualUnwind(%rip) - - mov \$1,%eax # ExceptionContinueSearch - add \$64,%rsp - popfq - pop %r15 - pop %r14 - pop %r13 - pop %r12 - pop %rbp - pop %rbx - pop %rdi - pop %rsi - RET -.size avx_handler,.-avx_handler - -.section .pdata -.align 4 - .rva .LSEH_begin_poly1305_init_x86_64 - .rva .LSEH_end_poly1305_init_x86_64 - .rva .LSEH_info_poly1305_init_x86_64 - - .rva .LSEH_begin_poly1305_blocks_x86_64 - .rva .LSEH_end_poly1305_blocks_x86_64 - .rva .LSEH_info_poly1305_blocks_x86_64 - - .rva .LSEH_begin_poly1305_emit_x86_64 - .rva .LSEH_end_poly1305_emit_x86_64 - .rva .LSEH_info_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); - .rva .LSEH_begin_poly1305_blocks_avx - .rva .Lbase2_64_avx - .rva .LSEH_info_poly1305_blocks_avx_1 - - .rva .Lbase2_64_avx - .rva .Leven_avx - .rva .LSEH_info_poly1305_blocks_avx_2 - - .rva .Leven_avx - .rva .LSEH_end_poly1305_blocks_avx - .rva .LSEH_info_poly1305_blocks_avx_3 - - .rva .LSEH_begin_poly1305_emit_avx - .rva .LSEH_end_poly1305_emit_avx - .rva .LSEH_info_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); - .rva .LSEH_begin_poly1305_blocks_avx2 - .rva .Lbase2_64_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_1 - - .rva .Lbase2_64_avx2 - .rva .Leven_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_2 - - .rva .Leven_avx2 - .rva .LSEH_end_poly1305_blocks_avx2 - .rva .LSEH_info_poly1305_blocks_avx2_3 -___ -$code.=<<___ if ($avx>2); - .rva .LSEH_begin_poly1305_blocks_avx512 - .rva .LSEH_end_poly1305_blocks_avx512 - .rva .LSEH_info_poly1305_blocks_avx512 -___ -$code.=<<___; -.section .xdata -.align 8 -.LSEH_info_poly1305_init_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64 - -.LSEH_info_poly1305_blocks_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_body,.Lblocks_epilogue - 
-.LSEH_info_poly1305_emit_x86_64: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64 -___ -$code.=<<___ if ($avx); -.LSEH_info_poly1305_blocks_avx_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] - -.LSEH_info_poly1305_emit_avx: - .byte 9,0,0,0 - .rva se_handler - .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx -___ -$code.=<<___ if ($avx>1); -.LSEH_info_poly1305_blocks_avx2_1: - .byte 9,0,0,0 - .rva se_handler - .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_2: - .byte 9,0,0,0 - .rva se_handler - .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] - -.LSEH_info_poly1305_blocks_avx2_3: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] -___ -$code.=<<___ if ($avx>2); -.LSEH_info_poly1305_blocks_avx512: - .byte 9,0,0,0 - .rva avx_handler - .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] -___ -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach (split('\n',$code)) { - s/\`([^\`]*)\`/eval($1)/ge; - s/%r([a-z]+)#d/%e$1/g; - s/%r([0-9]+)#d/%r$1d/g; - s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; - - if ($kernel) { - s/(^\.type.*),[0-9]+$/\1/; - s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/; - next if /^\.cfi.*/; - } - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c deleted file mode 100644 index 08ff4b489f7e..000000000000 --- a/arch/x86/crypto/poly1305_glue.c +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR MIT -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- */ - -#include <crypto/algapi.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/crypto.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage void poly1305_init_x86_64(void *ctx, - const u8 key[POLY1305_BLOCK_SIZE]); -asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); -asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len, - const u32 padbit); -asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp, - const size_t len, const u32 padbit); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); -static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512); - -struct poly1305_arch_internal { - union { - struct { - u32 h[5]; - u32 is_base2_26; - }; - u64 hs[3]; - }; - u64 r[2]; - u64 pad; - struct { u32 r2, r1, r4, r3; } rn[9]; -}; - -/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit - * the unfortunate situation of using AVX and then having to go back to scalar - * -- because the user is silly and has called the update function from two - * separate contexts -- then we need to convert back to the original base before - * proceeding. It is possible to reason that the initial reduction below is - * sufficient given the implementation invariants. However, for an avoidance of - * doubt and because this is not performance critical, we do the full reduction - * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py - */ -static void convert_to_base2_64(void *ctx) -{ - struct poly1305_arch_internal *state = ctx; - u32 cy; - - if (!state->is_base2_26) - return; - - cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy; - cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy; - cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy; - cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy; - state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0]; - state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12); - state->hs[2] = state->h[4] >> 24; -#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1)) - cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL); - state->hs[2] &= 3; - state->hs[0] += cy; - state->hs[1] += (cy = ULT(state->hs[0], cy)); - state->hs[2] += ULT(state->hs[1], cy); -#undef ULT - state->is_base2_26 = 0; -} - -static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE]) -{ - poly1305_init_x86_64(ctx, key); -} - -static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len, - const u32 padbit) -{ - struct poly1305_arch_internal *state = ctx; - - /* SIMD disables preemption, so relax after processing each page. 
*/ - BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE || - SZ_4K % POLY1305_BLOCK_SIZE); - - if (!static_branch_likely(&poly1305_use_avx) || - (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) || - !crypto_simd_usable()) { - convert_to_base2_64(ctx); - poly1305_blocks_x86_64(ctx, inp, len, padbit); - return; - } - - do { - const size_t bytes = min_t(size_t, len, SZ_4K); - - kernel_fpu_begin(); - if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512)) - poly1305_blocks_avx512(ctx, inp, bytes, padbit); - else if (static_branch_likely(&poly1305_use_avx2)) - poly1305_blocks_avx2(ctx, inp, bytes, padbit); - else - poly1305_blocks_avx(ctx, inp, bytes, padbit); - kernel_fpu_end(); - - len -= bytes; - inp += bytes; - } while (len); -} - -static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE], - const u32 nonce[4]) -{ - if (!static_branch_likely(&poly1305_use_avx)) - poly1305_emit_x86_64(ctx, mac, nonce); - else - poly1305_emit_avx(ctx, mac, nonce); -} - -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_simd_init(&dctx->h, key); - dctx->s[0] = get_unaligned_le32(&key[16]); - dctx->s[1] = get_unaligned_le32(&key[20]); - dctx->s[2] = get_unaligned_le32(&key[24]); - dctx->s[3] = get_unaligned_le32(&key[28]); - dctx->buflen = 0; - dctx->sset = true; -} -EXPORT_SYMBOL(poly1305_init_arch); - -static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx, - const u8 *inp, unsigned int len) -{ - unsigned int acc = 0; - if (unlikely(!dctx->sset)) { - if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) { - poly1305_simd_init(&dctx->h, inp); - inp += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - acc += POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(&inp[0]); - dctx->s[1] = get_unaligned_le32(&inp[4]); - dctx->s[2] = get_unaligned_le32(&inp[8]); - dctx->s[3] = get_unaligned_le32(&inp[12]); - acc += POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - } - return acc; -} - -void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, - unsigned int srclen) -{ - unsigned int bytes, used; - - if (unlikely(dctx->buflen)) { - bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - srclen -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE))) - poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); - dctx->buflen = 0; - } - } - - if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = round_down(srclen, POLY1305_BLOCK_SIZE); - srclen -= bytes; - used = crypto_poly1305_setdctxkey(dctx, src, bytes); - if (likely(bytes - used)) - poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1); - src += bytes; - } - - if (unlikely(srclen)) { - dctx->buflen = srclen; - memcpy(dctx->buf, src, srclen); - } -} -EXPORT_SYMBOL(poly1305_update_arch); - -void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -{ - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } - - poly1305_simd_emit(&dctx->h, dst, dctx->s); - memzero_explicit(dctx, sizeof(*dctx)); -} -EXPORT_SYMBOL(poly1305_final_arch); - -static int crypto_poly1305_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = 
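The ULT() macro in convert_to_base2_64() above is a branchless unsigned "less than": it reconstructs the borrow bit of a - b from the operands' top bits, and that is how the final carries of the base-2^26 to base-2^64 conversion are propagated without any data-dependent branch. A small standalone check of the identity (the test program is mine; only the macro comes from the code above):

    #include <assert.h>
    #include <stdint.h>

    #define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))

    int main(void)
    {
        static const uint64_t v[] = {
            0, 1, 5, ~0ULL, ~0ULL - 1, 1ULL << 63, (1ULL << 63) - 1
        };
        unsigned int i, j;

        /* ULT(a, b) should agree with (a < b) for every pair */
        for (i = 0; i < sizeof(v) / sizeof(v[0]); i++)
            for (j = 0; j < sizeof(v) / sizeof(v[0]); j++)
                assert(ULT(v[i], v[j]) == (uint64_t)(v[i] < v[j]));
        return 0;
    }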
shash_desc_ctx(desc); - - *dctx = (struct poly1305_desc_ctx){}; - return 0; -} - -static int crypto_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - poly1305_update_arch(dctx, src, srclen); - return 0; -} - -static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); - return 0; -} - -static struct shash_alg alg = { - .digestsize = POLY1305_DIGEST_SIZE, - .init = crypto_poly1305_init, - .update = crypto_poly1305_update, - .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_desc_ctx), - .base = { - .cra_name = "poly1305", - .cra_driver_name = "poly1305-simd", - .cra_priority = 300, - .cra_blocksize = POLY1305_BLOCK_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -static int __init poly1305_simd_mod_init(void) -{ - if (boot_cpu_has(X86_FEATURE_AVX) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx); - if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) - static_branch_enable(&poly1305_use_avx2); - if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) && - /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */ - boot_cpu_data.x86_vfm != INTEL_SKYLAKE_X) - static_branch_enable(&poly1305_use_avx512); - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0; -} - -static void __exit poly1305_simd_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) - crypto_unregister_shash(&alg); -} - -module_init(poly1305_simd_mod_init); -module_exit(poly1305_simd_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>"); -MODULE_DESCRIPTION("Poly1305 authenticator"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-simd"); diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c index 8fa58b0f3cb3..6b466867f91a 100644 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ b/arch/x86/crypto/polyval-clmulni_glue.c @@ -16,16 +16,15 @@ * operations. 
*/ -#include <crypto/algapi.h> +#include <asm/cpu_device_id.h> +#include <asm/fpu/api.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/polyval.h> -#include <linux/crypto.h> -#include <linux/init.h> +#include <crypto/utils.h> +#include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> +#include <linux/string.h> #define POLYVAL_ALIGN 16 #define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) @@ -42,7 +41,6 @@ struct polyval_tfm_ctx { struct polyval_desc_ctx { u8 buffer[POLYVAL_BLOCK_SIZE]; - u32 bytes; }; asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, @@ -57,25 +55,16 @@ static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { - if (likely(crypto_simd_usable())) { - kernel_fpu_begin(); - clmul_polyval_update(keys, in, nblocks, accumulator); - kernel_fpu_end(); - } else { - polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, - nblocks, accumulator); - } + kernel_fpu_begin(); + clmul_polyval_update(keys, in, nblocks, accumulator); + kernel_fpu_end(); } static void internal_polyval_mul(u8 *op1, const u8 *op2) { - if (likely(crypto_simd_usable())) { - kernel_fpu_begin(); - clmul_polyval_mul(op1, op2); - kernel_fpu_end(); - } else { - polyval_mul_non4k(op1, op2); - } + kernel_fpu_begin(); + clmul_polyval_mul(op1, op2); + kernel_fpu_end(); } static int polyval_x86_setkey(struct crypto_shash *tfm, @@ -112,49 +101,27 @@ static int polyval_x86_update(struct shash_desc *desc, { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - u8 *pos; unsigned int nblocks; - unsigned int n; - - if (dctx->bytes) { - n = min(srclen, dctx->bytes); - pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; - - dctx->bytes -= n; - srclen -= n; - - while (n--) - *pos++ ^= *src++; - if (!dctx->bytes) - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - while (srclen >= POLYVAL_BLOCK_SIZE) { + do { /* Allow rescheduling every 4K bytes. 
*/ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; internal_polyval_update(tctx, src, nblocks, dctx->buffer); srclen -= nblocks * POLYVAL_BLOCK_SIZE; src += nblocks * POLYVAL_BLOCK_SIZE; - } + } while (srclen >= POLYVAL_BLOCK_SIZE); - if (srclen) { - dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; - pos = dctx->buffer; - while (srclen--) - *pos++ ^= *src++; - } - - return 0; + return srclen; } -static int polyval_x86_final(struct shash_desc *desc, u8 *dst) +static int polyval_x86_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - if (dctx->bytes) { + if (len) { + crypto_xor(dctx->buffer, src, len); internal_polyval_mul(dctx->buffer, tctx->key_powers[NUM_KEY_POWERS-1]); } @@ -168,13 +135,14 @@ static struct shash_alg polyval_alg = { .digestsize = POLYVAL_DIGEST_SIZE, .init = polyval_x86_init, .update = polyval_x86_update, - .final = polyval_x86_final, + .finup = polyval_x86_finup, .setkey = polyval_x86_setkey, .descsize = sizeof(struct polyval_desc_ctx), .base = { .cra_name = "polyval", .cra_driver_name = "polyval-clmulni", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = POLYVAL_BLOCK_SIZE, .cra_ctxsize = POLYVAL_CTX_SIZE, .cra_module = THIS_MODULE, diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 347e97f4b713..f5f2121b7956 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -10,7 +10,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" @@ -65,10 +64,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-avx2", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-avx2", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -78,10 +76,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-avx2", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-avx2", .base.cra_priority = 600, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -94,8 +91,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_avx2_init(void) { const char *feature_name; @@ -110,15 +105,13 @@ static int __init serpent_avx2_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_avx2_fini(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_avx2_init); diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 6c248e1ea4ef..e640abc1cb8a 100644 --- 
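With the conversion to CRYPTO_AHASH_ALG_BLOCK_ONLY above, polyval_x86_update() no longer buffers partial blocks itself: it consumes whole 16-byte blocks, at most 4 KiB per kernel_fpu_begin()/kernel_fpu_end() section so the scheduler gets a chance to run, and returns the number of unconsumed tail bytes, which are then expected to reach polyval_x86_finup() together with any final partial data (the sha1_ssse3 glue further down follows the same convention via sha1_base_do_update_blocks()/sha1_base_do_finup()). A caller-side sketch of just that chunking arithmetic (hypothetical names, not kernel code):

    #include <stddef.h>

    #define BLOCK  16      /* POLYVAL_BLOCK_SIZE */
    #define CHUNK  4096U   /* reschedule point, as in the loop above */

    /* Returns the 0..BLOCK-1 leftover bytes the caller is expected to
     * keep and hand to ->finup() later. */
    static size_t consume_blocks(size_t srclen)
    {
        do {
            size_t nblocks = (srclen < CHUNK ? srclen : CHUNK) / BLOCK;

            /* ...kernel_fpu_begin(); process nblocks; kernel_fpu_end()... */
            srclen -= nblocks * BLOCK;
        } while (srclen >= BLOCK);

        return srclen;
    }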
a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -13,7 +13,6 @@ #include <linux/crypto.h> #include <linux/err.h> #include <crypto/algapi.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-avx.h" @@ -71,10 +70,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-avx", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-avx", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -84,10 +82,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-avx", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-avx", .base.cra_priority = 500, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -100,8 +97,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_init(void) { const char *feature_name; @@ -112,15 +107,13 @@ static int __init serpent_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_exit(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_init); diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index d78f37e9b2cf..80ee17ec21b4 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -18,7 +18,6 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/b128ops.h> -#include <crypto/internal/simd.h> #include <crypto/serpent.h> #include "serpent-sse2.h" @@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req) static struct skcipher_alg serpent_algs[] = { { - .base.cra_name = "__ecb(serpent)", - .base.cra_driver_name = "__ecb-serpent-sse2", + .base.cra_name = "ecb(serpent)", + .base.cra_driver_name = "ecb-serpent-sse2", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -87,10 +85,9 @@ static struct skcipher_alg serpent_algs[] = { .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { - .base.cra_name = "__cbc(serpent)", - .base.cra_driver_name = "__cbc-serpent-sse2", + .base.cra_name = "cbc(serpent)", + .base.cra_driver_name = "cbc-serpent-sse2", .base.cra_priority = 400, - .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = SERPENT_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct serpent_ctx), .base.cra_module = THIS_MODULE, @@ -103,8 +100,6 @@ static struct skcipher_alg serpent_algs[] = { }, }; -static struct simd_skcipher_alg *serpent_simd_algs[ARRAY_SIZE(serpent_algs)]; - static int __init serpent_sse2_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2)) { @@ -112,15 +107,13 @@ static int __init 
serpent_sse2_init(void) return -ENODEV; } - return simd_register_skciphers_compat(serpent_algs, - ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + return crypto_register_skciphers(serpent_algs, + ARRAY_SIZE(serpent_algs)); } static void __exit serpent_sse2_exit(void) { - simd_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs), - serpent_simd_algs); + crypto_unregister_skciphers(serpent_algs, ARRAY_SIZE(serpent_algs)); } module_init(serpent_sse2_init); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index ab8bc54f254d..0a912bfc86c5 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -16,21 +16,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/cpu_device_id.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/module.h> static const struct x86_cpu_id module_cpu_ids[] = { -#ifdef CONFIG_AS_SHA1_NI X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), -#endif X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), @@ -38,14 +34,10 @@ static const struct x86_cpu_id module_cpu_ids[] = { }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); -static int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha1_block_fn *sha1_xform) +static inline int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len, sha1_block_fn *sha1_xform) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) - return crypto_sha1_update(desc, data, len); + int remain; /* * Make sure struct sha1_state begins directly with the SHA1 @@ -54,22 +46,18 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); kernel_fpu_begin(); - sha1_base_do_update(desc, data, len, sha1_xform); + remain = sha1_base_do_update_blocks(desc, data, len, sha1_xform); kernel_fpu_end(); - return 0; + return remain; } -static int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha1_block_fn *sha1_xform) +static inline int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out, + sha1_block_fn *sha1_xform) { - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, data, len, out); - kernel_fpu_begin(); - if (len) - sha1_base_do_update(desc, data, len, sha1_xform); - sha1_base_do_finalize(desc, sha1_xform); + sha1_base_do_finup(desc, data, len, sha1_xform); kernel_fpu_end(); return sha1_base_finish(desc, out); @@ -90,23 +78,17 @@ static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_transform_ssse3); } -/* Add padding and return the message digest. 
*/ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ssse3_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_ssse3_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ssse3_update, - .final = sha1_ssse3_final, .finup = sha1_ssse3_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ssse3", .cra_priority = 150, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -140,22 +122,17 @@ static int sha1_avx_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_transform_avx); } -static int sha1_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha1_avx_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_avx_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_avx_update, - .final = sha1_avx_final, .finup = sha1_avx_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-avx", .cra_priority = 160, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -200,8 +177,8 @@ static bool avx2_usable(void) return false; } -static void sha1_apply_transform_avx2(struct sha1_state *state, - const u8 *data, int blocks) +static inline void sha1_apply_transform_avx2(struct sha1_state *state, + const u8 *data, int blocks) { /* Select the optimal transform based on data block size */ if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE) @@ -222,22 +199,17 @@ static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2); } -static int sha1_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha1_avx2_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_avx2_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_avx2_update, - .final = sha1_avx2_final, .finup = sha1_avx2_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-avx2", .cra_priority = 170, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -256,7 +228,6 @@ static void unregister_sha1_avx2(void) crypto_unregister_shash(&sha1_avx2_alg); } -#ifdef CONFIG_AS_SHA1_NI asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data, int rounds); @@ -272,22 +243,17 @@ static int sha1_ni_finup(struct shash_desc *desc, const u8 *data, return sha1_finup(desc, data, len, out, sha1_ni_transform); } -static int sha1_ni_final(struct shash_desc *desc, u8 *out) -{ - return sha1_ni_finup(desc, NULL, 0, out); -} - static struct shash_alg sha1_ni_alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_base_init, .update = sha1_ni_update, - .final = sha1_ni_final, .finup = sha1_ni_finup, - .descsize = sizeof(struct sha1_state), + .descsize = SHA1_STATE_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ni", .cra_priority = 250, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } @@ -306,11 +272,6 @@ static void unregister_sha1_ni(void) crypto_unregister_shash(&sha1_ni_alg); } -#else -static inline int register_sha1_ni(void) { return 0; } -static inline void unregister_sha1_ni(void) { } -#endif - static int __init sha1_ssse3_mod_init(void) { if 
(!x86_match_cpu(module_cpu_ids)) @@ -360,6 +321,4 @@ MODULE_ALIAS_CRYPTO("sha1"); MODULE_ALIAS_CRYPTO("sha1-ssse3"); MODULE_ALIAS_CRYPTO("sha1-avx"); MODULE_ALIAS_CRYPTO("sha1-avx2"); -#ifdef CONFIG_AS_SHA1_NI MODULE_ALIAS_CRYPTO("sha1-ni"); -#endif diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S deleted file mode 100644 index 53de72bdd851..000000000000 --- a/arch/x86/crypto/sha256-avx-asm.S +++ /dev/null @@ -1,499 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX1 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. 
-# -######################################################################## -# This code schedules 1 block at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - - -.macro MY_ROR p1 p2 - shld $(32-(\p1)), \p2, \p2 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - VMOVDQ \p2, \p1 - vpshufb \p3, \p1, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 -XTMP5 = %xmm11 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm13 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpsrld $7, XTMP1, XTMP2 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpslld $(32-7), XTMP1, XTMP3 - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (22-13), y1 # y1 = a >> (22-13) - vpsrld $18, XTMP1, XTMP2 # - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e 
>> (25-6)) - xor g, y2 # y2 = f^g - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpslld $(32-18), XTMP1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP1, XTMP3, XTMP3 # - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - vpxor XTMP2, XTMP3, XTMP3 # XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - mov a, y1 # y1 = a - MY_ROR (25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xBxA} - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - vpxor XTMP3, XTMP2, XTMP2 # - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - MY_ROR (22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] MY_ROR 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] MY_ROR 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - vpxor XTMP3, XTMP2, XTMP2 - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + 
w + S1 + CH - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - MY_ROR (25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - MY_ROR (22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - MY_ROR (11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - MY_ROR (13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - MY_ROR 6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - MY_ROR 2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER # - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks) -## arg 1 : pointer to state -## arg 2 : pointer to input data -## arg 3 : Num blocks -######################################################################## -.text -SYM_TYPED_FUNC_START(sha256_transform_avx) - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - movq %rsp, %rbp - - subq $STACK_SIZE, %rsp # allocate stack space - and $~15, %rsp # align stack pointer - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS # pointer to end of data - mov NUM_BLKS, _INP_END(%rsp) - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 -.Lloop0: - lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 1*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 2*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - vpaddd 3*16(TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - vpaddd (TBL), X0, XFER - vmovdqa XFER, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 
2 - DO_ROUND 3 - - vpaddd 1*16(TBL), X1, XFER - vmovdqa XFER, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - vmovdqa X2, X0 - vmovdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - RET -SYM_FUNC_END(sha256_transform_avx) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S deleted file mode 100644 index 0bbec1c75cd0..000000000000 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ /dev/null @@ -1,774 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with AVX2 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. -# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## -# This code schedules 2 blocks at a time, with 4 lanes per block -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -## assume buffers not aligned -#define VMOVDQ vmovdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -X0 = %ymm4 -X1 = %ymm5 -X2 = %ymm6 -X3 = %ymm7 - -# XMM versions of above -XWORD0 = %xmm4 -XWORD1 = %xmm5 -XWORD2 = %xmm6 -XWORD3 = %xmm7 - -XTMP0 = %ymm0 -XTMP1 = %ymm1 -XTMP2 = %ymm2 -XTMP3 = %ymm3 -XTMP4 = %ymm8 -XFER = %ymm9 -XTMP5 = %ymm11 - -SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %ymm13 - -X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg -c = %ecx -d = %r8d -e = %edx # clobbers NUM_BLKS -y3 = %esi # clobbers INP - -SRND = CTX # SRND is same register as CTX - -a = %eax -b = %ebx -f = %r9d -g = %r10d -h = %r11d -old_h = %r11d - -T1 = %r12d -y0 = %r13d -y1 = %r14d -y2 = %r15d - - -_XFER_SIZE = 2*64*4 # 2 blocks, 64 rounds, 4 bytes/round -_XMM_SAVE_SIZE = 0 -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_CTX_SIZE = 8 - -_XFER = 0 -_XMM_SAVE = _XFER + _XFER_SIZE -_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE -_INP = _INP_END + _INP_END_SIZE -_CTX = _INP + _INP_SIZE -STACK_SIZE = _CTX + _CTX_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs - X_ = X0 - X0 = X1 - X1 = X2 - X2 = X3 - X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS - old_h = h - TMP_ = h - h = g - g = f - f = e - e = d - d = c - c = b - b = a - a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED disp -################################### RND N + 0 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7] - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]# y1 = (e >> 6)# S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - - and e, y2 # y2 = (f^g)&e # CH - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add h, d # d = k + w + h + d # -- - - and b, y3 # y3 = (a|c)&b # MAJA - vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15] - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - vpsrld $7, XTMP1, XTMP2 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - - add y0, y2 # y2 
= S1 + CH # -- - vpslld $(32-7), XTMP1, XTMP3 - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 - - vpsrld $18, XTMP1, XTMP2 - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 1 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 1*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3 - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - - vpslld $(32-18), XTMP1, XTMP1 - and b, y3 # y3 = (a|c)&b # MAJA - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - - vpxor XTMP1, XTMP3, XTMP3 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP4, XTMP3, XTMP1 # XTMP1 = s0 - vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - - - ROTATE_ARGS - -################################### RND N + 2 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - offset = \disp + 2*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - rorx $11, e, y1 # y1 = e >> 11 # S1B - or c, y3 # y3 = a|c # MAJA - mov f, y2 # y2 = f # CH - xor g, y2 # y2 = f^g # CH - - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - and e, y2 # y2 = (f^g)&e # CH - - rorx $6, e, y1 # y1 = (e >> 6) # S1 - vpxor XTMP3, XTMP2, XTMP2 - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $22, a, y1 # y1 = a >> 22 # S0A - vpxor XTMP2, XTMP4, XTMP4 # XTMP4 = s1 {xBxA} - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a ,T1 # T1 = (a >> 2) # S0 - vpaddd XTMP4, XTMP0, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC} - - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1,h # h = k + w + h + S0 # -- - add y2,d # d = k + w + h + d + S1 + CH = d + t1 # -- - add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3,h # h = t1 + S0 + MAJ # -- - - - ROTATE_ARGS - -################################### RND N + 
3 ############################ - - mov a, y3 # y3 = a # MAJA - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - offset = \disp + 3*4 - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - - vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC} - mov f, y2 # y2 = f # CH - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - xor g, y2 # y2 = f^g # CH - - - vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add h, d # d = k + w + h + d # -- - and b, y3 # y3 = (a|c)&b # MAJA - - vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - - vpxor XTMP3, XTMP2, XTMP2 - rorx $22, a, y1 # y1 = a >> 22 # S0A - add y0, y2 # y2 = S1 + CH # -- - - vpxor XTMP2, XTMP5, XTMP5 # XTMP5 = s1 {xDxC} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - rorx $2, a, T1 # T1 = (a >> 2) # S0 - vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00} - - vpaddd XTMP0, XTMP5, X0 # X0 = {W[3], W[2], W[1], W[0]} - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and c, T1 # T1 = a&c # MAJB - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - - add y1, h # h = k + w + h + S0 # -- - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - rotate_Xs -.endm - -.macro DO_4ROUNDS disp -################################### RND N + 0 ########################### - - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - addl \disp(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 1 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*1 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 
= a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 2 ############################## - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*2 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - ROTATE_ARGS - -################################### RND N + 3 ########################### - - add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - mov f, y2 # y2 = f # CH - rorx $25, e, y0 # y0 = e >> 25 # S1A - rorx $11, e, y1 # y1 = e >> 11 # S1B - xor g, y2 # y2 = f^g # CH - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) # S1 - rorx $6, e, y1 # y1 = (e >> 6) # S1 - and e, y2 # y2 = (f^g)&e # CH - add y3, old_h # h = t1 + S0 + MAJ # -- - - xor y1, y0 # y0 = (e>>25) ^ (e>>11) ^ (e>>6) # S1 - rorx $13, a, T1 # T1 = a >> 13 # S0B - xor g, y2 # y2 = CH = ((f^g)&e)^g # CH - rorx $22, a, y1 # y1 = a >> 22 # S0A - mov a, y3 # y3 = a # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0 - rorx $2, a, T1 # T1 = (a >> 2) # S0 - offset = 4*3 + \disp - addl offset(%rsp, SRND), h # h = k + w + h # -- - or c, y3 # y3 = a|c # MAJA - - xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0 - mov a, T1 # T1 = a # MAJB - and b, y3 # y3 = (a|c)&b # MAJA - and c, T1 # T1 = a&c # MAJB - add y0, y2 # y2 = S1 + CH # -- - - - add h, d # d = k + w + h + d # -- - or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ - add y1, h # h = k + w + h + S0 # -- - - add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- - - - add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- - - add y3, h # h = t1 + S0 + MAJ # -- - - ROTATE_ARGS - -.endm - -######################################################################## -## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks) -## arg 1 : pointer to state -## arg 2 : pointer to input data -## arg 3 : Num blocks -######################################################################## -.text -SYM_TYPED_FUNC_START(sha256_transform_rorx) - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - push %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $-32, %rsp # align rsp to 32 byte boundary - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block - mov NUM_BLKS, _INP_END(%rsp) - - cmp NUM_BLKS, INP - je .Lonly_one_block - - ## load initial digest - mov (CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c 
- mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - -.Lloop0: - ## Load first 16 dwords from two blocks - VMOVDQ 0*32(INP),XTMP0 - VMOVDQ 1*32(INP),XTMP1 - VMOVDQ 2*32(INP),XTMP2 - VMOVDQ 3*32(INP),XTMP3 - - ## byte swap data - vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0 - vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1 - vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2 - vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3 - - ## transpose data into high/low halves - vperm2i128 $0x20, XTMP2, XTMP0, X0 - vperm2i128 $0x31, XTMP2, XTMP0, X1 - vperm2i128 $0x20, XTMP3, XTMP1, X2 - vperm2i128 $0x31, XTMP3, XTMP1, X3 - -.Llast_block_enter: - add $64, INP - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 12 each - xor SRND, SRND - -.align 16 -.Lloop1: - leaq K256+0*32(%rip), INP ## reuse INP as scratch reg - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 1*32) - - leaq K256+2*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 2*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 2*32) - - leaq K256+3*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 3*32+_XFER(%rsp, SRND) - FOUR_ROUNDS_AND_SCHED (_XFER + 3*32) - - add $4*32, SRND - cmp $3*4*32, SRND - jb .Lloop1 - -.Lloop2: - ## Do last 16 rounds with no scheduling - leaq K256+0*32(%rip), INP - vpaddd (INP, SRND), X0, XFER - vmovdqa XFER, 0*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 0*32) - - leaq K256+1*32(%rip), INP - vpaddd (INP, SRND), X1, XFER - vmovdqa XFER, 1*32+_XFER(%rsp, SRND) - DO_4ROUNDS (_XFER + 1*32) - add $2*32, SRND - - vmovdqa X2, X0 - vmovdqa X3, X1 - - cmp $4*4*32, SRND - jb .Lloop2 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - ja .Ldone_hash - - #### Do second block using previously scheduled results - xor SRND, SRND -.align 16 -.Lloop3: - DO_4ROUNDS (_XFER + 0*32 + 16) - DO_4ROUNDS (_XFER + 1*32 + 16) - add $2*32, SRND - cmp $4*4*32, SRND - jb .Lloop3 - - mov _CTX(%rsp), CTX - mov _INP(%rsp), INP - add $64, INP - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - cmp _INP_END(%rsp), INP - jb .Lloop0 - ja .Ldone_hash - -.Ldo_last_block: - VMOVDQ 0*16(INP),XWORD0 - VMOVDQ 1*16(INP),XWORD1 - VMOVDQ 2*16(INP),XWORD2 - VMOVDQ 3*16(INP),XWORD3 - - vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0 - vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1 - vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2 - vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3 - - jmp .Llast_block_enter - -.Lonly_one_block: - - ## load initial digest - mov (4*0)(CTX),a - mov (4*1)(CTX),b - mov (4*2)(CTX),c - mov (4*3)(CTX),d - mov (4*4)(CTX),e - mov (4*5)(CTX),f - mov (4*6)(CTX),g - mov (4*7)(CTX),h - - vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - vmovdqa _SHUF_00BA(%rip), SHUF_00BA - vmovdqa _SHUF_DC00(%rip), SHUF_DC00 - - mov CTX, _CTX(%rsp) - jmp .Ldo_last_block - -.Ldone_hash: - - mov %rbp, %rsp - pop %rbp - - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - vzeroupper - RET 
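The FOUR_ROUNDS_AND_SCHED and DO_4ROUNDS macros deleted above spread the SHA-256 round math across rorx/and/xor sequences annotated S0, S1, CH and MAJ. As a reading aid only (this is not kernel code), here is a minimal C restatement of those functions and of one round; the assembly's ((f^g)&e)^g and ((a|c)&b)|(a&c) forms are bitwise-equivalent to the textbook choose/majority functions used below.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

/* "S0"/"S1" in the assembly comments (big sigma). */
static inline uint32_t Sigma0(uint32_t a) { return ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); }
static inline uint32_t Sigma1(uint32_t e) { return ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); }

/* "CH" and "MAJ"; the asm computes ((f^g)&e)^g and ((a|c)&b)|(a&c),
 * which reduce to these forms. */
static inline uint32_t ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
static inline uint32_t maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

/* "s0"/"s1" of the message schedule (the vpsrld/vpslld/vpsrlq blocks). */
static inline uint32_t sigma0(uint32_t w) { return ror32(w, 7) ^ ror32(w, 18) ^ (w >> 3); }
static inline uint32_t sigma1(uint32_t w) { return ror32(w, 17) ^ ror32(w, 19) ^ (w >> 10); }

/* One round over the working variables s[0..7] = a..h; k_plus_w is the
 * precomputed K[t] + W[t] value the macros read back from the stack. */
static inline void sha256_round(uint32_t s[8], uint32_t k_plus_w)
{
	uint32_t t1 = s[7] + Sigma1(s[4]) + ch(s[4], s[5], s[6]) + k_plus_w;
	uint32_t t2 = Sigma0(s[0]) + maj(s[0], s[1], s[2]);

	s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
	s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
	s[0] = t1 + t2;
}

The ROTATE_ARGS macro performs the same a..h rotation by renaming registers between rounds rather than moving data.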
-SYM_FUNC_END(sha256_transform_rorx) - -.section .rodata.cst512.K256, "aM", @progbits, 512 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 -.align 32 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203 - -# shuffle xBxA -> 00BA -.section .rodata.cst32._SHUF_00BA, "aM", @progbits, 32 -.align 32 -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100 - -# shuffle xDxC -> DC00 -.section .rodata.cst32._SHUF_DC00, "aM", @progbits, 32 -.align 32 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S deleted file mode 100644 index 93264ee44543..000000000000 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ /dev/null @@ -1,513 +0,0 @@ -######################################################################## -# Implement fast SHA-256 with SSSE3 instructions. (x86_64) -# -# Copyright (C) 2013 Intel Corporation. -# -# Authors: -# James Guilford <james.guilford@intel.com> -# Kirk Yap <kirk.s.yap@intel.com> -# Tim Chen <tim.c.chen@linux.intel.com> -# -# This software is available to you under a choice of one of two -# licenses. You may choose to be licensed under the terms of the GNU -# General Public License (GPL) Version 2, available from the file -# COPYING in the main directory of this source tree, or the -# OpenIB.org BSD license below: -# -# Redistribution and use in source and binary forms, with or -# without modification, are permitted provided that the following -# conditions are met: -# -# - Redistributions of source code must retain the above -# copyright notice, this list of conditions and the following -# disclaimer. 
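One detail worth noting in the AVX2 constant table just deleted: K256 there is 512 bytes, with every row of four constants stored twice, so a single 256-bit vpaddd adds the same round constants to the corresponding message words of both interleaved blocks ("2 blocks at a time, with 4 lanes per block", per the file header). A sketch of how such a doubled layout relates to the canonical 64-entry table; the one-copy K256 declared here is an assumption for illustration, not a kernel symbol.

#include <stdint.h>
#include <string.h>

extern const uint32_t K256[64];	/* canonical single-copy table; assumed, not shown */

/* Produce the 128-entry layout used by the deleted sha256-avx2-asm.S:
 * each group of four constants appears in both 128-bit lanes of the
 * 256-bit row that vpaddd loads. */
static void build_doubled_k256(uint32_t out[128])
{
	int i;

	for (i = 0; i < 64; i += 4) {
		memcpy(&out[2 * i],     &K256[i], 4 * sizeof(out[0]));
		memcpy(&out[2 * i + 4], &K256[i], 4 * sizeof(out[0]));
	}
}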
-# -# - Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials -# provided with the distribution. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -######################################################################## -# -# This code is described in an Intel White-Paper: -# "Fast SHA-256 Implementations on Intel Architecture Processors" -# -# To find it, surf to http://www.intel.com/p/en_US/embedded -# and search for that title. -# -######################################################################## - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -## assume buffers not aligned -#define MOVDQ movdqu - -################################ Define Macros - -# addm [mem], reg -# Add reg to mem using reg-mem add and store -.macro addm p1 p2 - add \p1, \p2 - mov \p2, \p1 -.endm - -################################ - -# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask -# Load xmm with mem and byte swap each dword -.macro COPY_XMM_AND_BSWAP p1 p2 p3 - MOVDQ \p2, \p1 - pshufb \p3, \p1 -.endm - -################################ - -X0 = %xmm4 -X1 = %xmm5 -X2 = %xmm6 -X3 = %xmm7 - -XTMP0 = %xmm0 -XTMP1 = %xmm1 -XTMP2 = %xmm2 -XTMP3 = %xmm3 -XTMP4 = %xmm8 -XFER = %xmm9 - -SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA -SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 -BYTE_FLIP_MASK = %xmm12 - -NUM_BLKS = %rdx # 3rd arg -INP = %rsi # 2nd arg -CTX = %rdi # 1st arg - -SRND = %rsi # clobbers INP -c = %ecx -d = %r8d -e = %edx -TBL = %r12 -a = %eax -b = %ebx - -f = %r9d -g = %r10d -h = %r11d - -y0 = %r13d -y1 = %r14d -y2 = %r15d - - - -_INP_END_SIZE = 8 -_INP_SIZE = 8 -_XFER_SIZE = 16 -_XMM_SAVE_SIZE = 0 - -_INP_END = 0 -_INP = _INP_END + _INP_END_SIZE -_XFER = _INP + _INP_SIZE -_XMM_SAVE = _XFER + _XFER_SIZE -STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE - -# rotate_Xs -# Rotate values of symbols X0...X3 -.macro rotate_Xs -X_ = X0 -X0 = X1 -X1 = X2 -X2 = X3 -X3 = X_ -.endm - -# ROTATE_ARGS -# Rotate values of symbols a...h -.macro ROTATE_ARGS -TMP_ = h -h = g -g = f -f = e -e = d -d = c -c = b -b = a -a = TMP_ -.endm - -.macro FOUR_ROUNDS_AND_SCHED - ## compute s0 four at a time and s1 two at a time - ## compute W[-16] + W[-7] 4 at a time - movdqa X3, XTMP0 - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - palignr $4, X2, XTMP0 # XTMP0 = W[-7] - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - movdqa X1, XTMP1 - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16] - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - ## compute s0 - palignr $4, X0, XTMP1 # XTMP1 = W[-15] - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - movdqa XTMP1, XTMP2 # XTMP2 = W[-15] - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ 
(a>>22) - add y0, y2 # y2 = S1 + CH - add _XFER(%rsp) , y2 # y2 = k + w + S1 + CH - movdqa XTMP1, XTMP3 # XTMP3 = W[-15] - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pslld $(32-7), XTMP1 # - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - psrld $7, XTMP2 # - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP3, XTMP2 # XTMP2 = W[-15] - mov e, y0 # y0 = e - mov a, y1 # y1 = a - movdqa XTMP3, XTMP4 # XTMP4 = W[-15] - ror $(25-11), y0 # y0 = e >> (25-11) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(22-13), y1 # y1 = a >> (22-13) - pslld $(32-18), XTMP3 # - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - psrld $18, XTMP2 # - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP1 - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - psrld $3, XTMP4 # XTMP4 = W[-15] >> 3 - add y0, y2 # y2 = S1 + CH - add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pxor XTMP4, XTMP1 # XTMP1 = s0 - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - ## compute low s1 - pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA} - mov e, y0 # y0 = e - mov a, y1 # y1 = a - ror $(25-11), y0 # y0 = e >> (25-11) - movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA} - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA} - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - and e, y2 # y2 = (f^g)&e - psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA} - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - xor g, y2 # y2 = CH = ((f^g)&e)^g - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - pxor XTMP3, XTMP2 - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - ## compute high s1 - pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {BBAA} - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - # - ROTATE_ARGS # - movdqa XTMP2, XTMP3 # XTMP3 = 
W[-2] {DDCC} - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - movdqa XTMP2, X0 # X0 = W[-2] {DDCC} - ror $(22-13), y1 # y1 = a >> (22-13) - xor e, y0 # y0 = e ^ (e >> (25-11)) - mov f, y2 # y2 = f - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC} - xor a, y1 # y1 = a ^ (a >> (22-13) - xor g, y2 # y2 = f^g - psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC} - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25 - and e, y2 # y2 = (f^g)&e - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - psrld $10, X0 # X0 = W[-2] >> 10 {DDCC} - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22 - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 - xor g, y2 # y2 = CH = ((f^g)&e)^g - pxor XTMP3, XTMP2 # - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 - add y0, y2 # y2 = S1 + CH - add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH - pxor XTMP2, X0 # X0 = s1 {xDxC} - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - pshufb SHUF_DC00, X0 # X0 = s1 {DC00} - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]} - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - - ROTATE_ARGS - rotate_Xs -.endm - -## input is [rsp + _XFER + %1 * 4] -.macro DO_ROUND round - mov e, y0 # y0 = e - ror $(25-11), y0 # y0 = e >> (25-11) - mov a, y1 # y1 = a - xor e, y0 # y0 = e ^ (e >> (25-11)) - ror $(22-13), y1 # y1 = a >> (22-13) - mov f, y2 # y2 = f - xor a, y1 # y1 = a ^ (a >> (22-13) - ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6)) - xor g, y2 # y2 = f^g - xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) - ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2)) - and e, y2 # y2 = (f^g)&e - xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) - ror $6, y0 # y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) - xor g, y2 # y2 = CH = ((f^g)&e)^g - add y0, y2 # y2 = S1 + CH - ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) - offset = \round * 4 + _XFER - add offset(%rsp), y2 # y2 = k + w + S1 + CH - mov a, y0 # y0 = a - add y2, h # h = h + S1 + CH + k + w - mov a, y2 # y2 = a - or c, y0 # y0 = a|c - add h, d # d = d + h + S1 + CH + k + w - and c, y2 # y2 = a&c - and b, y0 # y0 = (a|c)&b - add y1, h # h = h + S1 + CH + k + w + S0 - or y2, y0 # y0 = MAJ = (a|c)&b)|(a&c) - add y0, h # h = h + S1 + CH + k + w + S0 + MAJ - ROTATE_ARGS -.endm - -######################################################################## -## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data, -## int blocks); -## arg 1 : pointer to state -## (struct sha256_state is assumed to begin with u32 state[8]) -## arg 2 : pointer to input data -## arg 3 : Num blocks -######################################################################## -.text -SYM_TYPED_FUNC_START(sha256_transform_ssse3) - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rbp - mov %rsp, %rbp - - subq $STACK_SIZE, %rsp - and $~15, %rsp - - shl $6, NUM_BLKS # convert to bytes - jz .Ldone_hash - add INP, NUM_BLKS - mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data - - ## load initial digest - mov 4*0(CTX), a - mov 4*1(CTX), b - mov 4*2(CTX), c - mov 4*3(CTX), d - mov 4*4(CTX), e - mov 4*5(CTX), f - mov 4*6(CTX), g - mov 4*7(CTX), h - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK - movdqa _SHUF_00BA(%rip), SHUF_00BA - movdqa _SHUF_DC00(%rip), SHUF_DC00 - -.Lloop0: 
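The SSSE3 block loop that starts at .Lloop0 here has the same shape as the AVX variant earlier in this diff: byte-swap 16 input dwords, run three passes of 16 scheduled rounds, then two passes of 8 rounds with no scheduling, and finally add the working variables back into the state with addm. The assembly interleaves the message schedule with the rounds; separated out for clarity, and reusing the helpers and the assumed K256[] from the sketches above, one block reduces to the following reference outline (not kernel code).

#include <stdint.h>
#include <string.h>

/* Reference outline of one 64-byte block (.Lloop0 body); assumes
 * sha256_round(), sigma0(), sigma1() and K256[] from the earlier sketches. */
static void sha256_block_ref(uint32_t state[8], const uint8_t *in)
{
	uint32_t w[64], s[8];
	int t;

	/* COPY_XMM_AND_BSWAP: load the message words big-endian. */
	for (t = 0; t < 16; t++)
		w[t] = ((uint32_t)in[4 * t] << 24) | ((uint32_t)in[4 * t + 1] << 16) |
		       ((uint32_t)in[4 * t + 2] << 8) | in[4 * t + 3];

	/* FOUR_ROUNDS_AND_SCHED: extend the schedule for rounds 16..63. */
	for (t = 16; t < 64; t++)
		w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];

	memcpy(s, state, sizeof(s));

	/* 48 scheduled rounds (.Lloop1) plus 16 final rounds (.Lloop2). */
	for (t = 0; t < 64; t++)
		sha256_round(s, K256[t] + w[t]);

	/* addm: feed the block result back into the chaining value. */
	for (t = 0; t < 8; t++)
		state[t] += s[t];
}

The vector code computes four schedule words per FOUR_ROUNDS_AND_SCHED invocation, which is why the schedule and the rounds are interleaved there instead of split into separate loops.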
- lea K256(%rip), TBL - - ## byte swap first 16 dwords - COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK - COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK - - mov INP, _INP(%rsp) - - ## schedule 48 input dwords, by doing 3 rounds of 16 each - mov $3, SRND -.align 16 -.Lloop1: - movdqa (TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 1*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 2*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - FOUR_ROUNDS_AND_SCHED - - movdqa 3*16(TBL), XFER - paddd X0, XFER - movdqa XFER, _XFER(%rsp) - add $4*16, TBL - FOUR_ROUNDS_AND_SCHED - - sub $1, SRND - jne .Lloop1 - - mov $2, SRND -.Lloop2: - paddd (TBL), X0 - movdqa X0, _XFER(%rsp) - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - paddd 1*16(TBL), X1 - movdqa X1, _XFER(%rsp) - add $2*16, TBL - DO_ROUND 0 - DO_ROUND 1 - DO_ROUND 2 - DO_ROUND 3 - - movdqa X2, X0 - movdqa X3, X1 - - sub $1, SRND - jne .Lloop2 - - addm (4*0)(CTX),a - addm (4*1)(CTX),b - addm (4*2)(CTX),c - addm (4*3)(CTX),d - addm (4*4)(CTX),e - addm (4*5)(CTX),f - addm (4*6)(CTX),g - addm (4*7)(CTX),h - - mov _INP(%rsp), INP - add $64, INP - cmp _INP_END(%rsp), INP - jne .Lloop0 - -.Ldone_hash: - - mov %rbp, %rsp - popq %rbp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbx - - RET -SYM_FUNC_END(sha256_transform_ssse3) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16 -.align 16 -# shuffle xBxA -> 00BA -_SHUF_00BA: - .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100 - -.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16 -.align 16 -# shuffle xDxC -> DC00 -_SHUF_DC00: - .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF diff --git a/arch/x86/crypto/sha256_ni_asm.S b/arch/x86/crypto/sha256_ni_asm.S deleted file mode 100644 index d515a55a3bc1..000000000000 --- a/arch/x86/crypto/sha256_ni_asm.S +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * This file is provided under a dual BSD/GPLv2 license. When using or - * redistributing this file, you may do so under either license. - * - * GPL LICENSE SUMMARY - * - * Copyright(c) 2015 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * Contact Information: - * Sean Gulley <sean.m.gulley@intel.com> - * Tim Chen <tim.c.chen@linux.intel.com> - * - * BSD LICENSE - * - * Copyright(c) 2015 Intel Corporation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * * Neither the name of Intel Corporation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ - -#include <linux/linkage.h> -#include <linux/cfi_types.h> - -#define DIGEST_PTR %rdi /* 1st arg */ -#define DATA_PTR %rsi /* 2nd arg */ -#define NUM_BLKS %rdx /* 3rd arg */ - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 /* sha256rnds2 implicit operand */ -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 -#define TMP %xmm7 - -#define SHUF_MASK %xmm8 - -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 - -.macro do_4rounds i, m0, m1, m2, m3 -.if \i < 16 - movdqu \i*4(DATA_PTR), \m0 - pshufb SHUF_MASK, \m0 -.endif - movdqa (\i-32)*4(SHA256CONSTANTS), MSG - paddd \m0, MSG - sha256rnds2 STATE0, STATE1 -.if \i >= 12 && \i < 60 - movdqa \m0, TMP - palignr $4, \m3, TMP - paddd TMP, \m1 - sha256msg2 \m0, \m1 -.endif - punpckhqdq MSG, MSG - sha256rnds2 STATE1, STATE0 -.if \i >= 4 && \i < 52 - sha256msg1 \m0, \m3 -.endif -.endm - -/* - * Intel SHA Extensions optimized implementation of a SHA-256 update function - * - * The function takes a pointer to the current hash values, a pointer to the - * input data, and a number of 64 byte blocks to process. Once all blocks have - * been processed, the digest pointer is updated with the resulting hash value. - * The function only processes complete blocks, there is no functionality to - * store partial blocks. All message padding and hash value initialization must - * be done outside the update function. 
- * - * void sha256_ni_transform(uint32_t *digest, const void *data, - uint32_t numBlocks); - * digest : pointer to digest - * data: pointer to input data - * numBlocks: Number of blocks to process - */ - -.text -SYM_TYPED_FUNC_START(sha256_ni_transform) - - shl $6, NUM_BLKS /* convert to bytes */ - jz .Ldone_hash - add DATA_PTR, NUM_BLKS /* pointer to end of data */ - - /* - * load initial hash values - * Need to reorder these appropriately - * DCBA, HGFE -> ABEF, CDGH - */ - movdqu 0*16(DIGEST_PTR), STATE0 /* DCBA */ - movdqu 1*16(DIGEST_PTR), STATE1 /* HGFE */ - - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* FEBA */ - punpckhqdq TMP, STATE1 /* DCHG */ - pshufd $0x1B, STATE0, STATE0 /* ABEF */ - pshufd $0xB1, STATE1, STATE1 /* CDGH */ - - movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK - lea K256+32*4(%rip), SHA256CONSTANTS - -.Lloop0: - /* Save hash values for addition after rounds */ - movdqa STATE0, ABEF_SAVE - movdqa STATE1, CDGH_SAVE - -.irp i, 0, 16, 32, 48 - do_4rounds (\i + 0), MSG0, MSG1, MSG2, MSG3 - do_4rounds (\i + 4), MSG1, MSG2, MSG3, MSG0 - do_4rounds (\i + 8), MSG2, MSG3, MSG0, MSG1 - do_4rounds (\i + 12), MSG3, MSG0, MSG1, MSG2 -.endr - - /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 - - /* Increment data pointer and loop if more to process */ - add $64, DATA_PTR - cmp NUM_BLKS, DATA_PTR - jne .Lloop0 - - /* Write hash values back in the correct order */ - movdqa STATE0, TMP - punpcklqdq STATE1, STATE0 /* GHEF */ - punpckhqdq TMP, STATE1 /* ABCD */ - pshufd $0xB1, STATE0, STATE0 /* HGFE */ - pshufd $0x1B, STATE1, STATE1 /* DCBA */ - - movdqu STATE1, 0*16(DIGEST_PTR) - movdqu STATE0, 1*16(DIGEST_PTR) - -.Ldone_hash: - - RET -SYM_FUNC_END(sha256_ni_transform) - -.section .rodata.cst256.K256, "aM", @progbits, 256 -.align 64 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c deleted file mode 100644 index e04a43d9f7d5..000000000000 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Cryptographic API. - * - * Glue code for the SHA256 Secure Hash Algorithm assembler implementations - * using SSSE3, AVX, AVX2, and SHA-NI instructions. - * - * This file is based on sha256_generic.c - * - * Copyright (C) 2013 Intel Corporation. 
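Stepping back to the sha256_ni_transform interface documented a little earlier in this hunk: it consumes only complete 64-byte blocks and leaves all padding and partial-block buffering to the caller. A hedged illustration of that calling contract, using nothing but the quoted prototype; the wrapper name and leftover-byte handling below are illustrative, not the kernel's.

#include <stddef.h>
#include <stdint.h>

/* Prototype as documented in the deleted sha256_ni_asm.S. */
void sha256_ni_transform(uint32_t *digest, const void *data, uint32_t numBlocks);

/* Hand only whole 64-byte blocks to the transform; return the number of
 * trailing bytes the caller must keep buffered (padding happens elsewhere). */
static size_t sha256_ni_update_blocks(uint32_t digest[8],
				      const uint8_t *data, size_t len)
{
	size_t blocks = len / 64;

	if (blocks)
		sha256_ni_transform(digest, data, (uint32_t)blocks);

	return len % 64;
}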
- * - * Author: - * Tim Chen <tim.c.chen@linux.intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/string.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> - -asmlinkage void sha256_transform_ssse3(struct sha256_state *state, - const u8 *data, int blocks); - -static const struct x86_cpu_id module_cpu_ids[] = { -#ifdef CONFIG_AS_SHA256_NI - X86_MATCH_FEATURE(X86_FEATURE_SHA_NI, NULL), -#endif - X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), - X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), - X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); - -static int _sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len, sha256_block_fn *sha256_xform) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) - return crypto_sha256_update(desc, data, len); - - /* - * Make sure struct sha256_state begins directly with the SHA256 - * 256-bit internal state, as this is what the asm functions expect. - */ - BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); - - kernel_fpu_begin(); - sha256_base_do_update(desc, data, len, sha256_xform); - kernel_fpu_end(); - - return 0; -} - -static int sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out, sha256_block_fn *sha256_xform) -{ - if (!crypto_simd_usable()) - return crypto_sha256_finup(desc, data, len, out); - - kernel_fpu_begin(); - if (len) - sha256_base_do_update(desc, data, len, sha256_xform); - sha256_base_do_finalize(desc, sha256_xform); - kernel_fpu_end(); - - return sha256_base_finish(desc, out); -} - -static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_ssse3); -} - -static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_ssse3); -} - -/* Add padding and return the message digest. 
*/ -static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) -{ - return sha256_ssse3_finup(desc, NULL, 0, out); -} - -static int sha256_ssse3_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_ssse3_finup(desc, data, len, out); -} - -static struct shash_alg sha256_ssse3_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_ssse3_update, - .final = sha256_ssse3_final, - .finup = sha256_ssse3_finup, - .digest = sha256_ssse3_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ssse3", - .cra_priority = 150, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_ssse3_update, - .final = sha256_ssse3_final, - .finup = sha256_ssse3_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ssse3", - .cra_priority = 150, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha256_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - return crypto_register_shashes(sha256_ssse3_algs, - ARRAY_SIZE(sha256_ssse3_algs)); - return 0; -} - -static void unregister_sha256_ssse3(void) -{ - if (boot_cpu_has(X86_FEATURE_SSSE3)) - crypto_unregister_shashes(sha256_ssse3_algs, - ARRAY_SIZE(sha256_ssse3_algs)); -} - -asmlinkage void sha256_transform_avx(struct sha256_state *state, - const u8 *data, int blocks); - -static int sha256_avx_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_avx); -} - -static int sha256_avx_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_avx); -} - -static int sha256_avx_final(struct shash_desc *desc, u8 *out) -{ - return sha256_avx_finup(desc, NULL, 0, out); -} - -static int sha256_avx_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_avx_finup(desc, data, len, out); -} - -static struct shash_alg sha256_avx_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_avx_update, - .final = sha256_avx_final, - .finup = sha256_avx_finup, - .digest = sha256_avx_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-avx", - .cra_priority = 160, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_avx_update, - .final = sha256_avx_final, - .finup = sha256_avx_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-avx", - .cra_priority = 160, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx_usable(void) -{ - if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { - if (boot_cpu_has(X86_FEATURE_AVX)) - pr_info("AVX detected but unusable.\n"); - return false; - } - - return true; -} - -static int register_sha256_avx(void) -{ - if (avx_usable()) - return crypto_register_shashes(sha256_avx_algs, - ARRAY_SIZE(sha256_avx_algs)); - return 0; -} - -static void unregister_sha256_avx(void) -{ - if (avx_usable()) - 
crypto_unregister_shashes(sha256_avx_algs, - ARRAY_SIZE(sha256_avx_algs)); -} - -asmlinkage void sha256_transform_rorx(struct sha256_state *state, - const u8 *data, int blocks); - -static int sha256_avx2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_transform_rorx); -} - -static int sha256_avx2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_transform_rorx); -} - -static int sha256_avx2_final(struct shash_desc *desc, u8 *out) -{ - return sha256_avx2_finup(desc, NULL, 0, out); -} - -static int sha256_avx2_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_avx2_finup(desc, data, len, out); -} - -static struct shash_alg sha256_avx2_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_avx2_update, - .final = sha256_avx2_final, - .finup = sha256_avx2_finup, - .digest = sha256_avx2_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-avx2", - .cra_priority = 170, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_avx2_update, - .final = sha256_avx2_final, - .finup = sha256_avx2_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-avx2", - .cra_priority = 170, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static bool avx2_usable(void) -{ - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && - boot_cpu_has(X86_FEATURE_BMI2)) - return true; - - return false; -} - -static int register_sha256_avx2(void) -{ - if (avx2_usable()) - return crypto_register_shashes(sha256_avx2_algs, - ARRAY_SIZE(sha256_avx2_algs)); - return 0; -} - -static void unregister_sha256_avx2(void) -{ - if (avx2_usable()) - crypto_unregister_shashes(sha256_avx2_algs, - ARRAY_SIZE(sha256_avx2_algs)); -} - -#ifdef CONFIG_AS_SHA256_NI -asmlinkage void sha256_ni_transform(struct sha256_state *digest, - const u8 *data, int rounds); - -static int sha256_ni_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return _sha256_update(desc, data, len, sha256_ni_transform); -} - -static int sha256_ni_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_finup(desc, data, len, out, sha256_ni_transform); -} - -static int sha256_ni_final(struct shash_desc *desc, u8 *out) -{ - return sha256_ni_finup(desc, NULL, 0, out); -} - -static int sha256_ni_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return sha256_base_init(desc) ?: - sha256_ni_finup(desc, data, len, out); -} - -static struct shash_alg sha256_ni_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_ni_update, - .final = sha256_ni_final, - .finup = sha256_ni_finup, - .digest = sha256_ni_digest, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ni", - .cra_priority = 250, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_ni_update, - .final = sha256_ni_final, - .finup = sha256_ni_finup, - .descsize = sizeof(struct sha256_state), - .base = { - .cra_name = 
"sha224", - .cra_driver_name = "sha224-ni", - .cra_priority = 250, - .cra_blocksize = SHA224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int register_sha256_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - return crypto_register_shashes(sha256_ni_algs, - ARRAY_SIZE(sha256_ni_algs)); - return 0; -} - -static void unregister_sha256_ni(void) -{ - if (boot_cpu_has(X86_FEATURE_SHA_NI)) - crypto_unregister_shashes(sha256_ni_algs, - ARRAY_SIZE(sha256_ni_algs)); -} - -#else -static inline int register_sha256_ni(void) { return 0; } -static inline void unregister_sha256_ni(void) { } -#endif - -static int __init sha256_ssse3_mod_init(void) -{ - if (!x86_match_cpu(module_cpu_ids)) - return -ENODEV; - - if (register_sha256_ssse3()) - goto fail; - - if (register_sha256_avx()) { - unregister_sha256_ssse3(); - goto fail; - } - - if (register_sha256_avx2()) { - unregister_sha256_avx(); - unregister_sha256_ssse3(); - goto fail; - } - - if (register_sha256_ni()) { - unregister_sha256_avx2(); - unregister_sha256_avx(); - unregister_sha256_ssse3(); - goto fail; - } - - return 0; -fail: - return -ENODEV; -} - -static void __exit sha256_ssse3_mod_fini(void) -{ - unregister_sha256_ni(); - unregister_sha256_avx2(); - unregister_sha256_avx(); - unregister_sha256_ssse3(); -} - -module_init(sha256_ssse3_mod_init); -module_exit(sha256_ssse3_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated"); - -MODULE_ALIAS_CRYPTO("sha256"); -MODULE_ALIAS_CRYPTO("sha256-ssse3"); -MODULE_ALIAS_CRYPTO("sha256-avx"); -MODULE_ALIAS_CRYPTO("sha256-avx2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha224-ssse3"); -MODULE_ALIAS_CRYPTO("sha224-avx"); -MODULE_ALIAS_CRYPTO("sha224-avx2"); -#ifdef CONFIG_AS_SHA256_NI -MODULE_ALIAS_CRYPTO("sha256-ni"); -MODULE_ALIAS_CRYPTO("sha224-ni"); -#endif diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 6d3b85e53d0e..067684c54395 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -27,17 +27,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/cpu_device_id.h> +#include <asm/simd.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <linux/init.h> +#include <linux/kernel.h> #include <linux/module.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> -#include <asm/cpu_device_id.h> -#include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); @@ -45,11 +41,7 @@ asmlinkage void sha512_transform_ssse3(struct sha512_state *state, static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { - struct sha512_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable() || - (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) - return crypto_sha512_update(desc, data, len); + int remain; /* * Make sure struct sha512_state begins directly with the SHA512 @@ -58,22 +50,17 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); - sha512_base_do_update(desc, data, len, sha512_xform); + remain = sha512_base_do_update_blocks(desc, data, len, sha512_xform); kernel_fpu_end(); - return 0; + return remain; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, 
 		      unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
 {
-	if (!crypto_simd_usable())
-		return crypto_sha512_finup(desc, data, len, out);
-
 	kernel_fpu_begin();
-	if (len)
-		sha512_base_do_update(desc, data, len, sha512_xform);
-	sha512_base_do_finalize(desc, sha512_xform);
+	sha512_base_do_finup(desc, data, len, sha512_xform);
 	kernel_fpu_end();
 	return sha512_base_finish(desc, out);
@@ -91,23 +78,18 @@ static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
 	return sha512_finup(desc, data, len, out, sha512_transform_ssse3);
 }
-/* Add padding and return the message digest. */
-static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_ssse3_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha512_ssse3_algs[] = { {
 	.digestsize	= SHA512_DIGEST_SIZE,
 	.init		= sha512_base_init,
 	.update		= sha512_ssse3_update,
-	.final		= sha512_ssse3_final,
 	.finup		= sha512_ssse3_finup,
-	.descsize	= sizeof(struct sha512_state),
+	.descsize	= SHA512_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sha512",
 		.cra_driver_name = "sha512-ssse3",
 		.cra_priority	= 150,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SHA512_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
@@ -115,13 +97,14 @@ static struct shash_alg sha512_ssse3_algs[] = { {
 	.digestsize	= SHA384_DIGEST_SIZE,
 	.init		= sha384_base_init,
 	.update		= sha512_ssse3_update,
-	.final		= sha512_ssse3_final,
 	.finup		= sha512_ssse3_finup,
-	.descsize	= sizeof(struct sha512_state),
+	.descsize	= SHA512_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sha384",
 		.cra_driver_name = "sha384-ssse3",
 		.cra_priority	= 150,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SHA384_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
@@ -167,23 +150,18 @@ static int sha512_avx_finup(struct shash_desc *desc, const u8 *data,
 	return sha512_finup(desc, data, len, out, sha512_transform_avx);
 }
-/* Add padding and return the message digest. */
-static int sha512_avx_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_avx_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha512_avx_algs[] = { {
 	.digestsize	= SHA512_DIGEST_SIZE,
 	.init		= sha512_base_init,
 	.update		= sha512_avx_update,
-	.final		= sha512_avx_final,
 	.finup		= sha512_avx_finup,
-	.descsize	= sizeof(struct sha512_state),
+	.descsize	= SHA512_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sha512",
 		.cra_driver_name = "sha512-avx",
 		.cra_priority	= 160,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SHA512_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
@@ -191,13 +169,14 @@ static struct shash_alg sha512_avx_algs[] = { {
 	.digestsize	= SHA384_DIGEST_SIZE,
 	.init		= sha384_base_init,
 	.update		= sha512_avx_update,
-	.final		= sha512_avx_final,
 	.finup		= sha512_avx_finup,
-	.descsize	= sizeof(struct sha512_state),
+	.descsize	= SHA512_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sha384",
 		.cra_driver_name = "sha384-avx",
 		.cra_priority	= 160,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SHA384_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
@@ -233,23 +212,18 @@ static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data,
 	return sha512_finup(desc, data, len, out, sha512_transform_rorx);
 }
-/* Add padding and return the message digest. */
-static int sha512_avx2_final(struct shash_desc *desc, u8 *out)
-{
-	return sha512_avx2_finup(desc, NULL, 0, out);
-}
-
 static struct shash_alg sha512_avx2_algs[] = { {
 	.digestsize	= SHA512_DIGEST_SIZE,
 	.init		= sha512_base_init,
 	.update		= sha512_avx2_update,
-	.final		= sha512_avx2_final,
 	.finup		= sha512_avx2_finup,
-	.descsize	= sizeof(struct sha512_state),
+	.descsize	= SHA512_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sha512",
 		.cra_driver_name = "sha512-avx2",
 		.cra_priority	= 170,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SHA512_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
@@ -257,13 +231,14 @@ static struct shash_alg sha512_avx2_algs[] = { {
 	.digestsize	= SHA384_DIGEST_SIZE,
 	.init		= sha384_base_init,
 	.update		= sha512_avx2_update,
-	.final		= sha512_avx2_final,
 	.finup		= sha512_avx2_finup,
-	.descsize	= sizeof(struct sha512_state),
+	.descsize	= SHA512_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sha384",
 		.cra_driver_name = "sha384-avx2",
 		.cra_priority	= 170,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SHA384_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
diff --git a/arch/x86/crypto/sm3_avx_glue.c b/arch/x86/crypto/sm3_avx_glue.c
index 661b6f22ffcd..6e8c42b9dc8e 100644
--- a/arch/x86/crypto/sm3_avx_glue.c
+++ b/arch/x86/crypto/sm3_avx_glue.c
@@ -10,12 +10,11 @@
 #include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
 #include <crypto/sm3.h>
 #include <crypto/sm3_base.h>
-#include <asm/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
 asmlinkage void sm3_transform_avx(struct sm3_state *state,
 				  const u8 *data, int nblocks);
@@ -23,13 +22,7 @@ asmlinkage void sm3_transform_avx(struct sm3_state *state,
 static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
-	struct sm3_state *sctx = shash_desc_ctx(desc);
-
-	if (!crypto_simd_usable() ||
-	    (sctx->count % SM3_BLOCK_SIZE) + len < SM3_BLOCK_SIZE) {
-		sm3_update(sctx, data, len);
-		return 0;
-	}
+	int remain;
 	/*
 	 * Make sure struct sm3_state begins directly with the SM3
@@ -38,45 +31,17 @@ static int sm3_avx_update(struct shash_desc *desc, const u8 *data,
 	BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
 	kernel_fpu_begin();
-	sm3_base_do_update(desc, data, len, sm3_transform_avx);
+	remain = sm3_base_do_update_blocks(desc, data, len, sm3_transform_avx);
 	kernel_fpu_end();
-
-	return 0;
+	return remain;
 }
 static int sm3_avx_finup(struct shash_desc *desc, const u8 *data,
 			 unsigned int len, u8 *out)
 {
-	if (!crypto_simd_usable()) {
-		struct sm3_state *sctx = shash_desc_ctx(desc);
-
-		if (len)
-			sm3_update(sctx, data, len);
-
-		sm3_final(sctx, out);
-		return 0;
-	}
-
 	kernel_fpu_begin();
-	if (len)
-		sm3_base_do_update(desc, data, len, sm3_transform_avx);
-	sm3_base_do_finalize(desc, sm3_transform_avx);
+	sm3_base_do_finup(desc, data, len, sm3_transform_avx);
 	kernel_fpu_end();
-
-	return sm3_base_finish(desc, out);
-}
-
-static int sm3_avx_final(struct shash_desc *desc, u8 *out)
-{
-	if (!crypto_simd_usable()) {
-		sm3_final(shash_desc_ctx(desc), out);
-		return 0;
-	}
-
-	kernel_fpu_begin();
-	sm3_base_do_finalize(desc, sm3_transform_avx);
-	kernel_fpu_end();
-
 	return sm3_base_finish(desc, out);
 }
@@ -84,13 +49,14 @@ static struct shash_alg sm3_avx_alg = {
 	.digestsize	= SM3_DIGEST_SIZE,
 	.init		= sm3_base_init,
 	.update		= sm3_avx_update,
-	.final		= sm3_avx_final,
 	.finup		= sm3_avx_finup,
-	.descsize	= sizeof(struct sm3_state),
+	.descsize	= SM3_STATE_SIZE,
 	.base		= {
 		.cra_name	= "sm3",
 		.cra_driver_name = "sm3-avx",
 		.cra_priority	= 300,
+		.cra_flags	= CRYPTO_AHASH_ALG_BLOCK_ONLY |
+				  CRYPTO_AHASH_ALG_FINUP_MAX,
 		.cra_blocksize	= SM3_BLOCK_SIZE,
 		.cra_module	= THIS_MODULE,
 	}
diff --git a/arch/x86/crypto/sm4_aesni_avx2_glue.c b/arch/x86/crypto/sm4_aesni_avx2_glue.c
index 1148fd4cd57f..fec0ab7a63dd 100644
--- a/arch/x86/crypto/sm4_aesni_avx2_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx2_glue.c
@@ -8,11 +8,10 @@
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
+#include <asm/fpu/api.h>
 #include <linux/module.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/sm4.h>
 #include "sm4-avx.h"
@@ -48,10 +47,9 @@ static int ctr_crypt(struct skcipher_request *req)
 static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 	{
 		.base = {
-			.cra_name	= "__ecb(sm4)",
-			.cra_driver_name = "__ecb-sm4-aesni-avx2",
+			.cra_name	= "ecb(sm4)",
+			.cra_driver_name = "ecb-sm4-aesni-avx2",
 			.cra_priority	= 500,
-			.cra_flags	= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize	= SM4_BLOCK_SIZE,
 			.cra_ctxsize	= sizeof(struct sm4_ctx),
 			.cra_module	= THIS_MODULE,
@@ -64,10 +62,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 		.decrypt	= sm4_avx_ecb_decrypt,
 	}, {
 		.base = {
-			.cra_name	= "__cbc(sm4)",
-			.cra_driver_name = "__cbc-sm4-aesni-avx2",
+			.cra_name	= "cbc(sm4)",
+			.cra_driver_name = "cbc-sm4-aesni-avx2",
 			.cra_priority	= 500,
-			.cra_flags	= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize	= SM4_BLOCK_SIZE,
 			.cra_ctxsize	= sizeof(struct sm4_ctx),
 			.cra_module	= THIS_MODULE,
@@ -81,10 +78,9 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 		.decrypt	= cbc_decrypt,
 	}, {
 		.base = {
-			.cra_name	= "__ctr(sm4)",
-			.cra_driver_name = "__ctr-sm4-aesni-avx2",
+			.cra_name	= "ctr(sm4)",
+			.cra_driver_name = "ctr-sm4-aesni-avx2",
 			.cra_priority	= 500,
-			.cra_flags	= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize	= 1,
 			.cra_ctxsize	= sizeof(struct sm4_ctx),
 			.cra_module	= THIS_MODULE,
@@ -100,9 +96,6 @@ static struct skcipher_alg sm4_aesni_avx2_skciphers[] = {
 	}
 };
-static struct simd_skcipher_alg *
-simd_sm4_aesni_avx2_skciphers[ARRAY_SIZE(sm4_aesni_avx2_skciphers)];
-
 static int __init sm4_init(void)
 {
 	const char *feature_name;
@@ -121,16 +114,14 @@ static int __init sm4_init(void)
 		return -ENODEV;
 	}
-	return simd_register_skciphers_compat(sm4_aesni_avx2_skciphers,
-					      ARRAY_SIZE(sm4_aesni_avx2_skciphers),
-					      simd_sm4_aesni_avx2_skciphers);
+	return crypto_register_skciphers(sm4_aesni_avx2_skciphers,
+					 ARRAY_SIZE(sm4_aesni_avx2_skciphers));
 }
 static void __exit sm4_exit(void)
 {
-	simd_unregister_skciphers(sm4_aesni_avx2_skciphers,
-				  ARRAY_SIZE(sm4_aesni_avx2_skciphers),
-				  simd_sm4_aesni_avx2_skciphers);
+	crypto_unregister_skciphers(sm4_aesni_avx2_skciphers,
+				    ARRAY_SIZE(sm4_aesni_avx2_skciphers));
 }
 module_init(sm4_init);
diff --git a/arch/x86/crypto/sm4_aesni_avx_glue.c b/arch/x86/crypto/sm4_aesni_avx_glue.c
index 85b4ca78b47b..72867fc49ce8 100644
--- a/arch/x86/crypto/sm4_aesni_avx_glue.c
+++ b/arch/x86/crypto/sm4_aesni_avx_glue.c
@@ -8,11 +8,10 @@
 * Copyright (c) 2021 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */
+#include <asm/fpu/api.h>
 #include <linux/module.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
-#include <asm/simd.h>
-#include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/sm4.h>
 #include "sm4-avx.h"
@@ -263,10 +262,9 @@ static int ctr_crypt(struct skcipher_request *req)
 static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 	{
 		.base = {
-			.cra_name	= "__ecb(sm4)",
-			.cra_driver_name = "__ecb-sm4-aesni-avx",
+			.cra_name	= "ecb(sm4)",
+			.cra_driver_name = "ecb-sm4-aesni-avx",
 			.cra_priority	= 400,
-			.cra_flags	= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize	= SM4_BLOCK_SIZE,
 			.cra_ctxsize	= sizeof(struct sm4_ctx),
 			.cra_module	= THIS_MODULE,
@@ -279,10 +277,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 		.decrypt	= sm4_avx_ecb_decrypt,
 	}, {
 		.base = {
-			.cra_name	= "__cbc(sm4)",
-			.cra_driver_name = "__cbc-sm4-aesni-avx",
+			.cra_name	= "cbc(sm4)",
+			.cra_driver_name = "cbc-sm4-aesni-avx",
 			.cra_priority	= 400,
-			.cra_flags	= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize	= SM4_BLOCK_SIZE,
 			.cra_ctxsize	= sizeof(struct sm4_ctx),
 			.cra_module	= THIS_MODULE,
@@ -296,10 +293,9 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 		.decrypt	= cbc_decrypt,
 	}, {
 		.base = {
-			.cra_name	= "__ctr(sm4)",
-			.cra_driver_name = "__ctr-sm4-aesni-avx",
+			.cra_name	= "ctr(sm4)",
+			.cra_driver_name = "ctr-sm4-aesni-avx",
 			.cra_priority	= 400,
-			.cra_flags	= CRYPTO_ALG_INTERNAL,
 			.cra_blocksize	= 1,
 			.cra_ctxsize	= sizeof(struct sm4_ctx),
 			.cra_module	= THIS_MODULE,
@@ -315,9 +311,6 @@ static struct skcipher_alg sm4_aesni_avx_skciphers[] = {
 	}
 };
-static struct simd_skcipher_alg *
-simd_sm4_aesni_avx_skciphers[ARRAY_SIZE(sm4_aesni_avx_skciphers)];
-
 static int __init sm4_init(void)
 {
 	const char *feature_name;
@@ -335,16 +328,14 @@ static int __init sm4_init(void)
 		return -ENODEV;
 	}
-	return simd_register_skciphers_compat(sm4_aesni_avx_skciphers,
-					      ARRAY_SIZE(sm4_aesni_avx_skciphers),
-					      simd_sm4_aesni_avx_skciphers);
+	return crypto_register_skciphers(sm4_aesni_avx_skciphers,
+					 ARRAY_SIZE(sm4_aesni_avx_skciphers));
 }
 static void __exit sm4_exit(void)
 {
-	simd_unregister_skciphers(sm4_aesni_avx_skciphers,
-				  ARRAY_SIZE(sm4_aesni_avx_skciphers),
-				  simd_sm4_aesni_avx_skciphers);
+	crypto_unregister_skciphers(sm4_aesni_avx_skciphers,
+				    ARRAY_SIZE(sm4_aesni_avx_skciphers));
 }
 module_init(sm4_init);
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 3eb3440b477a..9e20db013750 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -13,7 +13,6 @@
 #include <linux/crypto.h>
 #include <linux/err.h>
 #include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
 #include <crypto/twofish.h>
 #include "twofish.h"
@@ -74,10 +73,9 @@ static int cbc_decrypt(struct skcipher_request *req)
 static struct skcipher_alg twofish_algs[] = {
 	{
-		.base.cra_name		= "__ecb(twofish)",
-		.base.cra_driver_name	= "__ecb-twofish-avx",
+		.base.cra_name		= "ecb(twofish)",
+		.base.cra_driver_name	= "ecb-twofish-avx",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= TF_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct twofish_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -87,10 +85,9 @@ static struct skcipher_alg twofish_algs[] = {
 		.encrypt	= ecb_encrypt,
 		.decrypt	= ecb_decrypt,
 	}, {
-		.base.cra_name		= "__cbc(twofish)",
-		.base.cra_driver_name	= "__cbc-twofish-avx",
+		.base.cra_name		= "cbc(twofish)",
+		.base.cra_driver_name	= "cbc-twofish-avx",
 		.base.cra_priority	= 400,
-		.base.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.base.cra_blocksize	= TF_BLOCK_SIZE,
 		.base.cra_ctxsize	= sizeof(struct twofish_ctx),
 		.base.cra_module	= THIS_MODULE,
@@ -103,8 +100,6 @@ static struct skcipher_alg twofish_algs[] = {
 	},
 };
-static struct simd_skcipher_alg *twofish_simd_algs[ARRAY_SIZE(twofish_algs)];
-
 static int __init twofish_init(void)
 {
 	const char *feature_name;
@@ -114,15 +109,13 @@ static int __init twofish_init(void)
 		return -ENODEV;
 	}
-	return simd_register_skciphers_compat(twofish_algs,
-					      ARRAY_SIZE(twofish_algs),
-					      twofish_simd_algs);
+	return crypto_register_skciphers(twofish_algs,
+					 ARRAY_SIZE(twofish_algs));
 }
 static void __exit twofish_exit(void)
 {
-	simd_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs),
-				  twofish_simd_algs);
+	crypto_unregister_skciphers(twofish_algs, ARRAY_SIZE(twofish_algs));
 }
 module_init(twofish_init);
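Editor's note on the pattern the hash conversions above share: the deleted final/digest wrappers and crypto_simd_usable() fallbacks are replaced by glue code that only ever hashes whole blocks inside kernel_fpu_begin()/kernel_fpu_end() and hands the leftover byte count back to the crypto core, which is what the sha512_base_do_update_blocks() and sm3_base_do_update_blocks() hunks and the new CRYPTO_AHASH_ALG_BLOCK_ONLY flag express. The stand-alone C sketch below models only that "process whole blocks, return the remainder" contract; BLOCK_SIZE, fake_transform() and do_update_blocks() are illustrative names invented here, not kernel APIs, and the buffering of the returned remainder that the core performs is not shown.

#include <stdio.h>
#include <stddef.h>

#define BLOCK_SIZE 128			/* SHA-512 block size in bytes */

/* Stand-in for an asm transform such as sha512_transform_ssse3(). */
static void fake_transform(const unsigned char *data, size_t nblocks)
{
	(void)data;
	printf("transforming %zu block(s)\n", nblocks);
}

/*
 * Models the block-only update contract: consume whole blocks only and
 * return the number of trailing bytes, which the caller (the crypto core,
 * in the kernel) is responsible for buffering until the next call.
 */
static size_t do_update_blocks(const unsigned char *data, size_t len)
{
	size_t nblocks = len / BLOCK_SIZE;

	if (nblocks)
		fake_transform(data, nblocks);
	return len % BLOCK_SIZE;	/* the "remain" value in the glue code above */
}

int main(void)
{
	unsigned char msg[300] = { 0 };

	/* 300 = 2 * 128 + 44: two full blocks are hashed, 44 bytes remain. */
	printf("remain = %zu\n", do_update_blocks(msg, sizeof(msg)));
	return 0;
}

Built with any plain C compiler, the example prints remain = 44 for a 300-byte input, mirroring the value a converted update routine would return so the core can carry the partial block into the final finup call.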