Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--   arch/arm64/crypto/Kconfig              |   53
-rw-r--r--   arch/arm64/crypto/Makefile             |   20
-rw-r--r--   arch/arm64/crypto/aes-glue.c           |  124
-rw-r--r--   arch/arm64/crypto/chacha-neon-core.S   |  805
-rw-r--r--   arch/arm64/crypto/chacha-neon-glue.c   |  237
-rw-r--r--   arch/arm64/crypto/ghash-ce-glue.c      |  143
-rw-r--r--   arch/arm64/crypto/poly1305-armv8.pl    |  917
-rw-r--r--   arch/arm64/crypto/poly1305-glue.c      |  232
-rw-r--r--   arch/arm64/crypto/polyval-ce-glue.c    |   73
-rw-r--r--   arch/arm64/crypto/sha1-ce-glue.c       |   70
-rw-r--r--   arch/arm64/crypto/sha2-ce-core.S       |  157
-rw-r--r--   arch/arm64/crypto/sha2-ce-glue.c       |  192
-rw-r--r--   arch/arm64/crypto/sha256-glue.c        |  194
-rw-r--r--   arch/arm64/crypto/sha3-ce-glue.c       |  111
-rw-r--r--   arch/arm64/crypto/sha512-armv8.pl      |  786
-rw-r--r--   arch/arm64/crypto/sha512-ce-glue.c     |   49
-rw-r--r--   arch/arm64/crypto/sha512-glue.c        |   35
-rw-r--r--   arch/arm64/crypto/sm3-ce-glue.c        |   48
-rw-r--r--   arch/arm64/crypto/sm3-neon-glue.c      |   48
-rw-r--r--   arch/arm64/crypto/sm4-ce-glue.c        |  100
20 files changed, 266 insertions, 4128 deletions
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 3418c8d3c78d..c44b0f202a1f 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -25,18 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm64 using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_POLY1305_NEON
-	tristate
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_HASH
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: arm64 using:
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_SHA1_ARM64_CE
 	tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
@@ -48,25 +36,6 @@ config CRYPTO_SHA1_ARM64_CE
 	  Architecture: arm64 using:
 	  - ARMv8 Crypto Extensions
 
-config CRYPTO_SHA256_ARM64
-	tristate "Hash functions: SHA-224 and SHA-256"
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm64
-
-config CRYPTO_SHA2_ARM64_CE
-	tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_HASH
-	select CRYPTO_SHA256_ARM64
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm64 using:
-	  - ARMv8 Crypto Extensions
-
 config CRYPTO_SHA512_ARM64
 	tristate "Hash functions: SHA-384 and SHA-512"
 	select CRYPTO_HASH
@@ -101,7 +70,7 @@ config CRYPTO_SM3_NEON
 	tristate "Hash functions: SM3 (NEON)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
@@ -112,7 +81,7 @@ config CRYPTO_SM3_ARM64_CE
 	tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
@@ -143,7 +112,7 @@ config CRYPTO_AES_ARM64
 
 config CRYPTO_AES_ARM64_CE
 	tristate "Ciphers: AES (ARMv8 Crypto Extensions)"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON
 	select CRYPTO_ALGAPI
 	select CRYPTO_LIB_AES
 	help
@@ -186,20 +155,6 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	  Architecture: arm64 using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_CHACHA20_NEON
-	tristate
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: arm64 using:
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_AES_ARM64_BS
 	tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)"
 	depends on KERNEL_MODE_NEON
@@ -267,7 +222,7 @@ config CRYPTO_SM4_ARM64_NEON_BLK
 
 config CRYPTO_AES_ARM64_CE_CCM
 	tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES_ARM64_CE
 	select CRYPTO_AES_ARM64_CE_BLK
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index e7139c4768ce..c231c980c514 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -8,9 +8,6 @@
 obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
 sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
 
-obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
-sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
-
 obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
 sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
 
@@ -56,19 +53,9 @@ aes-ce-blk-y := aes-glue-ce.o 
aes-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o aes-neon-blk-y := aes-glue-neon.o aes-neon.o -obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o -sha256-arm64-y := sha256-glue.o sha256-core.o - obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64 - obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o @@ -81,10 +68,7 @@ aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $(<) void $(@) -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -$(obj)/sha256-core.S: $(src)/sha512-armv8.pl +$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl $(call cmd,perlasm) -clean-files += poly1305-core.S sha256-core.S sha512-core.S +clean-files += sha512-core.S diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index b0150999743f..81560f722b9d 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -5,19 +5,20 @@ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> */ -#include <asm/neon.h> #include <asm/hwcap.h> -#include <asm/simd.h> +#include <asm/neon.h> #include <crypto/aes.h> #include <crypto/ctr.h> -#include <crypto/sha2.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> -#include <linux/module.h> -#include <linux/cpufeature.h> +#include <crypto/sha2.h> +#include <crypto/utils.h> #include <crypto/xts.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #include "aes-ce-setkey.h" @@ -130,7 +131,6 @@ struct mac_tfm_ctx { }; struct mac_desc_ctx { - unsigned int len; u8 dg[AES_BLOCK_SIZE]; }; @@ -869,109 +869,64 @@ static int mac_init(struct shash_desc *desc) struct mac_desc_ctx *ctx = shash_desc_ctx(desc); memset(ctx->dg, 0, AES_BLOCK_SIZE); - ctx->len = 0; - return 0; } static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, - u8 dg[], int enc_before, int enc_after) + u8 dg[], int enc_before) { int rounds = 6 + ctx->key_length / 4; + int rem; - if (crypto_simd_usable()) { - int rem; - - do { - kernel_neon_begin(); - rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, - dg, enc_before, enc_after); - kernel_neon_end(); - in += (blocks - rem) * AES_BLOCK_SIZE; - blocks = rem; - enc_before = 0; - } while (blocks); - } else { - if (enc_before) - aes_encrypt(ctx, dg, dg); - - while (blocks--) { - crypto_xor(dg, in, AES_BLOCK_SIZE); - in += AES_BLOCK_SIZE; - - if (blocks || enc_after) - aes_encrypt(ctx, dg, dg); - } - } + do { + kernel_neon_begin(); + rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, + dg, enc_before, !enc_before); + kernel_neon_end(); + in += (blocks - rem) * AES_BLOCK_SIZE; + blocks = rem; + } while (blocks); } static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + int blocks = len / AES_BLOCK_SIZE; - while (len > 0) { - unsigned int l; - - if ((ctx->len % AES_BLOCK_SIZE) == 0 && - (ctx->len + len) > AES_BLOCK_SIZE) { - - int blocks = len / 
AES_BLOCK_SIZE; - - len %= AES_BLOCK_SIZE; - - mac_do_update(&tctx->key, p, blocks, ctx->dg, - (ctx->len != 0), (len != 0)); - - p += blocks * AES_BLOCK_SIZE; - - if (!len) { - ctx->len = AES_BLOCK_SIZE; - break; - } - ctx->len = 0; - } - - l = min(len, AES_BLOCK_SIZE - ctx->len); - - if (l <= AES_BLOCK_SIZE) { - crypto_xor(ctx->dg + ctx->len, p, l); - ctx->len += l; - len -= l; - p += l; - } - } - - return 0; + len %= AES_BLOCK_SIZE; + mac_do_update(&tctx->key, p, blocks, ctx->dg, 0); + return len; } -static int cbcmac_final(struct shash_desc *desc, u8 *out) +static int cbcmac_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0); - + if (len) { + crypto_xor(ctx->dg, src, len); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1); + } memcpy(out, ctx->dg, AES_BLOCK_SIZE); - return 0; } -static int cmac_final(struct shash_desc *desc, u8 *out) +static int cmac_finup(struct shash_desc *desc, const u8 *src, unsigned int len, + u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); u8 *consts = tctx->consts; - if (ctx->len != AES_BLOCK_SIZE) { - ctx->dg[ctx->len] ^= 0x80; + crypto_xor(ctx->dg, src, len); + if (len != AES_BLOCK_SIZE) { + ctx->dg[len] ^= 0x80; consts += AES_BLOCK_SIZE; } - - mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1); - + mac_do_update(&tctx->key, consts, 1, ctx->dg, 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); - return 0; } @@ -979,6 +934,8 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cmac(aes)", .base.cra_driver_name = "cmac-aes-" MODE, .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -987,13 +944,15 @@ static struct shash_alg mac_algs[] = { { .digestsize = AES_BLOCK_SIZE, .init = mac_init, .update = mac_update, - .final = cmac_final, + .finup = cmac_finup, .setkey = cmac_setkey, .descsize = sizeof(struct mac_desc_ctx), }, { .base.cra_name = "xcbc(aes)", .base.cra_driver_name = "xcbc-aes-" MODE, .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -1002,21 +961,22 @@ static struct shash_alg mac_algs[] = { { .digestsize = AES_BLOCK_SIZE, .init = mac_init, .update = mac_update, - .final = cmac_final, + .finup = cmac_finup, .setkey = xcbc_setkey, .descsize = sizeof(struct mac_desc_ctx), }, { .base.cra_name = "cbcmac(aes)", .base.cra_driver_name = "cbcmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_blocksize = 1, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, + .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), .base.cra_module = THIS_MODULE, .digestsize = AES_BLOCK_SIZE, .init = mac_init, .update = mac_update, - .final = cbcmac_final, + .finup = cbcmac_finup, .setkey = cbcmac_setkey, .descsize = sizeof(struct mac_desc_ctx), } }; diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S deleted file mode 100644 index b70ac76f2610..000000000000 --- a/arch/arm64/crypto/chacha-neon-core.S +++ /dev/null @@ -1,805 +0,0 @@ -/* - * ChaCha/XChaCha NEON helper functions - * - * Copyright (C) 2016-2018 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Originally based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/cache.h> - - .text - .align 6 - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers v0-v3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * The round count is given in w3. - * - * Clobbers: w3, x10, v4, v12 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - adr_l x10, ROT8 - ld1 {v12.4s}, [x10] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs w3, w3, #2 - b.ne .Ldoubleround - - ret -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - // w3: nrounds - - stp x29, x30, [sp, #-16]! 
- mov x29, sp - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - bl chacha_permute - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(chacha_block_xor_neon) - -SYM_FUNC_START(hchacha_block_neon) - // x0: Input state matrix, s - // x1: output (8 32-bit words) - // w2: nrounds - - stp x29, x30, [sp, #-16]! - mov x29, sp - - ld1 {v0.4s-v3.4s}, [x0] - - mov w3, w2 - bl chacha_permute - - st1 {v0.4s}, [x1], #16 - st1 {v3.4s}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(hchacha_block_neon) - - a0 .req w12 - a1 .req w13 - a2 .req w14 - a3 .req w15 - a4 .req w16 - a5 .req w17 - a6 .req w19 - a7 .req w20 - a8 .req w21 - a9 .req w22 - a10 .req w23 - a11 .req w24 - a12 .req w25 - a13 .req w26 - a14 .req w27 - a15 .req w28 - - .align 6 -SYM_FUNC_START(chacha_4block_xor_neon) - frame_push 10 - - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - // w3: nrounds - // x4: byte count - - adr_l x10, .Lpermute - and x5, x4, #63 - add x10, x10, x5 - - // - // This function encrypts four consecutive ChaCha blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - // At the same time, a fifth block is encrypted in parallel using - // scalar registers - // - adr_l x9, CTRINC // ... 
and ROT8 - ld1 {v30.4s-v31.4s}, [x9] - - // x0..15[0-3] = s0..3[0..3] - add x8, x0, #16 - ld4r { v0.4s- v3.4s}, [x0] - ld4r { v4.4s- v7.4s}, [x8], #16 - ld4r { v8.4s-v11.4s}, [x8], #16 - ld4r {v12.4s-v15.4s}, [x8] - - mov a0, v0.s[0] - mov a1, v1.s[0] - mov a2, v2.s[0] - mov a3, v3.s[0] - mov a4, v4.s[0] - mov a5, v5.s[0] - mov a6, v6.s[0] - mov a7, v7.s[0] - mov a8, v8.s[0] - mov a9, v9.s[0] - mov a10, v10.s[0] - mov a11, v11.s[0] - mov a12, v12.s[0] - mov a13, v13.s[0] - mov a14, v14.s[0] - mov a15, v15.s[0] - - // x12 += counter values 1-4 - add v12.4s, v12.4s, v30.4s - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - rev32 v15.8h, v15.8h - ror a15, a15, #16 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #12 - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - - sri v4.4s, v16.4s, #20 - ror a4, a4, #20 - sri v5.4s, v17.4s, #20 - ror a5, a5, #20 - sri v6.4s, v18.4s, #20 - ror a6, a6, #20 - sri v7.4s, v19.4s, #20 - ror a7, a7, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #7 - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - - sri 
v4.4s, v16.4s, #25 - ror a4, a4, #25 - sri v5.4s, v17.4s, #25 - ror a5, a5, #25 - sri v6.4s, v18.4s, #25 - ror a6, a6, #25 - sri v7.4s, v19.4s, #25 - ror a7, a7, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - rev32 v15.8h, v15.8h - ror a15, a15, #16 - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #12 - shl v6.4s, v17.4s, #12 - shl v7.4s, v18.4s, #12 - shl v4.4s, v19.4s, #12 - - sri v5.4s, v16.4s, #20 - ror a5, a5, #20 - sri v6.4s, v17.4s, #20 - ror a6, a6, #20 - sri v7.4s, v18.4s, #20 - ror a7, a7, #20 - sri v4.4s, v19.4s, #20 - ror a4, a4, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #7 - shl v6.4s, v17.4s, #7 - shl v7.4s, v18.4s, #7 - shl v4.4s, v19.4s, #7 - - sri v5.4s, v16.4s, #25 - ror a5, a5, #25 - sri v6.4s, v17.4s, #25 - ror a6, a6, #25 - sri v7.4s, v18.4s, #25 - ror a7, a7, #25 - sri v4.4s, v19.4s, #25 - ror a4, a4, #25 - - subs w3, w3, #2 - b.ne .Ldoubleround4 - - ld4r {v16.4s-v19.4s}, [x0], #16 - ld4r {v20.4s-v23.4s}, [x0], #16 - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] 
- add v0.4s, v0.4s, v16.4s - mov w6, v16.s[0] - mov w7, v17.s[0] - add v1.4s, v1.4s, v17.4s - mov w8, v18.s[0] - mov w9, v19.s[0] - add v2.4s, v2.4s, v18.4s - add a0, a0, w6 - add a1, a1, w7 - add v3.4s, v3.4s, v19.4s - add a2, a2, w8 - add a3, a3, w9 -CPU_BE( rev a0, a0 ) -CPU_BE( rev a1, a1 ) -CPU_BE( rev a2, a2 ) -CPU_BE( rev a3, a3 ) - - ld4r {v24.4s-v27.4s}, [x0], #16 - ld4r {v28.4s-v31.4s}, [x0] - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - add v4.4s, v4.4s, v20.4s - mov w6, v20.s[0] - mov w7, v21.s[0] - add v5.4s, v5.4s, v21.4s - mov w8, v22.s[0] - mov w9, v23.s[0] - add v6.4s, v6.4s, v22.4s - add a4, a4, w6 - add a5, a5, w7 - add v7.4s, v7.4s, v23.4s - add a6, a6, w8 - add a7, a7, w9 -CPU_BE( rev a4, a4 ) -CPU_BE( rev a5, a5 ) -CPU_BE( rev a6, a6 ) -CPU_BE( rev a7, a7 ) - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - add v8.4s, v8.4s, v24.4s - mov w6, v24.s[0] - mov w7, v25.s[0] - add v9.4s, v9.4s, v25.4s - mov w8, v26.s[0] - mov w9, v27.s[0] - add v10.4s, v10.4s, v26.4s - add a8, a8, w6 - add a9, a9, w7 - add v11.4s, v11.4s, v27.4s - add a10, a10, w8 - add a11, a11, w9 -CPU_BE( rev a8, a8 ) -CPU_BE( rev a9, a9 ) -CPU_BE( rev a10, a10 ) -CPU_BE( rev a11, a11 ) - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - add v12.4s, v12.4s, v28.4s - mov w6, v28.s[0] - mov w7, v29.s[0] - add v13.4s, v13.4s, v29.4s - mov w8, v30.s[0] - mov w9, v31.s[0] - add v14.4s, v14.4s, v30.4s - add a12, a12, w6 - add a13, a13, w7 - add v15.4s, v15.4s, v31.4s - add a14, a14, w8 - add a15, a15, w9 -CPU_BE( rev a12, a12 ) -CPU_BE( rev a13, a13 ) -CPU_BE( rev a14, a14 ) -CPU_BE( rev a15, a15 ) - - // interleave 32-bit words in state n, n+1 - ldp w6, w7, [x2], #64 - zip1 v16.4s, v0.4s, v1.4s - ldp w8, w9, [x2, #-56] - eor a0, a0, w6 - zip2 v17.4s, v0.4s, v1.4s - eor a1, a1, w7 - zip1 v18.4s, v2.4s, v3.4s - eor a2, a2, w8 - zip2 v19.4s, v2.4s, v3.4s - eor a3, a3, w9 - ldp w6, w7, [x2, #-48] - zip1 v20.4s, v4.4s, v5.4s - ldp w8, w9, [x2, #-40] - eor a4, a4, w6 - zip2 v21.4s, v4.4s, v5.4s - eor a5, a5, w7 - zip1 v22.4s, v6.4s, v7.4s - eor a6, a6, w8 - zip2 v23.4s, v6.4s, v7.4s - eor a7, a7, w9 - ldp w6, w7, [x2, #-32] - zip1 v24.4s, v8.4s, v9.4s - ldp w8, w9, [x2, #-24] - eor a8, a8, w6 - zip2 v25.4s, v8.4s, v9.4s - eor a9, a9, w7 - zip1 v26.4s, v10.4s, v11.4s - eor a10, a10, w8 - zip2 v27.4s, v10.4s, v11.4s - eor a11, a11, w9 - ldp w6, w7, [x2, #-16] - zip1 v28.4s, v12.4s, v13.4s - ldp w8, w9, [x2, #-8] - eor a12, a12, w6 - zip2 v29.4s, v12.4s, v13.4s - eor a13, a13, w7 - zip1 v30.4s, v14.4s, v15.4s - eor a14, a14, w8 - zip2 v31.4s, v14.4s, v15.4s - eor a15, a15, w9 - - add x3, x2, x4 - sub x3, x3, #128 // start of last block - - subs x5, x4, #128 - csel x2, x2, x3, ge - - // interleave 64-bit words in state n, n+2 - zip1 v0.2d, v16.2d, v18.2d - zip2 v4.2d, v16.2d, v18.2d - stp a0, a1, [x1], #64 - zip1 v8.2d, v17.2d, v19.2d - zip2 v12.2d, v17.2d, v19.2d - stp a2, a3, [x1, #-56] - - subs x6, x4, #192 - ld1 {v16.16b-v19.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v1.2d, v20.2d, v22.2d - zip2 v5.2d, v20.2d, v22.2d - stp a4, a5, [x1, #-48] - zip1 v9.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - stp a6, a7, [x1, #-40] - - subs x7, x4, #256 - ld1 {v20.16b-v23.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v2.2d, v24.2d, v26.2d - zip2 v6.2d, v24.2d, v26.2d - stp a8, a9, [x1, #-32] - zip1 v10.2d, v25.2d, v27.2d - zip2 v14.2d, v25.2d, v27.2d - stp a10, a11, [x1, #-24] - - subs 
x8, x4, #320 - ld1 {v24.16b-v27.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v3.2d, v28.2d, v30.2d - zip2 v7.2d, v28.2d, v30.2d - stp a12, a13, [x1, #-16] - zip1 v11.2d, v29.2d, v31.2d - zip2 v15.2d, v29.2d, v31.2d - stp a14, a15, [x1, #-8] - - tbnz x5, #63, .Lt128 - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v1.16b - eor v18.16b, v18.16b, v2.16b - eor v19.16b, v19.16b, v3.16b - - tbnz x6, #63, .Lt192 - - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v6.16b - eor v23.16b, v23.16b, v7.16b - - st1 {v16.16b-v19.16b}, [x1], #64 - tbnz x7, #63, .Lt256 - - eor v24.16b, v24.16b, v8.16b - eor v25.16b, v25.16b, v9.16b - eor v26.16b, v26.16b, v10.16b - eor v27.16b, v27.16b, v11.16b - - st1 {v20.16b-v23.16b}, [x1], #64 - tbnz x8, #63, .Lt320 - - eor v28.16b, v28.16b, v12.16b - eor v29.16b, v29.16b, v13.16b - eor v30.16b, v30.16b, v14.16b - eor v31.16b, v31.16b, v15.16b - - st1 {v24.16b-v27.16b}, [x1], #64 - st1 {v28.16b-v31.16b}, [x1] - -.Lout: frame_pop - ret - - // fewer than 192 bytes of in/output -.Lt192: cbz x5, 1f // exactly 128 bytes? - ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - tbl v28.16b, {v4.16b-v7.16b}, v28.16b - tbl v29.16b, {v4.16b-v7.16b}, v29.16b - tbl v30.16b, {v4.16b-v7.16b}, v30.16b - tbl v31.16b, {v4.16b-v7.16b}, v31.16b - -0: eor v20.16b, v20.16b, v28.16b - eor v21.16b, v21.16b, v29.16b - eor v22.16b, v22.16b, v30.16b - eor v23.16b, v23.16b, v31.16b - st1 {v20.16b-v23.16b}, [x5] // overlapping stores -1: st1 {v16.16b-v19.16b}, [x1] - b .Lout - - // fewer than 128 bytes of in/output -.Lt128: ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - sub x1, x1, #64 - tbl v28.16b, {v0.16b-v3.16b}, v28.16b - tbl v29.16b, {v0.16b-v3.16b}, v29.16b - tbl v30.16b, {v0.16b-v3.16b}, v30.16b - tbl v31.16b, {v0.16b-v3.16b}, v31.16b - ld1 {v16.16b-v19.16b}, [x1] // reload first output block - b 0b - - // fewer than 256 bytes of in/output -.Lt256: cbz x6, 2f // exactly 192 bytes? - ld1 {v4.16b-v7.16b}, [x10] - add x6, x6, x1 - tbl v0.16b, {v8.16b-v11.16b}, v4.16b - tbl v1.16b, {v8.16b-v11.16b}, v5.16b - tbl v2.16b, {v8.16b-v11.16b}, v6.16b - tbl v3.16b, {v8.16b-v11.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x6] // overlapping stores -2: st1 {v20.16b-v23.16b}, [x1] - b .Lout - - // fewer than 320 bytes of in/output -.Lt320: cbz x7, 3f // exactly 256 bytes? 
- ld1 {v4.16b-v7.16b}, [x10] - add x7, x7, x1 - tbl v0.16b, {v12.16b-v15.16b}, v4.16b - tbl v1.16b, {v12.16b-v15.16b}, v5.16b - tbl v2.16b, {v12.16b-v15.16b}, v6.16b - tbl v3.16b, {v12.16b-v15.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x7] // overlapping stores -3: st1 {v24.16b-v27.16b}, [x1] - b .Lout -SYM_FUNC_END(chacha_4block_xor_neon) - - .section ".rodata", "a", %progbits - .align L1_CACHE_SHIFT -.Lpermute: - .set .Li, 0 - .rept 128 - .byte (.Li - 64) - .set .Li, .Li + 1 - .endr - -CTRINC: .word 1, 2, 3, 4 -ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c deleted file mode 100644 index 229876acfc58..000000000000 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src, - int nrounds); -asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, - int nrounds, int bytes); -asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, - int bytes, int nrounds) -{ - while (bytes > 0) { - int l = min(bytes, CHACHA_BLOCK_SIZE * 5); - - if (l <= CHACHA_BLOCK_SIZE) { - u8 buf[CHACHA_BLOCK_SIZE]; - - memcpy(buf, src, l); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, l); - state[12] += 1; - break; - } - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { - hchacha_block_generic(state, stream, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, stream, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, - int nrounds) -{ - if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, 
nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_neon_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - chacha_init(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = rounddown(nbytes, walk.stride); - - if (!static_branch_likely(&have_neon) || - !crypto_simd_usable()) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - kernel_neon_begin(); - chacha_doneon(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, ctx->nrounds); - kernel_neon_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_neon_stream_xor(req, ctx, req->iv); -} - -static int xchacha_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - chacha_init(state, ctx->key, req->iv); - hchacha_block_arch(state, subctx.key, ctx->nrounds); - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_neon_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_neon, - .decrypt = chacha_neon, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - } -}; - -static int __init chacha_simd_mod_init(void) -{ - if (!cpu_have_named_feature(ASIMD)) - return 0; - - static_branch_enable(&have_neon); - - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? 
- crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; -} - -static void __exit chacha_simd_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && cpu_have_named_feature(ASIMD)) - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-neon"); diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 071e122f9c37..4995b6e22335 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -6,30 +6,27 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/aes.h> -#include <crypto/gcm.h> -#include <crypto/algapi.h> #include <crypto/b128ops.h> +#include <crypto/gcm.h> +#include <crypto/ghash.h> #include <crypto/gf128mul.h> #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/errno.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ghash"); -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 - #define RFC4106_NONCE_SIZE 4 struct ghash_key { @@ -37,10 +34,8 @@ struct ghash_key { u64 h[][2]; }; -struct ghash_desc_ctx { +struct arm_ghash_desc_ctx { u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; - u8 buf[GHASH_BLOCK_SIZE]; - u32 count; }; struct gcm_aes_ctx { @@ -65,36 +60,12 @@ asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], static int ghash_init(struct shash_desc *desc) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); - *ctx = (struct ghash_desc_ctx){}; + *ctx = (struct arm_ghash_desc_ctx){}; return 0; } -static void ghash_do_update(int blocks, u64 dg[], const char *src, - struct ghash_key *key, const char *head) -{ - be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; - - do { - const u8 *in = src; - - if (head) { - in = head; - blocks++; - head = NULL; - } else { - src += GHASH_BLOCK_SIZE; - } - - crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); - gf128mul_lle(&dst, &key->k); - } while (--blocks); - - dg[0] = be64_to_cpu(dst.b); - dg[1] = be64_to_cpu(dst.a); -} - static __always_inline void ghash_do_simd_update(int blocks, u64 dg[], const char *src, struct ghash_key *key, const char *head, @@ -103,13 +74,9 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src, u64 const h[][2], const char *head)) { - if (likely(crypto_simd_usable())) { - kernel_neon_begin(); - simd_update(blocks, dg, src, key->h, head); - kernel_neon_end(); - } else { - ghash_do_update(blocks, dg, src, key, head); - } + kernel_neon_begin(); + simd_update(blocks, dg, src, key->h, head); + kernel_neon_end(); } /* avoid hogging the CPU for too long */ @@ -118,61 +85,59 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src, static 
int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + int blocks; - ctx->count += len; + blocks = len / GHASH_BLOCK_SIZE; + len -= blocks * GHASH_BLOCK_SIZE; - if ((partial + len) >= GHASH_BLOCK_SIZE) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); - int blocks; - - if (partial) { - int p = GHASH_BLOCK_SIZE - partial; + do { + int chunk = min(blocks, MAX_BLOCKS); - memcpy(ctx->buf + partial, src, p); - src += p; - len -= p; - } + ghash_do_simd_update(chunk, ctx->digest, src, key, NULL, + pmull_ghash_update_p8); + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + } while (unlikely(blocks > 0)); + return len; +} - blocks = len / GHASH_BLOCK_SIZE; - len %= GHASH_BLOCK_SIZE; +static int ghash_export(struct shash_desc *desc, void *out) +{ + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + u8 *dst = out; - do { - int chunk = min(blocks, MAX_BLOCKS); + put_unaligned_be64(ctx->digest[1], dst); + put_unaligned_be64(ctx->digest[0], dst + 8); + return 0; +} - ghash_do_simd_update(chunk, ctx->digest, src, key, - partial ? ctx->buf : NULL, - pmull_ghash_update_p8); +static int ghash_import(struct shash_desc *desc, const void *in) +{ + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + const u8 *src = in; - blocks -= chunk; - src += chunk * GHASH_BLOCK_SIZE; - partial = 0; - } while (unlikely(blocks > 0)); - } - if (len) - memcpy(ctx->buf + partial, src, len); + ctx->digest[1] = get_unaligned_be64(src); + ctx->digest[0] = get_unaligned_be64(src + 8); return 0; } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; - - if (partial) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct ghash_key *key = crypto_shash_ctx(desc->tfm); - memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); + if (len) { + u8 buf[GHASH_BLOCK_SIZE] = {}; - ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL, + memcpy(buf, src, len); + ghash_do_simd_update(1, ctx->digest, src, key, NULL, pmull_ghash_update_p8); + memzero_explicit(buf, sizeof(buf)); } - put_unaligned_be64(ctx->digest[1], dst); - put_unaligned_be64(ctx->digest[0], dst + 8); - - memzero_explicit(ctx, sizeof(*ctx)); - return 0; + return ghash_export(desc, dst); } static void ghash_reflect(u64 h[], const be128 *k) @@ -205,6 +170,7 @@ static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-neon", .base.cra_priority = 150, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]), .base.cra_module = THIS_MODULE, @@ -212,9 +178,12 @@ static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), + .export = ghash_export, + .import = ghash_import, + .descsize = sizeof(struct arm_ghash_desc_ctx), + .statesize = sizeof(struct ghash_desc_ctx), }; static int num_rounds(struct crypto_aes_ctx *ctx) diff --git 
a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl deleted file mode 100644 index 22c9069c0650..000000000000 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ /dev/null @@ -1,917 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# This module implements Poly1305 hash for ARMv8. -# -# June 2015 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc-4.9 NEON -# -# Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.69/+58% 1.47 -# Cortex-A57 2.70/+7% 1.14 -# Denver 1.64/+50% 1.18(*) -# X-Gene 2.13/+68% 2.27 -# Mongoose 1.77/+75% 1.12 -# Kryo 2.70/+55% 1.13 -# ThunderX2 1.17/+95% 1.36 -# -# (*) estimate based on resources availability is less than 1.0, -# i.e. measured result is worse than expected, presumably binary -# translator is not almighty; - -$flavour=shift; -$output=shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); - -my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -.extern OPENSSL_armcap_P -#endif - -.text - -// forward "declarations" are required for Apple -.globl poly1305_blocks -.globl poly1305_emit - -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: - cmp $inp,xzr - stp xzr,xzr,[$ctx] // zero hash value - stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - -#ifndef __KERNEL__ - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] -#endif - - ldp $r0,$r1,[$inp] // load key - mov $s1,#0xfffffffc0fffffff - movk $s1,#0x0fff,lsl#48 -#ifdef __AARCH64EB__ - rev $r0,$r0 // flip bytes - rev $r1,$r1 -#endif - and $r0,$r0,$s1 // &=0ffffffc0fffffff - and $s1,$s1,#-4 - and $r1,$r1,$s1 // &=0ffffffc0ffffffc - mov w#$s1,#-1 - stp $r0,$r1,[$ctx,#32] // save key value - str w#$s1,[$ctx,#48] // impossible key power value - -#ifndef __KERNEL__ - tst w17,#ARMV7_NEON - - adr $d0,.Lpoly1305_blocks - adr $r0,.Lpoly1305_blocks_neon - adr $d1,.Lpoly1305_emit - - csel $d0,$d0,$r0,eq - -# ifdef __ILP32__ - stp w#$d0,w#$d1,[$len] -# else - stp $d0,$d1,[$len] -# endif -#endif - mov x0,#1 -.Lno_key: - ret -.size poly1305_init,.-poly1305_init - -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - ands $len,$len,#-16 - b.eq .Lno_data - - ldp $h0,$h1,[$ctx] // load hash value - ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] - ldp $r0,$r1,[$ctx,#32] // load key value - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp x17,#0 // is_base2_26? 
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - -.Loop: - ldp $t0,$t1,[$inp],#16 // load input - sub $len,$len,#16 -#ifdef __AARCH64EB__ - rev $t0,$t0 - rev $t1,$t1 -#endif - adds $h0,$h0,$t0 // accumulate input - adcs $h1,$h1,$t1 - - mul $d0,$h0,$r0 // h0*r0 - adc $h2,$h2,$padbit - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - cbnz $len,.Loop - - stp $h0,$h1,[$ctx] // store hash value - stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] - -.Lno_data: - ret -.size poly1305_blocks,.-poly1305_blocks - -.type poly1305_emit,%function -.align 5 -poly1305_emit: -.Lpoly1305_emit: - ldp $h0,$h1,[$ctx] // load hash base 2^64 - ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] - ldp $t0,$t1,[$nonce] // load nonce - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp $r0,#0 // is_base2_26? - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __AARCH64EB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __AARCH64EB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size poly1305_emit,.-poly1305_emit -___ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); -my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); -my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); -my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); -my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); -my ($T0,$T1,$MASK) = map("v$_",(29..31)); - -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow - -$code.=<<___; -.type poly1305_mult,%function -.align 5 -poly1305_mult: - mul $d0,$h0,$r0 // h0*r0 - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - ret -.size poly1305_mult,.-poly1305_mult - -.type poly1305_splat,%function -.align 4 -poly1305_splat: - and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,$h0,#26,#26 - extr x14,$h1,$h0,#52 
- and x14,x14,#0x03ffffff - ubfx x15,$h1,#14,#26 - extr x16,$h2,$h1,#40 - - str w12,[$ctx,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[$ctx,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[$ctx,#16*2] // s1 - str w14,[$ctx,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[$ctx,#16*4] // s2 - str w15,[$ctx,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[$ctx,#16*6] // s3 - str w16,[$ctx,#16*7] // r4 - str w15,[$ctx,#16*8] // s4 - - ret -.size poly1305_splat,.-poly1305_splat - -#ifdef __KERNEL__ -.globl poly1305_blocks_neon -#endif -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: -.Lpoly1305_blocks_neon: - ldr $is_base2_26,[$ctx,#24] - cmp $len,#128 - b.lo .Lpoly1305_blocks - - .inst 0xd503233f // paciasp - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - cbz $is_base2_26,.Lbase2_64_neon - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - tst $len,#31 - b.eq .Leven_neon - - ldp $r0,$r1,[$ctx,#32] // load key value - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $d2,$h2,xzr // can be partially reduced... - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - b .Leven_neon - -.align 4 -.Lbase2_64_neon: - ldp $r0,$r1,[$ctx,#32] // load key value - - ldp $h0,$h1,[$ctx] // load hash value base 2^64 - ldr $h2,[$ctx,#16] - - tst $len,#31 - b.eq .Linit_neon - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - -.Linit_neon: - ldr w17,[$ctx,#48] // first table element - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - cmp w17,#-1 // is value impossible? 
- b.ne .Leven_neon - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - - ////////////////////////////////// initialize r^n table - mov $h0,$r0 // r^1 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - mov $h1,$r1 - mov $h2,xzr - add $ctx,$ctx,#48+12 - bl poly1305_splat - - bl poly1305_mult // r^2 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^3 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^4 - sub $ctx,$ctx,#4 - bl poly1305_splat - sub $ctx,$ctx,#48 // restore original $ctx - b .Ldo_neon - -.align 4 -.Leven_neon: - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - -.Ldo_neon: - ldp x8,x12,[$inp,#32] // inp[2:3] - subs $len,$len,#64 - ldp x9,x13,[$inp,#48] - add $in2,$inp,#96 - adrp $zeros,.Lzeros - add $zeros,$zeros,#:lo12:.Lzeros - - lsl $padbit,$padbit,#24 - add x15,$ctx,#48 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN23_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN23_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov $IN23_2,x8 - fmov $IN23_3,x10 - fmov $IN23_4,x12 - - ldp x8,x12,[$inp],#16 // inp[0:1] - ldp x9,x13,[$inp],#48 - - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 {$S4},[x15] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN01_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN01_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi $MASK.2d,#-1 - fmov $IN01_2,x8 - fmov $IN01_3,x10 - fmov $IN01_4,x12 - ushr $MASK.2d,$MASK.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // \___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // \___________________/ \____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. 
- //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs $len,$len,#64 - umull $ACC4,$IN23_0,${R4}[2] - csel $in2,$zeros,$in2,lo - umull $ACC3,$IN23_0,${R3}[2] - umull $ACC2,$IN23_0,${R2}[2] - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - umull $ACC1,$IN23_0,${R1}[2] - ldp x9,x13,[$in2],#48 - umull $ACC0,$IN23_0,${R0}[2] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal $ACC4,$IN23_1,${R3}[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC3,$IN23_1,${R2}[2] - and x5,x9,#0x03ffffff - umlal $ACC2,$IN23_1,${R1}[2] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN23_1,${R0}[2] - ubfx x7,x9,#26,#26 - umlal $ACC0,$IN23_1,${S4}[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal $ACC4,$IN23_2,${R2}[2] - extr x8,x12,x8,#52 - umlal $ACC3,$IN23_2,${R1}[2] - extr x9,x13,x9,#52 - umlal $ACC2,$IN23_2,${R0}[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC1,$IN23_2,${S4}[2] - fmov $IN23_0,x4 - umlal $ACC0,$IN23_2,${S3}[2] - and x8,x8,#0x03ffffff - - umlal $ACC4,$IN23_3,${R1}[2] - and x9,x9,#0x03ffffff - umlal $ACC3,$IN23_3,${R0}[2] - ubfx x10,x12,#14,#26 - umlal $ACC2,$IN23_3,${S4}[2] - ubfx x11,x13,#14,#26 - umlal $ACC1,$IN23_3,${S3}[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC0,$IN23_3,${S2}[2] - fmov $IN23_1,x6 - - add $IN01_2,$IN01_2,$H2 - add x12,$padbit,x12,lsr#40 - umlal $ACC4,$IN23_4,${R0}[2] - add x13,$padbit,x13,lsr#40 - umlal $ACC3,$IN23_4,${S4}[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC2,$IN23_4,${S3}[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN23_4,${S2}[2] - fmov $IN23_2,x8 - umlal $ACC0,$IN23_4,${S1}[2] - fmov $IN23_3,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add $IN01_0,$IN01_0,$H0 - fmov $IN23_4,x12 - umlal $ACC3,$IN01_2,${R1}[0] - ldp x8,x12,[$inp],#16 // inp[0:1] - umlal $ACC0,$IN01_2,${S3}[0] - ldp x9,x13,[$inp],#48 - umlal $ACC4,$IN01_2,${R2}[0] - umlal $ACC1,$IN01_2,${S4}[0] - umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3}[0] - umlal $ACC4,$IN01_0,${R4}[0] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC2,$IN01_0,${R2}[0] - and x5,x9,#0x03ffffff - umlal $ACC0,$IN01_0,${R0}[0] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN01_0,${R1}[0] - ubfx x7,x9,#26,#26 - - add $IN01_3,$IN01_3,$H3 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal $ACC3,$IN01_1,${R2}[0] - extr x8,x12,x8,#52 - umlal $ACC4,$IN01_1,${R3}[0] - extr x9,x13,x9,#52 - umlal $ACC0,$IN01_1,${S4}[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC2,$IN01_1,${R1}[0] - fmov $IN01_0,x4 - umlal $ACC1,$IN01_1,${R0}[0] - and x8,x8,#0x03ffffff - - add $IN01_4,$IN01_4,$H4 - and x9,x9,#0x03ffffff - umlal $ACC3,$IN01_3,${R0}[0] - ubfx x10,x12,#14,#26 - umlal $ACC0,$IN01_3,${S2}[0] - ubfx x11,x13,#14,#26 - umlal $ACC4,$IN01_3,${R1}[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC1,$IN01_3,${S3}[0] - fmov $IN01_1,x6 - umlal $ACC2,$IN01_3,${S4}[0] - add x12,$padbit,x12,lsr#40 - - umlal $ACC3,$IN01_4,${S4}[0] - add x13,$padbit,x13,lsr#40 - umlal $ACC0,$IN01_4,${S1}[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal 
$ACC4,$IN01_4,${R0}[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN01_4,${S2}[0] - fmov $IN01_2,x8 - umlal $ACC2,$IN01_4,${S3}[0] - fmov $IN01_3,x10 - fmov $IN01_4,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr $T0.2d,$ACC3,#26 - xtn $H3,$ACC3 - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - bic $H3,#0xfc,lsl#24 // &=0x03ffffff - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - xtn $H4,$ACC4 - ushr $T1.2d,$ACC1,#26 - xtn $H1,$ACC1 - bic $H4,#0xfc,lsl#24 - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - shrn $T1.2s,$ACC2,#26 - xtn $H2,$ACC2 - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - bic $H1,#0xfc,lsl#24 - add $H3,$H3,$T1.2s // h2 -> h3 - bic $H2,#0xfc,lsl#24 - - shrn $T0.2s,$ACC0,#26 - xtn $H0,$ACC0 - ushr $T1.2s,$H3,#26 - bic $H3,#0xfc,lsl#24 - bic $H0,#0xfc,lsl#24 - add $H1,$H1,$T0.2s // h0 -> h1 - add $H4,$H4,$T1.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup $IN23_2,${IN23_2}[0] - add $IN01_2,$IN01_2,$H2 - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds $len,$len,#32 - b.ne .Long_tail - - dup $IN23_2,${IN01_2}[0] - add $IN23_0,$IN01_0,$H0 - add $IN23_3,$IN01_3,$H3 - add $IN23_1,$IN01_1,$H1 - add $IN23_4,$IN01_4,$H4 - -.Long_tail: - dup $IN23_0,${IN23_0}[0] - umull2 $ACC0,$IN23_2,${S3} - umull2 $ACC3,$IN23_2,${R1} - umull2 $ACC4,$IN23_2,${R2} - umull2 $ACC2,$IN23_2,${R0} - umull2 $ACC1,$IN23_2,${S4} - - dup $IN23_1,${IN23_1}[0] - umlal2 $ACC0,$IN23_0,${R0} - umlal2 $ACC2,$IN23_0,${R2} - umlal2 $ACC3,$IN23_0,${R3} - umlal2 $ACC4,$IN23_0,${R4} - umlal2 $ACC1,$IN23_0,${R1} - - dup $IN23_3,${IN23_3}[0] - umlal2 $ACC0,$IN23_1,${S4} - umlal2 $ACC3,$IN23_1,${R2} - umlal2 $ACC2,$IN23_1,${R1} - umlal2 $ACC4,$IN23_1,${R3} - umlal2 $ACC1,$IN23_1,${R0} - - dup $IN23_4,${IN23_4}[0] - umlal2 $ACC3,$IN23_3,${R0} - umlal2 $ACC4,$IN23_3,${R1} - umlal2 $ACC0,$IN23_3,${S2} - umlal2 $ACC1,$IN23_3,${S3} - umlal2 $ACC2,$IN23_3,${S4} - - umlal2 $ACC3,$IN23_4,${S4} - umlal2 $ACC0,$IN23_4,${S1} - umlal2 $ACC4,$IN23_4,${R0} - umlal2 $ACC1,$IN23_4,${S2} - umlal2 $ACC2,$IN23_4,${S3} - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - add $IN01_0,$IN01_0,$H0 - umlal $ACC3,$IN01_2,${R1} - umlal $ACC0,$IN01_2,${S3} - umlal $ACC4,$IN01_2,${R2} - umlal $ACC1,$IN01_2,${S4} - umlal $ACC2,$IN01_2,${R0} - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3} - umlal $ACC0,$IN01_0,${R0} - umlal $ACC4,$IN01_0,${R4} - umlal $ACC1,$IN01_0,${R1} - umlal $ACC2,$IN01_0,${R2} - - add $IN01_3,$IN01_3,$H3 - umlal $ACC3,$IN01_1,${R2} - umlal $ACC0,$IN01_1,${S4} - umlal $ACC4,$IN01_1,${R3} - umlal $ACC1,$IN01_1,${R0} - umlal $ACC2,$IN01_1,${R1} - - add $IN01_4,$IN01_4,$H4 - umlal $ACC3,$IN01_3,${R0} - umlal $ACC0,$IN01_3,${S2} - umlal $ACC4,$IN01_3,${R1} - umlal $ACC1,$IN01_3,${S3} - umlal $ACC2,$IN01_3,${S4} - - umlal $ACC3,$IN01_4,${S4} - umlal $ACC0,$IN01_4,${S1} - umlal $ACC4,$IN01_4,${R0} - umlal $ACC1,$IN01_4,${S2} - umlal $ACC2,$IN01_4,${S3} - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp 
$ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] - addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] - addp $ACC2,$ACC2,$ACC2 - ldr x30,[sp,#8] - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr $T0.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - and $ACC4,$ACC4,$MASK.2d - ushr $T1.2d,$ACC1,#26 - and $ACC1,$ACC1,$MASK.2d - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - ushr $T1.2d,$ACC2,#26 - and $ACC2,$ACC2,$MASK.2d - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - add $ACC3,$ACC3,$T1.2d // h2 -> h3 - - ushr $T0.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - ushr $T1.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - add $ACC1,$ACC1,$T0.2d // h0 -> h1 - add $ACC4,$ACC4,$T1.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 - mov x4,#1 - st1 {$ACC4}[0],[$ctx] - str x4,[$ctx,#8] // set is_base2_26 - - ldr x29,[sp],#80 - .inst 0xd50323bf // autiasp - ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.pushsection .rodata -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" -.popsection - -.align 2 -#if !defined(__KERNEL__) && !defined(_WIN64) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -___ - -foreach (split("\n",$code)) { - s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or - s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or - (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or - (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or - (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or - (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or - (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); - - s/\.[124]([sd])\[/.$1\[/; - s/w#x([0-9]+)/w$1/g; - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c deleted file mode 100644 index 18883ea438f3..000000000000 --- a/arch/arm64/crypto/poly1305-glue.c +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> -#include <crypto/algapi.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/jump_label.h> -#include <linux/module.h> - -asmlinkage void poly1305_init_arm64(void *state, const u8 *key); -asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_init_arm64(&dctx->h, key); - dctx->s[0] = get_unaligned_le32(key + 16); - dctx->s[1] = get_unaligned_le32(key + 20); - dctx->s[2] = get_unaligned_le32(key + 24); - dctx->s[3] = get_unaligned_le32(key + 28); - dctx->buflen = 0; -} -EXPORT_SYMBOL(poly1305_init_arch); - -static int neon_poly1305_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - - return 0; -} - -static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, - u32 len, u32 hibit, bool do_neon) -{ - if (unlikely(!dctx->sset)) { - if (!dctx->rset) { - poly1305_init_arm64(&dctx->h, src); - src += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - if (len < POLY1305_BLOCK_SIZE) - return; - } - - len &= ~(POLY1305_BLOCK_SIZE - 1); - - if (static_branch_likely(&have_neon) && likely(do_neon)) - poly1305_blocks_neon(&dctx->h, src, len, hibit); - else - poly1305_blocks(&dctx->h, src, len, hibit); -} - -static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx, - const u8 *src, u32 len, bool do_neon) -{ - if (unlikely(dctx->buflen)) { - u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); - - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - len -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - neon_poly1305_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE, 1, false); - dctx->buflen = 0; - } - } - - if (likely(len >= POLY1305_BLOCK_SIZE)) { - neon_poly1305_blocks(dctx, src, len, 1, do_neon); - src += round_down(len, POLY1305_BLOCK_SIZE); - len %= POLY1305_BLOCK_SIZE; - } - - if (unlikely(len)) { - dctx->buflen = len; - memcpy(dctx->buf, src, len); - } -} - -static int neon_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - bool do_neon = crypto_simd_usable() && srclen > 128; - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (static_branch_likely(&have_neon) && do_neon) - kernel_neon_begin(); - neon_poly1305_do_update(dctx, src, srclen, do_neon); - if (static_branch_likely(&have_neon) && do_neon) - kernel_neon_end(); - return 0; -} - -void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, - unsigned int nbytes) -{ - if (unlikely(dctx->buflen)) { - u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); - - memcpy(dctx->buf + 
dctx->buflen, src, bytes); - src += bytes; - nbytes -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); - dctx->buflen = 0; - } - } - - if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { - unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); - - if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(&dctx->h, src, todo, 1); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else { - poly1305_blocks(&dctx->h, src, len, 1); - src += len; - } - nbytes %= POLY1305_BLOCK_SIZE; - } - - if (unlikely(nbytes)) { - dctx->buflen = nbytes; - memcpy(dctx->buf, src, nbytes); - } -} -EXPORT_SYMBOL(poly1305_update_arch); - -void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -{ - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } - - poly1305_emit(&dctx->h, dst, dctx->s); - memzero_explicit(dctx, sizeof(*dctx)); -} -EXPORT_SYMBOL(poly1305_final_arch); - -static int neon_poly1305_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); - return 0; -} - -static struct shash_alg neon_poly1305_alg = { - .init = neon_poly1305_init, - .update = neon_poly1305_update, - .final = neon_poly1305_final, - .digestsize = POLY1305_DIGEST_SIZE, - .descsize = sizeof(struct poly1305_desc_ctx), - - .base.cra_name = "poly1305", - .base.cra_driver_name = "poly1305-neon", - .base.cra_priority = 200, - .base.cra_blocksize = POLY1305_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}; - -static int __init neon_poly1305_mod_init(void) -{ - if (!cpu_have_named_feature(ASIMD)) - return 0; - - static_branch_enable(&have_neon); - - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? - crypto_register_shash(&neon_poly1305_alg) : 0; -} - -static void __exit neon_poly1305_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD)) - crypto_unregister_shash(&neon_poly1305_alg); -} - -module_init(neon_poly1305_mod_init); -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c index 0a3b5718df85..c4e653688ea0 100644 --- a/arch/arm64/crypto/polyval-ce-glue.c +++ b/arch/arm64/crypto/polyval-ce-glue.c @@ -15,17 +15,15 @@ * ARMv8 Crypto Extensions instructions to implement the finite field operations. 
*/ -#include <crypto/algapi.h> +#include <asm/neon.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/polyval.h> -#include <linux/crypto.h> -#include <linux/init.h> +#include <crypto/utils.h> +#include <linux/cpufeature.h> +#include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/cpufeature.h> -#include <asm/neon.h> -#include <asm/simd.h> +#include <linux/string.h> #define NUM_KEY_POWERS 8 @@ -38,7 +36,6 @@ struct polyval_tfm_ctx { struct polyval_desc_ctx { u8 buffer[POLYVAL_BLOCK_SIZE]; - u32 bytes; }; asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, @@ -48,25 +45,16 @@ asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { - if (likely(crypto_simd_usable())) { - kernel_neon_begin(); - pmull_polyval_update(keys, in, nblocks, accumulator); - kernel_neon_end(); - } else { - polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, - nblocks, accumulator); - } + kernel_neon_begin(); + pmull_polyval_update(keys, in, nblocks, accumulator); + kernel_neon_end(); } static void internal_polyval_mul(u8 *op1, const u8 *op2) { - if (likely(crypto_simd_usable())) { - kernel_neon_begin(); - pmull_polyval_mul(op1, op2); - kernel_neon_end(); - } else { - polyval_mul_non4k(op1, op2); - } + kernel_neon_begin(); + pmull_polyval_mul(op1, op2); + kernel_neon_end(); } static int polyval_arm64_setkey(struct crypto_shash *tfm, @@ -103,49 +91,27 @@ static int polyval_arm64_update(struct shash_desc *desc, { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - u8 *pos; unsigned int nblocks; - unsigned int n; - - if (dctx->bytes) { - n = min(srclen, dctx->bytes); - pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; - - dctx->bytes -= n; - srclen -= n; - while (n--) - *pos++ ^= *src++; - - if (!dctx->bytes) - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - while (srclen >= POLYVAL_BLOCK_SIZE) { + do { /* allow rescheduling every 4K bytes */ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; internal_polyval_update(tctx, src, nblocks, dctx->buffer); srclen -= nblocks * POLYVAL_BLOCK_SIZE; src += nblocks * POLYVAL_BLOCK_SIZE; - } + } while (srclen >= POLYVAL_BLOCK_SIZE); - if (srclen) { - dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; - pos = dctx->buffer; - while (srclen--) - *pos++ ^= *src++; - } - - return 0; + return srclen; } -static int polyval_arm64_final(struct shash_desc *desc, u8 *dst) +static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - if (dctx->bytes) { + if (len) { + crypto_xor(dctx->buffer, src, len); internal_polyval_mul(dctx->buffer, tctx->key_powers[NUM_KEY_POWERS-1]); } @@ -159,13 +125,14 @@ static struct shash_alg polyval_alg = { .digestsize = POLYVAL_DIGEST_SIZE, .init = polyval_arm64_init, .update = polyval_arm64_update, - .final = polyval_arm64_final, + .finup = polyval_arm64_finup, .setkey = polyval_arm64_setkey, .descsize = sizeof(struct polyval_desc_ctx), .base = { .cra_name = "polyval", .cra_driver_name = "polyval-ce", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = POLYVAL_BLOCK_SIZE, .cra_ctxsize = sizeof(struct polyval_tfm_ctx), .cra_module = THIS_MODULE, diff 
--git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index cbd14f208f83..65b6980817e5 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -7,14 +7,14 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); @@ -56,79 +56,49 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - if (!crypto_simd_usable()) - return crypto_sha1_update(desc, data, len); - sctx->finalize = 0; - sha1_base_do_update(desc, data, len, sha1_ce_transform); - - return 0; + return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); } static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE) && len; - - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, data, len, out); + bool finalized = false; /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. */ - sctx->finalize = finalize; - - sha1_base_do_update(desc, data, len, sha1_ce_transform); - if (!finalize) - sha1_base_do_finalize(desc, sha1_ce_transform); - return sha1_base_finish(desc, out); -} - -static int sha1_ce_final(struct shash_desc *desc, u8 *out) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, NULL, 0, out); - - sctx->finalize = 0; - sha1_base_do_finalize(desc, sha1_ce_transform); + if (len >= SHA1_BLOCK_SIZE) { + unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE); + + finalized = !remain; + sctx->finalize = finalized; + sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); + data += len - remain; + len = remain; + } + if (!finalized) { + sctx->finalize = 0; + sha1_base_do_finup(desc, data, len, sha1_ce_transform); + } return sha1_base_finish(desc, out); } -static int sha1_ce_export(struct shash_desc *desc, void *out) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(out, &sctx->sst, sizeof(struct sha1_state)); - return 0; -} - -static int sha1_ce_import(struct shash_desc *desc, const void *in) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(&sctx->sst, in, sizeof(struct sha1_state)); - sctx->finalize = 0; - return 0; -} - static struct shash_alg alg = { .init = sha1_base_init, .update = sha1_ce_update, - .final = sha1_ce_final, .finup = sha1_ce_finup, - .import = sha1_ce_import, - .export = sha1_ce_export, .descsize = sizeof(struct sha1_ce_state), - .statesize = sizeof(struct sha1_state), + .statesize = SHA1_STATE_SIZE, .digestsize = SHA1_DIGEST_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S deleted file mode 100644 index fce84d88ddb2..000000000000 --- a/arch/arm64/crypto/sha2-ce-core.S +++ /dev/null 
@@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions - * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - dga .req q20 - dgav .req v20 - dgb .req q21 - dgbv .req v21 - - t0 .req v22 - t1 .req v23 - - dg0q .req q24 - dg0v .req v24 - dg1q .req q25 - dg1v .req v25 - dg2q .req q26 - dg2v .req v26 - - .macro add_only, ev, rc, s0 - mov dg2v.16b, dg0v.16b - .ifeq \ev - add t1.4s, v\s0\().4s, \rc\().4s - sha256h dg0q, dg1q, t0.4s - sha256h2 dg1q, dg2q, t0.4s - .else - .ifnb \s0 - add t0.4s, v\s0\().4s, \rc\().4s - .endif - sha256h dg0q, dg1q, t1.4s - sha256h2 dg1q, dg2q, t1.4s - .endif - .endm - - .macro add_update, ev, rc, s0, s1, s2, s3 - sha256su0 v\s0\().4s, v\s1\().4s - add_only \ev, \rc, \s1 - sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s - .endm - - /* - * The SHA-256 round constants - */ - .section ".rodata", "a" - .align 4 -.Lsha2_rcon: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - - /* - * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src, - * int blocks) - */ - .text -SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] - - /* load state */ - ld1 {dgav.4s, dgbv.4s}, [x0] - - /* load sha256_ce_state::finalize */ - ldr_l w4, sha256_ce_offsetof_finalize, x4 - ldr w4, [x0, x4] - - /* load input */ -0: ld1 {v16.4s-v19.4s}, [x1], #64 - sub w2, w2, #1 - -CPU_LE( rev32 v16.16b, v16.16b ) -CPU_LE( rev32 v17.16b, v17.16b ) -CPU_LE( rev32 v18.16b, v18.16b ) -CPU_LE( rev32 v19.16b, v19.16b ) - -1: add t0.4s, v16.4s, v0.4s - mov dg0v.16b, dgav.16b - mov dg1v.16b, dgbv.16b - - add_update 0, v1, 16, 17, 18, 19 - add_update 1, v2, 17, 18, 19, 16 - add_update 0, v3, 18, 19, 16, 17 - add_update 1, v4, 19, 16, 17, 18 - - add_update 0, v5, 16, 17, 18, 19 - add_update 1, v6, 17, 18, 19, 16 - add_update 0, v7, 18, 19, 16, 17 - add_update 1, v8, 19, 16, 17, 18 - - add_update 0, v9, 16, 17, 18, 19 - add_update 1, v10, 17, 18, 19, 16 - add_update 0, v11, 18, 19, 16, 17 - add_update 1, v12, 19, 16, 17, 18 - - add_only 0, v13, 17 - add_only 1, v14, 18 - add_only 0, v15, 19 - add_only 1 - - /* update state */ - add dgav.4s, dgav.4s, dg0v.4s - add dgbv.4s, dgbv.4s, dg1v.4s - - /* handled all input blocks? */ - cbz w2, 2f - cond_yield 3f, x5, x6 - b 0b - - /* - * Final block: add padding and total bit count. 
- * Skip if the input size was not a round multiple of the block size, - * the padding is handled by the C code in that case. - */ -2: cbz x4, 3f - ldr_l w4, sha256_ce_offsetof_count, x4 - ldr x4, [x0, x4] - movi v17.2d, #0 - mov x8, #0x80000000 - movi v18.2d, #0 - ror x7, x4, #29 // ror(lsl(x4, 3), 32) - fmov d16, x8 - mov x4, #0 - mov v19.d[0], xzr - mov v19.d[1], x7 - b 1b - - /* store new state */ -3: st1 {dgav.4s, dgbv.4s}, [x0] - mov w0, w2 - ret -SYM_FUNC_END(__sha256_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c deleted file mode 100644 index 6b4866a88ded..000000000000 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions - * - * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/module.h> - -MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha256"); - -struct sha256_ce_state { - struct sha256_state sst; - u32 finalize; -}; - -extern const u32 sha256_ce_offsetof_count; -extern const u32 sha256_ce_offsetof_finalize; - -asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src, - int blocks); - -static void sha256_ce_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - while (blocks) { - int rem; - - kernel_neon_begin(); - rem = __sha256_ce_transform(container_of(sst, - struct sha256_ce_state, - sst), src, blocks); - kernel_neon_end(); - src += (blocks - rem) * SHA256_BLOCK_SIZE; - blocks = rem; - } -} - -const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, - sst.count); -const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, - finalize); - -asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks); - -static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - sha256_block_data_order(sst->state, src, blocks); -} - -static int sha256_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) - return sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - - sctx->finalize = 0; - sha256_base_do_update(desc, data, len, sha256_ce_transform); - - return 0; -} - -static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE) && len; - - if (!crypto_simd_usable()) { - if (len) - sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - sha256_base_do_finalize(desc, sha256_arm64_transform); - return sha256_base_finish(desc, out); - } - - /* - * Allow the asm code to perform the finalization if there is no - * partial data and the input is a round multiple of the block size. 
- */ - sctx->finalize = finalize; - - sha256_base_do_update(desc, data, len, sha256_ce_transform); - if (!finalize) - sha256_base_do_finalize(desc, sha256_ce_transform); - return sha256_base_finish(desc, out); -} - -static int sha256_ce_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) { - sha256_base_do_finalize(desc, sha256_arm64_transform); - return sha256_base_finish(desc, out); - } - - sctx->finalize = 0; - sha256_base_do_finalize(desc, sha256_ce_transform); - return sha256_base_finish(desc, out); -} - -static int sha256_ce_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha256_base_init(desc); - return sha256_ce_finup(desc, data, len, out); -} - -static int sha256_ce_export(struct shash_desc *desc, void *out) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(out, &sctx->sst, sizeof(struct sha256_state)); - return 0; -} - -static int sha256_ce_import(struct shash_desc *desc, const void *in) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(&sctx->sst, in, sizeof(struct sha256_state)); - sctx->finalize = 0; - return 0; -} - -static struct shash_alg algs[] = { { - .init = sha224_base_init, - .update = sha256_ce_update, - .final = sha256_ce_final, - .finup = sha256_ce_finup, - .export = sha256_ce_export, - .import = sha256_ce_import, - .descsize = sizeof(struct sha256_ce_state), - .statesize = sizeof(struct sha256_state), - .digestsize = SHA224_DIGEST_SIZE, - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ce", - .cra_priority = 200, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .init = sha256_base_init, - .update = sha256_ce_update, - .final = sha256_ce_final, - .finup = sha256_ce_finup, - .digest = sha256_ce_digest, - .export = sha256_ce_export, - .import = sha256_ce_import, - .descsize = sizeof(struct sha256_ce_state), - .statesize = sizeof(struct sha256_state), - .digestsize = SHA256_DIGEST_SIZE, - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ce", - .cra_priority = 200, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha2_ce_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha2_ce_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA2, sha2_ce_mod_init); -module_exit(sha2_ce_mod_fini); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c deleted file mode 100644 index 35356987cc1e..000000000000 --- a/arch/arm64/crypto/sha256-glue.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64 - * - * Copyright (c) 2016 Linaro Ltd. 
<ard.biesheuvel@linaro.org> - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/types.h> - -MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64"); -MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha256"); - -asmlinkage void sha256_block_data_order(u32 *digest, const void *data, - unsigned int num_blks); -EXPORT_SYMBOL(sha256_block_data_order); - -static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - sha256_block_data_order(sst->state, src, blocks); -} - -asmlinkage void sha256_block_neon(u32 *digest, const void *data, - unsigned int num_blks); - -static void sha256_neon_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - sha256_block_neon(sst->state, src, blocks); -} - -static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha256_base_do_update(desc, data, len, sha256_arm64_transform); -} - -static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (len) - sha256_base_do_update(desc, data, len, sha256_arm64_transform); - sha256_base_do_finalize(desc, sha256_arm64_transform); - - return sha256_base_finish(desc, out); -} - -static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out) -{ - return crypto_sha256_arm64_finup(desc, NULL, 0, out); -} - -static struct shash_alg algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = crypto_sha256_arm64_update, - .final = crypto_sha256_arm64_final, - .finup = crypto_sha256_arm64_finup, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha256", - .base.cra_driver_name = "sha256-arm64", - .base.cra_priority = 125, - .base.cra_blocksize = SHA256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = crypto_sha256_arm64_update, - .final = crypto_sha256_arm64_final, - .finup = crypto_sha256_arm64_finup, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-arm64", - .base.cra_priority = 125, - .base.cra_blocksize = SHA224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int sha256_update_neon(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) - return sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - - while (len > 0) { - unsigned int chunk = len; - - /* - * Don't hog the CPU for the entire time it takes to process all - * input when running on a preemptible kernel, but process the - * data block by block instead. 
- */ - if (IS_ENABLED(CONFIG_PREEMPTION) && - chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE) - chunk = SHA256_BLOCK_SIZE - - sctx->count % SHA256_BLOCK_SIZE; - - kernel_neon_begin(); - sha256_base_do_update(desc, data, chunk, sha256_neon_transform); - kernel_neon_end(); - data += chunk; - len -= chunk; - } - return 0; -} - -static int sha256_finup_neon(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) { - if (len) - sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - sha256_base_do_finalize(desc, sha256_arm64_transform); - } else { - if (len) - sha256_update_neon(desc, data, len); - kernel_neon_begin(); - sha256_base_do_finalize(desc, sha256_neon_transform); - kernel_neon_end(); - } - return sha256_base_finish(desc, out); -} - -static int sha256_final_neon(struct shash_desc *desc, u8 *out) -{ - return sha256_finup_neon(desc, NULL, 0, out); -} - -static struct shash_alg neon_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_update_neon, - .final = sha256_final_neon, - .finup = sha256_finup_neon, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha256", - .base.cra_driver_name = "sha256-arm64-neon", - .base.cra_priority = 150, - .base.cra_blocksize = SHA256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_update_neon, - .final = sha256_final_neon, - .finup = sha256_finup_neon, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-arm64-neon", - .base.cra_priority = 150, - .base.cra_blocksize = SHA224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha256_mod_init(void) -{ - int ret = crypto_register_shashes(algs, ARRAY_SIZE(algs)); - if (ret) - return ret; - - if (cpu_have_named_feature(ASIMD)) { - ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs)); - if (ret) - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); - } - return ret; -} - -static void __exit sha256_mod_fini(void) -{ - if (cpu_have_named_feature(ASIMD)) - crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs)); - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(sha256_mod_init); -module_exit(sha256_mod_fini); diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index 5662c3ac49e9..b4f1001046c9 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -12,13 +12,13 @@ #include <asm/hwcap.h> #include <asm/neon.h> #include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sha3.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); @@ -35,74 +35,55 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha3_state *sctx = shash_desc_ctx(desc); - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - - if (!crypto_simd_usable()) - return crypto_sha3_update(desc, data, len); - - if ((sctx->partial + len) >= sctx->rsiz) { - int blocks; - - if (sctx->partial) { - int p = sctx->rsiz - sctx->partial; - - memcpy(sctx->buf + sctx->partial, data, p); - kernel_neon_begin(); - 
sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); - kernel_neon_end(); - - data += p; - len -= p; - sctx->partial = 0; - } - - blocks = len / sctx->rsiz; - len %= sctx->rsiz; - - while (blocks) { - int rem; - - kernel_neon_begin(); - rem = sha3_ce_transform(sctx->st, data, blocks, - digest_size); - kernel_neon_end(); - data += (blocks - rem) * sctx->rsiz; - blocks = rem; - } - } - - if (len) { - memcpy(sctx->buf + sctx->partial, data, len); - sctx->partial += len; - } - return 0; + struct crypto_shash *tfm = desc->tfm; + unsigned int bs, ds; + int blocks; + + ds = crypto_shash_digestsize(tfm); + bs = crypto_shash_blocksize(tfm); + blocks = len / bs; + len -= blocks * bs; + do { + int rem; + + kernel_neon_begin(); + rem = sha3_ce_transform(sctx->st, data, blocks, ds); + kernel_neon_end(); + data += (blocks - rem) * bs; + blocks = rem; + } while (blocks); + return len; } -static int sha3_final(struct shash_desc *desc, u8 *out) +static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, + u8 *out) { struct sha3_state *sctx = shash_desc_ctx(desc); - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + struct crypto_shash *tfm = desc->tfm; __le64 *digest = (__le64 *)out; + u8 block[SHA3_224_BLOCK_SIZE]; + unsigned int bs, ds; int i; - if (!crypto_simd_usable()) - return crypto_sha3_final(desc, out); + ds = crypto_shash_digestsize(tfm); + bs = crypto_shash_blocksize(tfm); + memcpy(block, src, len); - sctx->buf[sctx->partial++] = 0x06; - memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial); - sctx->buf[sctx->rsiz - 1] |= 0x80; + block[len++] = 0x06; + memset(block + len, 0, bs - len); + block[bs - 1] |= 0x80; kernel_neon_begin(); - sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + sha3_ce_transform(sctx->st, block, 1, ds); kernel_neon_end(); + memzero_explicit(block , sizeof(block)); - for (i = 0; i < digest_size / 8; i++) + for (i = 0; i < ds / 8; i++) put_unaligned_le64(sctx->st[i], digest++); - if (digest_size & 4) + if (ds & 4) put_unaligned_le32(sctx->st[i], (__le32 *)digest); - memzero_explicit(sctx, sizeof(*sctx)); return 0; } @@ -110,10 +91,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_224_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-224", .base.cra_driver_name = "sha3-224-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -121,10 +103,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_256_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-256", .base.cra_driver_name = "sha3-256-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -132,10 +115,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_384_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-384", .base.cra_driver_name = "sha3-384-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority 
= 200, @@ -143,10 +127,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_512_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-512", .base.cra_driver_name = "sha3-512-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl deleted file mode 100644 index 35ec9ae99fe1..000000000000 --- a/arch/arm64/crypto/sha512-armv8.pl +++ /dev/null @@ -1,786 +0,0 @@ -#! /usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. -# -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# SHA256/512 for ARMv8. -# -# Performance in cycles per processed byte and improvement coefficient -# over code generated with "default" compiler: -# -# SHA256-hw SHA256(*) SHA512 -# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -# Denver 2.01 10.5 (+26%) 6.70 (+8%) -# X-Gene 20.0 (+100%) 12.8 (+300%(***)) -# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -# -# (*) Software SHA256 results are of lesser relevance, presented -# mostly for informational purposes. -# (**) The result is a trade-off: it's possible to improve it by -# 10% (or by 1 cycle per round), but at the cost of 20% loss -# on Cortex-A53 (or by 4 cycles per round). -# (***) Super-impressive coefficients over gcc-generated code are -# indication of some compiler "pathology", most notably code -# generated with -mgeneral-regs-only is significantly faster -# and the gap is only 40-90%. -# -# October 2016. -# -# Originally it was reckoned that it makes no sense to implement NEON -# version of SHA256 for 64-bit processors. This is because performance -# improvement on most wide-spread Cortex-A5x processors was observed -# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was -# observed that 32-bit NEON SHA256 performs significantly better than -# 64-bit scalar version on *some* of the more recent processors. As -# result 64-bit NEON version of SHA256 was added to provide best -# all-round performance. For example it executes ~30% faster on X-Gene -# and Mongoose. 
[For reference, NEON version of SHA512 is bound to -# deliver much less improvement, likely *negative* on Cortex-A5x. -# Which is why NEON support is limited to SHA256.] - -$output=pop; -$flavour=pop; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open OUT,"| \"$^X\" $xlate $flavour $output"; - *STDOUT=*OUT; -} else { - open STDOUT,">$output"; -} - -if ($output =~ /512/) { - $BITS=512; - $SZ=8; - @Sigma0=(28,34,39); - @Sigma1=(14,18,41); - @sigma0=(1, 8, 7); - @sigma1=(19,61, 6); - $rounds=80; - $reg_t="x"; -} else { - $BITS=256; - $SZ=4; - @Sigma0=( 2,13,22); - @Sigma1=( 6,11,25); - @sigma0=( 7,18, 3); - @sigma1=(17,19,10); - $rounds=64; - $reg_t="w"; -} - -$func="sha${BITS}_block_data_order"; - -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); - -@X=map("$reg_t$_",(3..15,0..2)); -@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); -($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); - -sub BODY_00_xx { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; -my $j=($i+1)&15; -my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); - $T0=@X[$i+3] if ($i<11); - -$code.=<<___ if ($i<16); -#ifndef __AARCH64EB__ - rev @X[$i],@X[$i] // $i -#endif -___ -$code.=<<___ if ($i<13 && ($i&1)); - ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ -___ -$code.=<<___ if ($i==13); - ldp @X[14],@X[15],[$inp] -___ -$code.=<<___ if ($i>=14); - ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] -___ -$code.=<<___ if ($i>0 && $i<16); - add $a,$a,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=11); - str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] -___ -# While ARMv8 specifies merged rotate-n-logical operation such as -# 'eor x,y,z,ror#n', it was found to negatively affect performance -# on Apple A7. The reason seems to be that it requires even 'y' to -# be available earlier. This means that such merged instruction is -# not necessarily best choice on critical path... On the other hand -# Cortex-A5x handles merged instructions much better than disjoint -# rotate and logical... See (**) footnote above. 
-$code.=<<___ if ($i<15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` - and $t1,$f,$e - bic $t2,$g,$e - add $h,$h,@X[$i&15] // h+=X[i] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) - ror $T0,$a,#$Sigma0[0] - add $h,$h,$t1 // h+=Ch(e,f,g) - eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` - add $h,$h,$t0 // h+=Sigma1(e) - and $t3,$t3,$t2 // (b^c)&=(a^b) - add $d,$d,$h // d+=h - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - //add $h,$h,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - ror $T1,@X[($j+1)&15],#$sigma0[0] - and $t1,$f,$e - ror $T2,@X[($j+14)&15],#$sigma1[0] - bic $t2,$g,$e - ror $T0,$a,#$Sigma0[0] - add $h,$h,@X[$i&15] // h+=X[i] - eor $t0,$t0,$e,ror#$Sigma1[1] - eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) - eor $T0,$T0,$a,ror#$Sigma0[1] - add $h,$h,$t1 // h+=Ch(e,f,g) - and $t3,$t3,$t2 // (b^c)&=(a^b) - eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] - eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) - add $h,$h,$t0 // h+=Sigma1(e) - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) - eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) - add @X[$j],@X[$j],@X[($j+9)&15] - add $d,$d,$h // d+=h - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - add @X[$j],@X[$j],$T1 - add $h,$h,$t1 // h+=Sigma0(a) - add @X[$j],@X[$j],$T2 -___ - ($t2,$t3)=($t3,$t2); -} - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#endif - -.text - -.extern OPENSSL_armcap_P -.globl $func -.type $func,%function -.align 6 -$func: -___ -$code.=<<___ if ($SZ==4); -#ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry - tst w16,#ARMV7_NEON - b.ne .Lneon_entry -#endif -___ -$code.=<<___; - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*$SZ - - ldp $A,$B,[$ctx] // load context - ldp $C,$D,[$ctx,#2*$SZ] - ldp $E,$F,[$ctx,#4*$SZ] - add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input - ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,.LK$BITS - stp $ctx,$num,[x29,#96] - -.Loop: - ldp @X[0],@X[1],[$inp],#2*$SZ - ldr $t2,[$Ktbl],#$SZ // *K++ - eor $t3,$B,$C // magic seed - str $inp,[x29,#112] -___ -for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=".Loop_16_xx:\n"; -for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - cbnz $t2,.Loop_16_xx - - ldp $ctx,$num,[x29,#96] - ldr $inp,[x29,#112] - sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind - - ldp @X[0],@X[1],[$ctx] - ldp @X[2],@X[3],[$ctx,#2*$SZ] - add $inp,$inp,#14*$SZ // advance input pointer - ldp @X[4],@X[5],[$ctx,#4*$SZ] - add $A,$A,@X[0] - ldp @X[6],@X[7],[$ctx,#6*$SZ] - add $B,$B,@X[1] - add $C,$C,@X[2] - add $D,$D,@X[3] - stp $A,$B,[$ctx] - add $E,$E,@X[4] - add $F,$F,@X[5] - stp $C,$D,[$ctx,#2*$SZ] - add $G,$G,@X[6] - add $H,$H,@X[7] - cmp $inp,$num - stp $E,$F,[$ctx,#4*$SZ] - stp $G,$H,[$ctx,#6*$SZ] - b.ne .Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*$SZ - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - ret -.size $func,.-$func - -.align 6 -.type .LK$BITS,%object -.LK$BITS: -___ -$code.=<<___ if ($SZ==8); - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - .quad 0 // terminator -___ -$code.=<<___ if ($SZ==4); - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -___ -$code.=<<___; -.size .LK$BITS,.-.LK$BITS -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ - .long OPENSSL_armcap_P-. -# else - .quad OPENSSL_armcap_P-. -# endif -#endif -.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" -.align 2 -___ - -if ($SZ==4) { -my $Ktbl="x3"; - -my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); -my @MSG=map("v$_.16b",(4..7)); -my ($W0,$W1)=("v16.4s","v17.4s"); -my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); - -$code.=<<___; -#ifndef __KERNEL__ -.type sha256_block_armv8,%function -.align 6 -sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 - -.Loop_hw: - ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 - ld1.32 {$W0},[$Ktbl],#16 - rev32 @MSG[0],@MSG[0] - rev32 @MSG[1],@MSG[1] - rev32 @MSG[2],@MSG[2] - rev32 @MSG[3],@MSG[3] - orr $ABCD_SAVE,$ABCD,$ABCD // offload - orr $EFGH_SAVE,$EFGH,$EFGH -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - ld1.32 {$W0},[$Ktbl],#16 - add.i32 $W1,$W1,@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - ld1.32 {$W1},[$Ktbl] - add.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - add.i32 $W1,$W1,@MSG[3] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - add.i32 $ABCD,$ABCD,$ABCD_SAVE - add.i32 $EFGH,$EFGH,$EFGH_SAVE - - cbnz $num,.Loop_hw - - st1.32 {$ABCD,$EFGH},[$ctx] - - ldr x29,[sp],#16 - ret -.size sha256_block_armv8,.-sha256_block_armv8 -#endif -___ -} - -if ($SZ==4) { ######################################### NEON stuff # -# You'll surely note a lot of similarities with sha256-armv4 module, -# and of course it's not a coincidence. sha256-armv4 was used as -# initial template, but was adapted for ARMv8 instruction set and -# extensively re-tuned for all-round performance. 
- -my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); -my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; -my @X = map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); -my $j=0; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } -sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } -sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - &ushr_32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] - eval(shift(@insns)); - &sli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T4,$T7,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T4,$T7,32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T5,$T7,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T7,$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_u32 ($T3,$T7,32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T6,@X[0],$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T7,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T6,@X[0],32-$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T5,@X[0],$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T6); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T5,@X[0],32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl], #16"); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T5); - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dhi($T5), &Dlo($T7)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - while($#insns>=1) { eval(shift(@insns)); } - &st1_32 ("{$T0}","[$Xfer], #16"); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub 
Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_8 ("{@X[0]}","[$inp],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &rev32 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &st1_32 ("{$T0}","[$Xfer], #16"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past - '&and ($t1,$f,$e)', - '&bic ($t4,$g,$e)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&orr ($t1,$t1,$t4)', # Ch(e,f,g) - '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ror ($t0,$t0,"#$Sigma1[0]")', - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&ror ($t4,$t4,"#$Sigma0[0]")', - '&add ($d,$d,$h)', # d+=h - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -#ifdef __KERNEL__ -.globl sha256_block_neon -#endif -.type sha256_block_neon,%function -.align 4 -sha256_block_neon: -.Lneon_entry: - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr $Ktbl,.LK256 - add $num,$inp,$num,lsl#6 // len to point at the end of inp - - ld1.8 {@X[0]},[$inp], #16 - ld1.8 {@X[1]},[$inp], #16 - ld1.8 {@X[2]},[$inp], #16 - ld1.8 {@X[3]},[$inp], #16 - ld1.32 {$T0},[$Ktbl], #16 - ld1.32 {$T1},[$Ktbl], #16 - ld1.32 {$T2},[$Ktbl], #16 - ld1.32 {$T3},[$Ktbl], #16 - rev32 @X[0],@X[0] // yes, even on - rev32 @X[1],@X[1] // big-endian - rev32 @X[2],@X[2] - rev32 @X[3],@X[3] - mov $Xfer,sp - add.32 $T0,$T0,@X[0] - add.32 $T1,$T1,@X[1] - add.32 $T2,$T2,@X[2] - st1.32 {$T0-$T1},[$Xfer], #32 - add.32 $T3,$T3,@X[3] - st1.32 {$T2-$T3},[$Xfer] - sub $Xfer,$Xfer,#32 - - ldp $A,$B,[$ctx] - ldp $C,$D,[$ctx,#8] - ldp $E,$F,[$ctx,#16] - ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] - mov $t2,wzr - eor $t3,$B,$C - mov $t4,wzr - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - add $A,$A,$t4 // h+=Sigma0(a) from the past - ldp $t0,$t1,[$ctx,#0] - add $A,$A,$t2 // h+=Maj(a,b,c) from the past - ldp $t2,$t3,[$ctx,#8] - add $A,$A,$t0 // accumulate - add $B,$B,$t1 - ldp $t0,$t1,[$ctx,#16] - add $C,$C,$t2 - add $D,$D,$t3 - ldp $t2,$t3,[$ctx,#24] - add $E,$E,$t0 - add $F,$F,$t1 - ldr $t1,[sp,#0] - stp $A,$B,[$ctx,#0] - add $G,$G,$t2 - mov $t2,wzr - stp $C,$D,[$ctx,#8] - add $H,$H,$t3 - stp $E,$F,[$ctx,#16] - eor $t3,$B,$C - stp $G,$H,[$ctx,#24] - mov $t4,wzr - mov $Xfer,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size sha256_block_neon,.-sha256_block_neon -___ -} - -$code.=<<___; -#ifndef __KERNEL__ -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -{ my %opcode = ( - "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, - "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o - && - sprintf ".inst\t0x%08x\t//%s %s", - $opcode{$mnemonic}|$1|($2<<5)|($3<<16), - $mnemonic,$arg; - } -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach(split("\n",$code)) { - - s/\`([^\`]*)\`/eval($1)/ge; - - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - - s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - - s/\.[ui]?8(\s)/$1/; - s/\.\w?32\b// and s/\.16b/\.4s/g; - m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; - - print $_,"\n"; -} - -close STDOUT; diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index 071f64293227..6fb3001fa2c9 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -10,14 +10,11 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); @@ -29,12 +26,10 @@ MODULE_ALIAS_CRYPTO("sha512"); asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src, 
int blocks); -asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); - static void sha512_ce_transform(struct sha512_state *sst, u8 const *src, int blocks) { - while (blocks) { + do { int rem; kernel_neon_begin(); @@ -42,67 +37,47 @@ static void sha512_ce_transform(struct sha512_state *sst, u8 const *src, kernel_neon_end(); src += (blocks - rem) * SHA512_BLOCK_SIZE; blocks = rem; - } -} - -static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, - int blocks) -{ - sha512_block_data_order(sst->state, src, blocks); + } while (blocks); } static int sha512_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform - : sha512_arm64_transform; - - sha512_base_do_update(desc, data, len, fn); - return 0; + return sha512_base_do_update_blocks(desc, data, len, + sha512_ce_transform); } static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform - : sha512_arm64_transform; - - sha512_base_do_update(desc, data, len, fn); - sha512_base_do_finalize(desc, fn); - return sha512_base_finish(desc, out); -} - -static int sha512_ce_final(struct shash_desc *desc, u8 *out) -{ - sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform - : sha512_arm64_transform; - - sha512_base_do_finalize(desc, fn); + sha512_base_do_finup(desc, data, len, sha512_ce_transform); return sha512_base_finish(desc, out); } static struct shash_alg algs[] = { { .init = sha384_base_init, .update = sha512_ce_update, - .final = sha512_ce_final, .finup = sha512_ce_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .digestsize = SHA384_DIGEST_SIZE, .base.cra_name = "sha384", .base.cra_driver_name = "sha384-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { .init = sha512_base_init, .update = sha512_ce_update, - .final = sha512_ce_final, .finup = sha512_ce_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .digestsize = SHA512_DIGEST_SIZE, .base.cra_name = "sha512", .base.cra_driver_name = "sha512-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 62f129dea83d..15aa9d8b7b2c 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -6,11 +6,10 @@ */ #include <crypto/internal/hash.h> -#include <linux/types.h> -#include <linux/string.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> -#include <asm/neon.h> +#include <linux/kernel.h> +#include <linux/module.h> MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64"); MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); @@ -19,59 +18,53 @@ MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha512"); -asmlinkage void sha512_block_data_order(u64 *digest, const void *data, - unsigned int num_blks); -EXPORT_SYMBOL(sha512_block_data_order); +asmlinkage void sha512_blocks_arch(u64 *digest, const void *data, + unsigned int num_blks); static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, int blocks) { - sha512_block_data_order(sst->state, src, blocks); + 
sha512_blocks_arch(sst->state, src, blocks); } static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha512_base_do_update(desc, data, len, sha512_arm64_transform); + return sha512_base_do_update_blocks(desc, data, len, + sha512_arm64_transform); } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (len) - sha512_base_do_update(desc, data, len, sha512_arm64_transform); - sha512_base_do_finalize(desc, sha512_arm64_transform); - + sha512_base_do_finup(desc, data, len, sha512_arm64_transform); return sha512_base_finish(desc, out); } -static int sha512_final(struct shash_desc *desc, u8 *out) -{ - return sha512_finup(desc, NULL, 0, out); -} - static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_update, - .final = sha512_final, .finup = sha512_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base.cra_name = "sha512", .base.cra_driver_name = "sha512-arm64", .base.cra_priority = 150, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_update, - .final = sha512_final, .finup = sha512_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base.cra_name = "sha384", .base.cra_driver_name = "sha384-arm64", .base.cra_priority = 150, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 1a71788c4cda..eac6f5fa0abe 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -6,14 +6,11 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions"); @@ -26,50 +23,20 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) { - sm3_update(shash_desc_ctx(desc), data, len); - return 0; - } + int remain; kernel_neon_begin(); - sm3_base_do_update(desc, data, len, sm3_ce_transform); + remain = sm3_base_do_update_blocks(desc, data, len, sm3_ce_transform); kernel_neon_end(); - - return 0; -} - -static int sm3_ce_final(struct shash_desc *desc, u8 *out) -{ - if (!crypto_simd_usable()) { - sm3_final(shash_desc_ctx(desc), out); - return 0; - } - - kernel_neon_begin(); - sm3_base_do_finalize(desc, sm3_ce_transform); - kernel_neon_end(); - - return sm3_base_finish(desc, out); + return remain; } static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (len) - sm3_update(sctx, data, len); - sm3_final(sctx, out); - return 0; - } - kernel_neon_begin(); - if (len) - sm3_base_do_update(desc, data, len, sm3_ce_transform); - sm3_base_do_finalize(desc, sm3_ce_transform); + sm3_base_do_finup(desc, data, len, sm3_ce_transform); kernel_neon_end(); - return 
sm3_base_finish(desc, out); } @@ -77,11 +44,12 @@ static struct shash_alg sm3_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = sm3_ce_update, - .final = sm3_ce_final, .finup = sm3_ce_finup, - .descsize = sizeof(struct sm3_state), + .descsize = SM3_STATE_SIZE, .base.cra_name = "sm3", .base.cra_driver_name = "sm3-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 400, diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c index 8dd71ce79b69..6c4611a503a3 100644 --- a/arch/arm64/crypto/sm3-neon-glue.c +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -6,14 +6,11 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> @@ -23,50 +20,20 @@ asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src, static int sm3_neon_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) { - sm3_update(shash_desc_ctx(desc), data, len); - return 0; - } + int remain; kernel_neon_begin(); - sm3_base_do_update(desc, data, len, sm3_neon_transform); + remain = sm3_base_do_update_blocks(desc, data, len, sm3_neon_transform); kernel_neon_end(); - - return 0; -} - -static int sm3_neon_final(struct shash_desc *desc, u8 *out) -{ - if (!crypto_simd_usable()) { - sm3_final(shash_desc_ctx(desc), out); - return 0; - } - - kernel_neon_begin(); - sm3_base_do_finalize(desc, sm3_neon_transform); - kernel_neon_end(); - - return sm3_base_finish(desc, out); + return remain; } static int sm3_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (len) - sm3_update(sctx, data, len); - sm3_final(sctx, out); - return 0; - } - kernel_neon_begin(); - if (len) - sm3_base_do_update(desc, data, len, sm3_neon_transform); - sm3_base_do_finalize(desc, sm3_neon_transform); + sm3_base_do_finup(desc, data, len, sm3_neon_transform); kernel_neon_end(); - return sm3_base_finish(desc, out); } @@ -74,11 +41,12 @@ static struct shash_alg sm3_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = sm3_neon_update, - .final = sm3_neon_final, .finup = sm3_neon_finup, - .descsize = sizeof(struct sm3_state), + .descsize = SM3_STATE_SIZE, .base.cra_name = "sm3", .base.cra_driver_name = "sm3-neon", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 43741bed874e..7a60e7b559dc 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -8,19 +8,18 @@ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ -#include <linux/module.h> -#include <linux/crypto.h> -#include <linux/kernel.h> -#include <linux/cpufeature.h> #include <asm/neon.h> -#include <asm/simd.h> #include <crypto/b128ops.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> #include <crypto/internal/hash.h> +#include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> -#include <crypto/xts.h> #include <crypto/sm4.h> 
+#include <crypto/utils.h> +#include <crypto/xts.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #define BYTES2BLKS(nbytes) ((nbytes) >> 4) @@ -64,7 +63,6 @@ struct sm4_mac_tfm_ctx { }; struct sm4_mac_desc_ctx { - unsigned int len; u8 digest[SM4_BLOCK_SIZE]; }; @@ -591,8 +589,6 @@ static int sm4_mac_init(struct shash_desc *desc) struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); memset(ctx->digest, 0, SM4_BLOCK_SIZE); - ctx->len = 0; - return 0; } @@ -601,87 +597,50 @@ static int sm4_mac_update(struct shash_desc *desc, const u8 *p, { struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int l, nblocks; - - if (len == 0) - return 0; - - if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) { - l = min(len, SM4_BLOCK_SIZE - ctx->len); - - crypto_xor(ctx->digest + ctx->len, p, l); - ctx->len += l; - len -= l; - p += l; - } - - if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) { - kernel_neon_begin(); - - if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) { - sm4_ce_crypt_block(tctx->key.rkey_enc, - ctx->digest, ctx->digest); - ctx->len = 0; - } else { - nblocks = len / SM4_BLOCK_SIZE; - len %= SM4_BLOCK_SIZE; + unsigned int nblocks = len / SM4_BLOCK_SIZE; - sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, - nblocks, (ctx->len == SM4_BLOCK_SIZE), - (len != 0)); - - p += nblocks * SM4_BLOCK_SIZE; - - if (len == 0) - ctx->len = SM4_BLOCK_SIZE; - } - - kernel_neon_end(); - - if (len) { - crypto_xor(ctx->digest, p, len); - ctx->len = len; - } - } - - return 0; + len %= SM4_BLOCK_SIZE; + kernel_neon_begin(); + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, + nblocks, false, true); + kernel_neon_end(); + return len; } -static int sm4_cmac_final(struct shash_desc *desc, u8 *out) +static int sm4_cmac_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); const u8 *consts = tctx->consts; - if (ctx->len != SM4_BLOCK_SIZE) { - ctx->digest[ctx->len] ^= 0x80; + crypto_xor(ctx->digest, src, len); + if (len != SM4_BLOCK_SIZE) { + ctx->digest[len] ^= 0x80; consts += SM4_BLOCK_SIZE; } - kernel_neon_begin(); sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1, false, true); kernel_neon_end(); - memcpy(out, ctx->digest, SM4_BLOCK_SIZE); - return 0; } -static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out) +static int sm4_cbcmac_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); - if (ctx->len) { + if (len) { + crypto_xor(ctx->digest, src, len); kernel_neon_begin(); sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest, ctx->digest); kernel_neon_end(); } - memcpy(out, ctx->digest, SM4_BLOCK_SIZE); - return 0; } @@ -691,6 +650,8 @@ static struct shash_alg sm4_mac_algs[] = { .cra_name = "cmac(sm4)", .cra_driver_name = "cmac-sm4-ce", .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) + SM4_BLOCK_SIZE * 2, @@ -699,7 +660,7 @@ static struct shash_alg sm4_mac_algs[] = { .digestsize = SM4_BLOCK_SIZE, .init = sm4_mac_init, .update = sm4_mac_update, - .final = sm4_cmac_final, + .finup = sm4_cmac_finup, .setkey = sm4_cmac_setkey, .descsize = sizeof(struct 
sm4_mac_desc_ctx), }, { @@ -707,6 +668,8 @@ static struct shash_alg sm4_mac_algs[] = { .cra_name = "xcbc(sm4)", .cra_driver_name = "xcbc-sm4-ce", .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) + SM4_BLOCK_SIZE * 2, @@ -715,7 +678,7 @@ static struct shash_alg sm4_mac_algs[] = { .digestsize = SM4_BLOCK_SIZE, .init = sm4_mac_init, .update = sm4_mac_update, - .final = sm4_cmac_final, + .finup = sm4_cmac_finup, .setkey = sm4_xcbc_setkey, .descsize = sizeof(struct sm4_mac_desc_ctx), }, { @@ -723,14 +686,15 @@ static struct shash_alg sm4_mac_algs[] = { .cra_name = "cbcmac(sm4)", .cra_driver_name = "cbcmac-sm4-ce", .cra_priority = 400, - .cra_blocksize = 1, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, + .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx), .cra_module = THIS_MODULE, }, .digestsize = SM4_BLOCK_SIZE, .init = sm4_mac_init, .update = sm4_mac_update, - .final = sm4_cbcmac_final, + .finup = sm4_cbcmac_finup, .setkey = sm4_cbcmac_setkey, .descsize = sizeof(struct sm4_mac_desc_ctx), } |
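
Note on the deleted sha512-armv8.pl code above: the NEON Xupdate() routine it removes vectorises the standard SHA-256 message-schedule recurrence (the sigma0/sigma1 steps called out in its comments). The scalar C below states that recurrence as defined in FIPS 180-4 for reference; it is generic SHA-256 arithmetic, not a transcription of the removed assembly, and the helper names are ours.

#include <stdint.h>
#include <stdio.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t sigma0(uint32_t x)	/* small sigma0, FIPS 180-4 */
{
	return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)	/* small sigma1, FIPS 180-4 */
{
	return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

/* expand the 16-word block w[0..15] into the 64-word schedule w[0..63] */
static void sha256_schedule(uint32_t w[64])
{
	for (int t = 16; t < 64; t++)
		w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}

int main(void)
{
	uint32_t w[64];

	for (int i = 0; i < 16; i++)
		w[i] = (uint32_t)i;
	sha256_schedule(w);
	printf("w[16]=%08x w[63]=%08x\n", (unsigned)w[16], (unsigned)w[63]);
	return 0;
}

The deleted Xupdate() computes the same thing four words at a time (X[0..3] += X[9..12], then += sigma0(X[1..4]), then += sigma1 of the trailing words), feeding the result straight into the round constants table.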
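
The sha512-ce-glue.c, sha512-glue.c and sm3 glue conversions above all have the same shape: the ->final() callbacks and the crypto_simd_usable() fallbacks are dropped, ->update() calls the *_base_do_update_blocks() helper and returns the number of unprocessed tail bytes (visible in the sm3 code, which returns "remain"), ->finup() absorbs that tail via *_base_do_finup(), and CRYPTO_AHASH_ALG_BLOCK_ONLY marks the algorithm so the crypto core, rather than each driver, buffers partial blocks, which is also why descsize shrinks to SHA512_STATE_SIZE. The stand-alone sketch below models only that calling convention; every toy_* name and the toy transform are made up for illustration and are not the kernel helpers.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

struct toy_state {
	uint32_t acc;
};

/* stand-in for the assembly block transform: whole blocks only */
static void toy_transform(struct toy_state *st, const uint8_t *src, int blocks)
{
	for (int b = 0; b < blocks; b++)
		for (int i = 0; i < BLOCK_SIZE; i++)
			st->acc = st->acc * 31 + src[b * BLOCK_SIZE + i];
}

/* role of *_base_do_update_blocks(): return the unprocessed tail length */
static unsigned int toy_update_blocks(struct toy_state *st,
				      const uint8_t *data, unsigned int len)
{
	toy_transform(st, data, len / BLOCK_SIZE);
	return len % BLOCK_SIZE;
}

/* role of *_base_do_finup(): pad and absorb the final partial block */
static void toy_finup(struct toy_state *st, const uint8_t *data,
		      unsigned int len)
{
	uint8_t last[BLOCK_SIZE] = { 0 };

	memcpy(last, data, len);
	last[len] = 0x80;	/* toy padding, not SHA-512's length encoding */
	toy_transform(st, last, 1);
}

int main(void)
{
	static const char msg[] = "an input that is not a block multiple";
	unsigned int len = sizeof(msg) - 1;
	struct toy_state st = { 0 };
	unsigned int remain;

	/* in the kernel the crypto core, not the driver, buffers this tail */
	remain = toy_update_blocks(&st, (const uint8_t *)msg, len);
	toy_finup(&st, (const uint8_t *)msg + (len - remain), remain);
	printf("acc=%08x, finup handled a %u-byte tail\n",
	       (unsigned)st.acc, remain);
	return 0;
}

With that contract in place, each glue file only has to supply the block transform, which is why sha512_ce_transform can assume a non-zero block count (the while loop becomes a do/while) and why the arm64-specific partial-block and fallback handling disappears from every driver in this diff.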
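
The sm4-ce-glue.c MAC rework above follows the same idea for the three MAC templates: sm4_mac_update() now only feeds whole blocks to sm4_ce_mac_update() and returns len % SM4_BLOCK_SIZE, while the new finup callbacks XOR the final (possibly partial) block into the CBC-MAC state, apply the 0x80 pad and switch to the second derived constant when the message is not block-aligned (cmac/xcbc), or do one last block encryption only if a tail exists (cbcmac). That is also why the per-request len field can be dropped from sm4_mac_desc_ctx. The rough user-space model below mirrors only that bookkeeping; stub_encrypt() is a throwaway placeholder, not SM4, and all names here are invented for the sketch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLK 16

struct mac_ctx {
	uint8_t key[BLK];		/* stands in for the SM4 round keys */
	uint8_t consts[2 * BLK];	/* two derived constants, as in tctx->consts */
	uint8_t digest[BLK];		/* running CBC-MAC state */
};

static void stub_encrypt(uint8_t blk[BLK], const uint8_t key[BLK])
{
	for (int i = 0; i < BLK; i++)	/* toy permutation, NOT a real cipher */
		blk[i] = (uint8_t)((blk[i] ^ key[i]) + i * 7 + 1);
}

/* like sm4_mac_update(): whole blocks only, report the leftover bytes */
static unsigned int mac_update(struct mac_ctx *ctx, const uint8_t *p,
			       unsigned int len)
{
	for (unsigned int b = 0; b < len / BLK; b++) {
		for (int i = 0; i < BLK; i++)
			ctx->digest[i] ^= p[b * BLK + i];
		stub_encrypt(ctx->digest, ctx->key);
	}
	return len % BLK;
}

/* like sm4_cmac_finup(): pad, pick the constant, do the final encryption */
static void cmac_finup(struct mac_ctx *ctx, const uint8_t *src,
		       unsigned int len, uint8_t out[BLK])
{
	const uint8_t *consts = ctx->consts;	/* block-aligned message */

	for (unsigned int i = 0; i < len; i++)
		ctx->digest[i] ^= src[i];
	if (len != BLK) {
		ctx->digest[len] ^= 0x80;	/* 10* padding */
		consts += BLK;			/* padded final block */
	}
	for (int i = 0; i < BLK; i++)
		ctx->digest[i] ^= consts[i];
	stub_encrypt(ctx->digest, ctx->key);
	memcpy(out, ctx->digest, BLK);
}

int main(void)
{
	static const char msg[] = "not a multiple of 16, so finup pads";
	unsigned int len = sizeof(msg) - 1;
	struct mac_ctx ctx = { .key = { 1, 2, 3 },
			       .consts = { [0] = 0xaa, [BLK] = 0x55 } };
	uint8_t mac[BLK];
	unsigned int remain;

	remain = mac_update(&ctx, (const uint8_t *)msg, len);
	cmac_finup(&ctx, (const uint8_t *)msg + (len - remain), remain, mac);
	for (int i = 0; i < BLK; i++)
		printf("%02x", (unsigned)mac[i]);
	printf("\n");
	return 0;
}

The cbcmac case in the diff is the same minus the constants: if finup receives a non-empty tail it XORs it in and encrypts once more, otherwise the running digest is already the MAC.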