Diffstat (limited to 'arch/arm64/crypto')
-rw-r--r--   arch/arm64/crypto/Kconfig              |   53
-rw-r--r--   arch/arm64/crypto/Makefile             |   20
-rw-r--r--   arch/arm64/crypto/aes-glue.c           |  124
-rw-r--r--   arch/arm64/crypto/chacha-neon-core.S   |  805
-rw-r--r--   arch/arm64/crypto/chacha-neon-glue.c   |  237
-rw-r--r--   arch/arm64/crypto/ghash-ce-glue.c      |  143
-rw-r--r--   arch/arm64/crypto/poly1305-armv8.pl    |  917
-rw-r--r--   arch/arm64/crypto/poly1305-glue.c      |  232
-rw-r--r--   arch/arm64/crypto/polyval-ce-glue.c    |   73
-rw-r--r--   arch/arm64/crypto/sha1-ce-glue.c       |   70
-rw-r--r--   arch/arm64/crypto/sha2-ce-core.S       |  157
-rw-r--r--   arch/arm64/crypto/sha2-ce-glue.c       |  192
-rw-r--r--   arch/arm64/crypto/sha256-glue.c        |  194
-rw-r--r--   arch/arm64/crypto/sha3-ce-glue.c       |  111
-rw-r--r--   arch/arm64/crypto/sha512-armv8.pl      |  786
-rw-r--r--   arch/arm64/crypto/sha512-ce-glue.c     |   49
-rw-r--r--   arch/arm64/crypto/sha512-glue.c        |   35
-rw-r--r--   arch/arm64/crypto/sm3-ce-glue.c        |   48
-rw-r--r--   arch/arm64/crypto/sm3-neon-glue.c      |   48
-rw-r--r--   arch/arm64/crypto/sm4-ce-glue.c        |  100
20 files changed, 266 insertions, 4128 deletions
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 3418c8d3c78d..c44b0f202a1f 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -25,18 +25,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm64 using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_POLY1305_NEON
-	tristate
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_HASH
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-	help
-	  Poly1305 authenticator algorithm (RFC7539)
-
-	  Architecture: arm64 using:
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_SHA1_ARM64_CE
 	tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
@@ -48,25 +36,6 @@ config CRYPTO_SHA1_ARM64_CE
 	  Architecture: arm64 using:
 	  - ARMv8 Crypto Extensions
 
-config CRYPTO_SHA256_ARM64
-	tristate "Hash functions: SHA-224 and SHA-256"
-	select CRYPTO_HASH
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm64
-
-config CRYPTO_SHA2_ARM64_CE
-	tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_HASH
-	select CRYPTO_SHA256_ARM64
-	help
-	  SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-	  Architecture: arm64 using:
-	  - ARMv8 Crypto Extensions
-
 config CRYPTO_SHA512_ARM64
 	tristate "Hash functions: SHA-384 and SHA-512"
 	select CRYPTO_HASH
@@ -101,7 +70,7 @@ config CRYPTO_SM3_NEON
 	tristate "Hash functions: SM3 (NEON)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
@@ -112,7 +81,7 @@ config CRYPTO_SM3_ARM64_CE
 	tristate "Hash functions: SM3 (ARMv8.2 Crypto Extensions)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
-	select CRYPTO_SM3
+	select CRYPTO_LIB_SM3
 	help
 	  SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
 
@@ -143,7 +112,7 @@ config CRYPTO_AES_ARM64
 
 config CRYPTO_AES_ARM64_CE
 	tristate "Ciphers: AES (ARMv8 Crypto Extensions)"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON
 	select CRYPTO_ALGAPI
 	select CRYPTO_LIB_AES
 	help
@@ -186,20 +155,6 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	  Architecture: arm64 using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_CHACHA20_NEON
-	tristate
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_SKCIPHER
-	select CRYPTO_LIB_CHACHA_GENERIC
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-	help
-	  Length-preserving ciphers: ChaCha20, XChaCha20, and XChaCha12
-	  stream cipher algorithms
-
-	  Architecture: arm64 using:
-	  - NEON (Advanced SIMD) extensions
-
 config CRYPTO_AES_ARM64_BS
 	tristate "Ciphers: AES, modes: ECB/CBC/CTR/XCTR/XTS modes (bit-sliced NEON)"
 	depends on KERNEL_MODE_NEON
@@ -267,7 +222,7 @@ config CRYPTO_SM4_ARM64_NEON_BLK
 
 config CRYPTO_AES_ARM64_CE_CCM
 	tristate "AEAD cipher: AES in CCM mode (ARMv8 Crypto Extensions)"
-	depends on ARM64 && KERNEL_MODE_NEON
+	depends on KERNEL_MODE_NEON
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES_ARM64_CE
 	select CRYPTO_AES_ARM64_CE_BLK
diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
index e7139c4768ce..c231c980c514 100644
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -8,9 +8,6 @@
 obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
 sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
 
-obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
-sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
-
 obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
 sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
 
@@ -56,19 +53,9 @@ aes-ce-blk-y := aes-glue-ce.o 
aes-ce.o obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o aes-neon-blk-y := aes-glue-neon.o aes-neon.o -obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o -sha256-arm64-y := sha256-glue.o sha256-core.o - obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o -chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o - -obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o -poly1305-neon-y := poly1305-core.o poly1305-glue.o -AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64 - obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o @@ -81,10 +68,7 @@ aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o quiet_cmd_perlasm = PERLASM $@ cmd_perlasm = $(PERL) $(<) void $(@) -$(obj)/%-core.S: $(src)/%-armv8.pl - $(call cmd,perlasm) - -$(obj)/sha256-core.S: $(src)/sha512-armv8.pl +$(obj)/sha512-core.S: $(src)/../lib/crypto/sha2-armv8.pl $(call cmd,perlasm) -clean-files += poly1305-core.S sha256-core.S sha512-core.S +clean-files += sha512-core.S diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index b0150999743f..81560f722b9d 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -5,19 +5,20 @@ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> */ -#include <asm/neon.h> #include <asm/hwcap.h> -#include <asm/simd.h> +#include <asm/neon.h> #include <crypto/aes.h> #include <crypto/ctr.h> -#include <crypto/sha2.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> -#include <linux/module.h> -#include <linux/cpufeature.h> +#include <crypto/sha2.h> +#include <crypto/utils.h> #include <crypto/xts.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #include "aes-ce-setkey.h" @@ -130,7 +131,6 @@ struct mac_tfm_ctx { }; struct mac_desc_ctx { - unsigned int len; u8 dg[AES_BLOCK_SIZE]; }; @@ -869,109 +869,64 @@ static int mac_init(struct shash_desc *desc) struct mac_desc_ctx *ctx = shash_desc_ctx(desc); memset(ctx->dg, 0, AES_BLOCK_SIZE); - ctx->len = 0; - return 0; } static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, - u8 dg[], int enc_before, int enc_after) + u8 dg[], int enc_before) { int rounds = 6 + ctx->key_length / 4; + int rem; - if (crypto_simd_usable()) { - int rem; - - do { - kernel_neon_begin(); - rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, - dg, enc_before, enc_after); - kernel_neon_end(); - in += (blocks - rem) * AES_BLOCK_SIZE; - blocks = rem; - enc_before = 0; - } while (blocks); - } else { - if (enc_before) - aes_encrypt(ctx, dg, dg); - - while (blocks--) { - crypto_xor(dg, in, AES_BLOCK_SIZE); - in += AES_BLOCK_SIZE; - - if (blocks || enc_after) - aes_encrypt(ctx, dg, dg); - } - } + do { + kernel_neon_begin(); + rem = aes_mac_update(in, ctx->key_enc, rounds, blocks, + dg, enc_before, !enc_before); + kernel_neon_end(); + in += (blocks - rem) * AES_BLOCK_SIZE; + blocks = rem; + } while (blocks); } static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); + int blocks = len / AES_BLOCK_SIZE; - while (len > 0) { - unsigned int l; - - if ((ctx->len % AES_BLOCK_SIZE) == 0 && - (ctx->len + len) > AES_BLOCK_SIZE) { - - int blocks = len / 
AES_BLOCK_SIZE; - - len %= AES_BLOCK_SIZE; - - mac_do_update(&tctx->key, p, blocks, ctx->dg, - (ctx->len != 0), (len != 0)); - - p += blocks * AES_BLOCK_SIZE; - - if (!len) { - ctx->len = AES_BLOCK_SIZE; - break; - } - ctx->len = 0; - } - - l = min(len, AES_BLOCK_SIZE - ctx->len); - - if (l <= AES_BLOCK_SIZE) { - crypto_xor(ctx->dg + ctx->len, p, l); - ctx->len += l; - len -= l; - p += l; - } - } - - return 0; + len %= AES_BLOCK_SIZE; + mac_do_update(&tctx->key, p, blocks, ctx->dg, 0); + return len; } -static int cbcmac_final(struct shash_desc *desc, u8 *out) +static int cbcmac_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0); - + if (len) { + crypto_xor(ctx->dg, src, len); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1); + } memcpy(out, ctx->dg, AES_BLOCK_SIZE); - return 0; } -static int cmac_final(struct shash_desc *desc, u8 *out) +static int cmac_finup(struct shash_desc *desc, const u8 *src, unsigned int len, + u8 *out) { struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); u8 *consts = tctx->consts; - if (ctx->len != AES_BLOCK_SIZE) { - ctx->dg[ctx->len] ^= 0x80; + crypto_xor(ctx->dg, src, len); + if (len != AES_BLOCK_SIZE) { + ctx->dg[len] ^= 0x80; consts += AES_BLOCK_SIZE; } - - mac_do_update(&tctx->key, consts, 1, ctx->dg, 0, 1); - + mac_do_update(&tctx->key, consts, 1, ctx->dg, 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); - return 0; } @@ -979,6 +934,8 @@ static struct shash_alg mac_algs[] = { { .base.cra_name = "cmac(aes)", .base.cra_driver_name = "cmac-aes-" MODE, .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -987,13 +944,15 @@ static struct shash_alg mac_algs[] = { { .digestsize = AES_BLOCK_SIZE, .init = mac_init, .update = mac_update, - .final = cmac_final, + .finup = cmac_finup, .setkey = cmac_setkey, .descsize = sizeof(struct mac_desc_ctx), }, { .base.cra_name = "xcbc(aes)", .base.cra_driver_name = "xcbc-aes-" MODE, .base.cra_priority = PRIO, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx) + 2 * AES_BLOCK_SIZE, @@ -1002,21 +961,22 @@ static struct shash_alg mac_algs[] = { { .digestsize = AES_BLOCK_SIZE, .init = mac_init, .update = mac_update, - .final = cmac_final, + .finup = cmac_finup, .setkey = xcbc_setkey, .descsize = sizeof(struct mac_desc_ctx), }, { .base.cra_name = "cbcmac(aes)", .base.cra_driver_name = "cbcmac-aes-" MODE, .base.cra_priority = PRIO, - .base.cra_blocksize = 1, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, + .base.cra_blocksize = AES_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct mac_tfm_ctx), .base.cra_module = THIS_MODULE, .digestsize = AES_BLOCK_SIZE, .init = mac_init, .update = mac_update, - .final = cbcmac_final, + .finup = cbcmac_finup, .setkey = cbcmac_setkey, .descsize = sizeof(struct mac_desc_ctx), } }; diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S deleted file mode 100644 index b70ac76f2610..000000000000 --- a/arch/arm64/crypto/chacha-neon-core.S +++ /dev/null @@ -1,805 +0,0 @@ -/* - * ChaCha/XChaCha NEON helper functions - * - * Copyright (C) 2016-2018 Linaro, Ltd. 
<ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Originally based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/cache.h> - - .text - .align 6 - -/* - * chacha_permute - permute one block - * - * Permute one 64-byte block where the state matrix is stored in the four NEON - * registers v0-v3. It performs matrix operations on four words in parallel, - * but requires shuffling to rearrange the words after each round. - * - * The round count is given in w3. - * - * Clobbers: w3, x10, v4, v12 - */ -SYM_FUNC_START_LOCAL(chacha_permute) - - adr_l x10, ROT8 - ld1 {v12.4s}, [x10] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs w3, w3, #2 - b.ne .Ldoubleround - - ret -SYM_FUNC_END(chacha_permute) - -SYM_FUNC_START(chacha_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - // w3: nrounds - - stp x29, x30, [sp, #-16]! 
- mov x29, sp - - // x0..3 = s0..3 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - - bl chacha_permute - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(chacha_block_xor_neon) - -SYM_FUNC_START(hchacha_block_neon) - // x0: Input state matrix, s - // x1: output (8 32-bit words) - // w2: nrounds - - stp x29, x30, [sp, #-16]! - mov x29, sp - - ld1 {v0.4s-v3.4s}, [x0] - - mov w3, w2 - bl chacha_permute - - st1 {v0.4s}, [x1], #16 - st1 {v3.4s}, [x1] - - ldp x29, x30, [sp], #16 - ret -SYM_FUNC_END(hchacha_block_neon) - - a0 .req w12 - a1 .req w13 - a2 .req w14 - a3 .req w15 - a4 .req w16 - a5 .req w17 - a6 .req w19 - a7 .req w20 - a8 .req w21 - a9 .req w22 - a10 .req w23 - a11 .req w24 - a12 .req w25 - a13 .req w26 - a14 .req w27 - a15 .req w28 - - .align 6 -SYM_FUNC_START(chacha_4block_xor_neon) - frame_push 10 - - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - // w3: nrounds - // x4: byte count - - adr_l x10, .Lpermute - and x5, x4, #63 - add x10, x10, x5 - - // - // This function encrypts four consecutive ChaCha blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - // At the same time, a fifth block is encrypted in parallel using - // scalar registers - // - adr_l x9, CTRINC // ... 
and ROT8 - ld1 {v30.4s-v31.4s}, [x9] - - // x0..15[0-3] = s0..3[0..3] - add x8, x0, #16 - ld4r { v0.4s- v3.4s}, [x0] - ld4r { v4.4s- v7.4s}, [x8], #16 - ld4r { v8.4s-v11.4s}, [x8], #16 - ld4r {v12.4s-v15.4s}, [x8] - - mov a0, v0.s[0] - mov a1, v1.s[0] - mov a2, v2.s[0] - mov a3, v3.s[0] - mov a4, v4.s[0] - mov a5, v5.s[0] - mov a6, v6.s[0] - mov a7, v7.s[0] - mov a8, v8.s[0] - mov a9, v9.s[0] - mov a10, v10.s[0] - mov a11, v11.s[0] - mov a12, v12.s[0] - mov a13, v13.s[0] - mov a14, v14.s[0] - mov a15, v15.s[0] - - // x12 += counter values 1-4 - add v12.4s, v12.4s, v30.4s - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - rev32 v15.8h, v15.8h - ror a15, a15, #16 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #12 - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - - sri v4.4s, v16.4s, #20 - ror a4, a4, #20 - sri v5.4s, v17.4s, #20 - ror a5, a5, #20 - sri v6.4s, v18.4s, #20 - ror a6, a6, #20 - sri v7.4s, v19.4s, #20 - ror a7, a7, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add a0, a0, a4 - add v1.4s, v1.4s, v5.4s - add a1, a1, a5 - add v2.4s, v2.4s, v6.4s - add a2, a2, a6 - add v3.4s, v3.4s, v7.4s - add a3, a3, a7 - - eor v12.16b, v12.16b, v0.16b - eor a12, a12, a0 - eor v13.16b, v13.16b, v1.16b - eor a13, a13, a1 - eor v14.16b, v14.16b, v2.16b - eor a14, a14, a2 - eor v15.16b, v15.16b, v3.16b - eor a15, a15, a3 - - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add a8, a8, a12 - add v9.4s, v9.4s, v13.4s - add a9, a9, a13 - add v10.4s, v10.4s, v14.4s - add a10, a10, a14 - add v11.4s, v11.4s, v15.4s - add a11, a11, a15 - - eor v16.16b, v4.16b, v8.16b - eor a4, a4, a8 - eor v17.16b, v5.16b, v9.16b - eor a5, a5, a9 - eor v18.16b, v6.16b, v10.16b - eor a6, a6, a10 - eor v19.16b, v7.16b, v11.16b - eor a7, a7, a11 - - shl v4.4s, v16.4s, #7 - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - - sri 
v4.4s, v16.4s, #25 - ror a4, a4, #25 - sri v5.4s, v17.4s, #25 - ror a5, a5, #25 - sri v6.4s, v18.4s, #25 - ror a6, a6, #25 - sri v7.4s, v19.4s, #25 - ror a7, a7, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - rev32 v15.8h, v15.8h - ror a15, a15, #16 - rev32 v12.8h, v12.8h - ror a12, a12, #16 - rev32 v13.8h, v13.8h - ror a13, a13, #16 - rev32 v14.8h, v14.8h - ror a14, a14, #16 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #12 - shl v6.4s, v17.4s, #12 - shl v7.4s, v18.4s, #12 - shl v4.4s, v19.4s, #12 - - sri v5.4s, v16.4s, #20 - ror a5, a5, #20 - sri v6.4s, v17.4s, #20 - ror a6, a6, #20 - sri v7.4s, v18.4s, #20 - ror a7, a7, #20 - sri v4.4s, v19.4s, #20 - ror a4, a4, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add a0, a0, a5 - add v1.4s, v1.4s, v6.4s - add a1, a1, a6 - add v2.4s, v2.4s, v7.4s - add a2, a2, a7 - add v3.4s, v3.4s, v4.4s - add a3, a3, a4 - - eor v15.16b, v15.16b, v0.16b - eor a15, a15, a0 - eor v12.16b, v12.16b, v1.16b - eor a12, a12, a1 - eor v13.16b, v13.16b, v2.16b - eor a13, a13, a2 - eor v14.16b, v14.16b, v3.16b - eor a14, a14, a3 - - tbl v15.16b, {v15.16b}, v31.16b - ror a15, a15, #24 - tbl v12.16b, {v12.16b}, v31.16b - ror a12, a12, #24 - tbl v13.16b, {v13.16b}, v31.16b - ror a13, a13, #24 - tbl v14.16b, {v14.16b}, v31.16b - ror a14, a14, #24 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add a10, a10, a15 - add v11.4s, v11.4s, v12.4s - add a11, a11, a12 - add v8.4s, v8.4s, v13.4s - add a8, a8, a13 - add v9.4s, v9.4s, v14.4s - add a9, a9, a14 - - eor v16.16b, v5.16b, v10.16b - eor a5, a5, a10 - eor v17.16b, v6.16b, v11.16b - eor a6, a6, a11 - eor v18.16b, v7.16b, v8.16b - eor a7, a7, a8 - eor v19.16b, v4.16b, v9.16b - eor a4, a4, a9 - - shl v5.4s, v16.4s, #7 - shl v6.4s, v17.4s, #7 - shl v7.4s, v18.4s, #7 - shl v4.4s, v19.4s, #7 - - sri v5.4s, v16.4s, #25 - ror a5, a5, #25 - sri v6.4s, v17.4s, #25 - ror a6, a6, #25 - sri v7.4s, v18.4s, #25 - ror a7, a7, #25 - sri v4.4s, v19.4s, #25 - ror a4, a4, #25 - - subs w3, w3, #2 - b.ne .Ldoubleround4 - - ld4r {v16.4s-v19.4s}, [x0], #16 - ld4r {v20.4s-v23.4s}, [x0], #16 - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] 
- add v0.4s, v0.4s, v16.4s - mov w6, v16.s[0] - mov w7, v17.s[0] - add v1.4s, v1.4s, v17.4s - mov w8, v18.s[0] - mov w9, v19.s[0] - add v2.4s, v2.4s, v18.4s - add a0, a0, w6 - add a1, a1, w7 - add v3.4s, v3.4s, v19.4s - add a2, a2, w8 - add a3, a3, w9 -CPU_BE( rev a0, a0 ) -CPU_BE( rev a1, a1 ) -CPU_BE( rev a2, a2 ) -CPU_BE( rev a3, a3 ) - - ld4r {v24.4s-v27.4s}, [x0], #16 - ld4r {v28.4s-v31.4s}, [x0] - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - add v4.4s, v4.4s, v20.4s - mov w6, v20.s[0] - mov w7, v21.s[0] - add v5.4s, v5.4s, v21.4s - mov w8, v22.s[0] - mov w9, v23.s[0] - add v6.4s, v6.4s, v22.4s - add a4, a4, w6 - add a5, a5, w7 - add v7.4s, v7.4s, v23.4s - add a6, a6, w8 - add a7, a7, w9 -CPU_BE( rev a4, a4 ) -CPU_BE( rev a5, a5 ) -CPU_BE( rev a6, a6 ) -CPU_BE( rev a7, a7 ) - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - add v8.4s, v8.4s, v24.4s - mov w6, v24.s[0] - mov w7, v25.s[0] - add v9.4s, v9.4s, v25.4s - mov w8, v26.s[0] - mov w9, v27.s[0] - add v10.4s, v10.4s, v26.4s - add a8, a8, w6 - add a9, a9, w7 - add v11.4s, v11.4s, v27.4s - add a10, a10, w8 - add a11, a11, w9 -CPU_BE( rev a8, a8 ) -CPU_BE( rev a9, a9 ) -CPU_BE( rev a10, a10 ) -CPU_BE( rev a11, a11 ) - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - add v12.4s, v12.4s, v28.4s - mov w6, v28.s[0] - mov w7, v29.s[0] - add v13.4s, v13.4s, v29.4s - mov w8, v30.s[0] - mov w9, v31.s[0] - add v14.4s, v14.4s, v30.4s - add a12, a12, w6 - add a13, a13, w7 - add v15.4s, v15.4s, v31.4s - add a14, a14, w8 - add a15, a15, w9 -CPU_BE( rev a12, a12 ) -CPU_BE( rev a13, a13 ) -CPU_BE( rev a14, a14 ) -CPU_BE( rev a15, a15 ) - - // interleave 32-bit words in state n, n+1 - ldp w6, w7, [x2], #64 - zip1 v16.4s, v0.4s, v1.4s - ldp w8, w9, [x2, #-56] - eor a0, a0, w6 - zip2 v17.4s, v0.4s, v1.4s - eor a1, a1, w7 - zip1 v18.4s, v2.4s, v3.4s - eor a2, a2, w8 - zip2 v19.4s, v2.4s, v3.4s - eor a3, a3, w9 - ldp w6, w7, [x2, #-48] - zip1 v20.4s, v4.4s, v5.4s - ldp w8, w9, [x2, #-40] - eor a4, a4, w6 - zip2 v21.4s, v4.4s, v5.4s - eor a5, a5, w7 - zip1 v22.4s, v6.4s, v7.4s - eor a6, a6, w8 - zip2 v23.4s, v6.4s, v7.4s - eor a7, a7, w9 - ldp w6, w7, [x2, #-32] - zip1 v24.4s, v8.4s, v9.4s - ldp w8, w9, [x2, #-24] - eor a8, a8, w6 - zip2 v25.4s, v8.4s, v9.4s - eor a9, a9, w7 - zip1 v26.4s, v10.4s, v11.4s - eor a10, a10, w8 - zip2 v27.4s, v10.4s, v11.4s - eor a11, a11, w9 - ldp w6, w7, [x2, #-16] - zip1 v28.4s, v12.4s, v13.4s - ldp w8, w9, [x2, #-8] - eor a12, a12, w6 - zip2 v29.4s, v12.4s, v13.4s - eor a13, a13, w7 - zip1 v30.4s, v14.4s, v15.4s - eor a14, a14, w8 - zip2 v31.4s, v14.4s, v15.4s - eor a15, a15, w9 - - add x3, x2, x4 - sub x3, x3, #128 // start of last block - - subs x5, x4, #128 - csel x2, x2, x3, ge - - // interleave 64-bit words in state n, n+2 - zip1 v0.2d, v16.2d, v18.2d - zip2 v4.2d, v16.2d, v18.2d - stp a0, a1, [x1], #64 - zip1 v8.2d, v17.2d, v19.2d - zip2 v12.2d, v17.2d, v19.2d - stp a2, a3, [x1, #-56] - - subs x6, x4, #192 - ld1 {v16.16b-v19.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v1.2d, v20.2d, v22.2d - zip2 v5.2d, v20.2d, v22.2d - stp a4, a5, [x1, #-48] - zip1 v9.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - stp a6, a7, [x1, #-40] - - subs x7, x4, #256 - ld1 {v20.16b-v23.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v2.2d, v24.2d, v26.2d - zip2 v6.2d, v24.2d, v26.2d - stp a8, a9, [x1, #-32] - zip1 v10.2d, v25.2d, v27.2d - zip2 v14.2d, v25.2d, v27.2d - stp a10, a11, [x1, #-24] - - subs 
x8, x4, #320 - ld1 {v24.16b-v27.16b}, [x2], #64 - csel x2, x2, x3, ge - - zip1 v3.2d, v28.2d, v30.2d - zip2 v7.2d, v28.2d, v30.2d - stp a12, a13, [x1, #-16] - zip1 v11.2d, v29.2d, v31.2d - zip2 v15.2d, v29.2d, v31.2d - stp a14, a15, [x1, #-8] - - tbnz x5, #63, .Lt128 - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, v17.16b, v1.16b - eor v18.16b, v18.16b, v2.16b - eor v19.16b, v19.16b, v3.16b - - tbnz x6, #63, .Lt192 - - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v6.16b - eor v23.16b, v23.16b, v7.16b - - st1 {v16.16b-v19.16b}, [x1], #64 - tbnz x7, #63, .Lt256 - - eor v24.16b, v24.16b, v8.16b - eor v25.16b, v25.16b, v9.16b - eor v26.16b, v26.16b, v10.16b - eor v27.16b, v27.16b, v11.16b - - st1 {v20.16b-v23.16b}, [x1], #64 - tbnz x8, #63, .Lt320 - - eor v28.16b, v28.16b, v12.16b - eor v29.16b, v29.16b, v13.16b - eor v30.16b, v30.16b, v14.16b - eor v31.16b, v31.16b, v15.16b - - st1 {v24.16b-v27.16b}, [x1], #64 - st1 {v28.16b-v31.16b}, [x1] - -.Lout: frame_pop - ret - - // fewer than 192 bytes of in/output -.Lt192: cbz x5, 1f // exactly 128 bytes? - ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - tbl v28.16b, {v4.16b-v7.16b}, v28.16b - tbl v29.16b, {v4.16b-v7.16b}, v29.16b - tbl v30.16b, {v4.16b-v7.16b}, v30.16b - tbl v31.16b, {v4.16b-v7.16b}, v31.16b - -0: eor v20.16b, v20.16b, v28.16b - eor v21.16b, v21.16b, v29.16b - eor v22.16b, v22.16b, v30.16b - eor v23.16b, v23.16b, v31.16b - st1 {v20.16b-v23.16b}, [x5] // overlapping stores -1: st1 {v16.16b-v19.16b}, [x1] - b .Lout - - // fewer than 128 bytes of in/output -.Lt128: ld1 {v28.16b-v31.16b}, [x10] - add x5, x5, x1 - sub x1, x1, #64 - tbl v28.16b, {v0.16b-v3.16b}, v28.16b - tbl v29.16b, {v0.16b-v3.16b}, v29.16b - tbl v30.16b, {v0.16b-v3.16b}, v30.16b - tbl v31.16b, {v0.16b-v3.16b}, v31.16b - ld1 {v16.16b-v19.16b}, [x1] // reload first output block - b 0b - - // fewer than 256 bytes of in/output -.Lt256: cbz x6, 2f // exactly 192 bytes? - ld1 {v4.16b-v7.16b}, [x10] - add x6, x6, x1 - tbl v0.16b, {v8.16b-v11.16b}, v4.16b - tbl v1.16b, {v8.16b-v11.16b}, v5.16b - tbl v2.16b, {v8.16b-v11.16b}, v6.16b - tbl v3.16b, {v8.16b-v11.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x6] // overlapping stores -2: st1 {v20.16b-v23.16b}, [x1] - b .Lout - - // fewer than 320 bytes of in/output -.Lt320: cbz x7, 3f // exactly 256 bytes? 
- ld1 {v4.16b-v7.16b}, [x10] - add x7, x7, x1 - tbl v0.16b, {v12.16b-v15.16b}, v4.16b - tbl v1.16b, {v12.16b-v15.16b}, v5.16b - tbl v2.16b, {v12.16b-v15.16b}, v6.16b - tbl v3.16b, {v12.16b-v15.16b}, v7.16b - - eor v28.16b, v28.16b, v0.16b - eor v29.16b, v29.16b, v1.16b - eor v30.16b, v30.16b, v2.16b - eor v31.16b, v31.16b, v3.16b - st1 {v28.16b-v31.16b}, [x7] // overlapping stores -3: st1 {v24.16b-v27.16b}, [x1] - b .Lout -SYM_FUNC_END(chacha_4block_xor_neon) - - .section ".rodata", "a", %progbits - .align L1_CACHE_SHIFT -.Lpermute: - .set .Li, 0 - .rept 128 - .byte (.Li - 64) - .set .Li, .Li + 1 - .endr - -CTRINC: .word 1, 2, 3, 4 -ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c deleted file mode 100644 index 229876acfc58..000000000000 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers, - * including ChaCha20 (RFC7539) - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <crypto/algapi.h> -#include <crypto/internal/chacha.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> -#include <linux/jump_label.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src, - int nrounds); -asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, - int nrounds, int bytes); -asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, - int bytes, int nrounds) -{ - while (bytes > 0) { - int l = min(bytes, CHACHA_BLOCK_SIZE * 5); - - if (l <= CHACHA_BLOCK_SIZE) { - u8 buf[CHACHA_BLOCK_SIZE]; - - memcpy(buf, src, l); - chacha_block_xor_neon(state, buf, buf, nrounds); - memcpy(dst, buf, l); - state[12] += 1; - break; - } - chacha_4block_xor_neon(state, dst, src, nrounds, l); - bytes -= l; - src += l; - dst += l; - state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE); - } -} - -void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) -{ - if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) { - hchacha_block_generic(state, stream, nrounds); - } else { - kernel_neon_begin(); - hchacha_block_neon(state, stream, nrounds); - kernel_neon_end(); - } -} -EXPORT_SYMBOL(hchacha_block_arch); - -void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, - int nrounds) -{ - if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE || - !crypto_simd_usable()) - return chacha_crypt_generic(state, dst, src, bytes, nrounds); - - do { - unsigned int todo = min_t(unsigned int, bytes, SZ_4K); - - kernel_neon_begin(); - chacha_doneon(state, dst, src, todo, 
nrounds); - kernel_neon_end(); - - bytes -= todo; - src += todo; - dst += todo; - } while (bytes); -} -EXPORT_SYMBOL(chacha_crypt_arch); - -static int chacha_neon_stream_xor(struct skcipher_request *req, - const struct chacha_ctx *ctx, const u8 *iv) -{ - struct skcipher_walk walk; - u32 state[16]; - int err; - - err = skcipher_walk_virt(&walk, req, false); - - chacha_init(state, ctx->key, iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = rounddown(nbytes, walk.stride); - - if (!static_branch_likely(&have_neon) || - !crypto_simd_usable()) { - chacha_crypt_generic(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, - ctx->nrounds); - } else { - kernel_neon_begin(); - chacha_doneon(state, walk.dst.virt.addr, - walk.src.virt.addr, nbytes, ctx->nrounds); - kernel_neon_end(); - } - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static int chacha_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - - return chacha_neon_stream_xor(req, ctx, req->iv); -} - -static int xchacha_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct chacha_ctx subctx; - u32 state[16]; - u8 real_iv[16]; - - chacha_init(state, ctx->key, req->iv); - hchacha_block_arch(state, subctx.key, ctx->nrounds); - subctx.nrounds = ctx->nrounds; - - memcpy(&real_iv[0], req->iv + 24, 8); - memcpy(&real_iv[8], req->iv + 16, 8); - return chacha_neon_stream_xor(req, &subctx, real_iv); -} - -static struct skcipher_alg algs[] = { - { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = CHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = chacha_neon, - .decrypt = chacha_neon, - }, { - .base.cra_name = "xchacha20", - .base.cra_driver_name = "xchacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = chacha20_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - }, { - .base.cra_name = "xchacha12", - .base.cra_driver_name = "xchacha12-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA_KEY_SIZE, - .max_keysize = CHACHA_KEY_SIZE, - .ivsize = XCHACHA_IV_SIZE, - .chunksize = CHACHA_BLOCK_SIZE, - .walksize = 5 * CHACHA_BLOCK_SIZE, - .setkey = chacha12_setkey, - .encrypt = xchacha_neon, - .decrypt = xchacha_neon, - } -}; - -static int __init chacha_simd_mod_init(void) -{ - if (!cpu_have_named_feature(ASIMD)) - return 0; - - static_branch_enable(&have_neon); - - return IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) ? 
- crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0; -} - -static void __exit chacha_simd_mod_fini(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_SKCIPHER) && cpu_have_named_feature(ASIMD)) - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); -} - -module_init(chacha_simd_mod_init); -module_exit(chacha_simd_mod_fini); - -MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); -MODULE_ALIAS_CRYPTO("chacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha20"); -MODULE_ALIAS_CRYPTO("xchacha20-neon"); -MODULE_ALIAS_CRYPTO("xchacha12"); -MODULE_ALIAS_CRYPTO("xchacha12-neon"); diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 071e122f9c37..4995b6e22335 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -6,30 +6,27 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/aes.h> -#include <crypto/gcm.h> -#include <crypto/algapi.h> #include <crypto/b128ops.h> +#include <crypto/gcm.h> +#include <crypto/ghash.h> #include <crypto/gf128mul.h> #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/errno.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> MODULE_DESCRIPTION("GHASH and AES-GCM using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("ghash"); -#define GHASH_BLOCK_SIZE 16 -#define GHASH_DIGEST_SIZE 16 - #define RFC4106_NONCE_SIZE 4 struct ghash_key { @@ -37,10 +34,8 @@ struct ghash_key { u64 h[][2]; }; -struct ghash_desc_ctx { +struct arm_ghash_desc_ctx { u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; - u8 buf[GHASH_BLOCK_SIZE]; - u32 count; }; struct gcm_aes_ctx { @@ -65,36 +60,12 @@ asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[], static int ghash_init(struct shash_desc *desc) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); - *ctx = (struct ghash_desc_ctx){}; + *ctx = (struct arm_ghash_desc_ctx){}; return 0; } -static void ghash_do_update(int blocks, u64 dg[], const char *src, - struct ghash_key *key, const char *head) -{ - be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) }; - - do { - const u8 *in = src; - - if (head) { - in = head; - blocks++; - head = NULL; - } else { - src += GHASH_BLOCK_SIZE; - } - - crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE); - gf128mul_lle(&dst, &key->k); - } while (--blocks); - - dg[0] = be64_to_cpu(dst.b); - dg[1] = be64_to_cpu(dst.a); -} - static __always_inline void ghash_do_simd_update(int blocks, u64 dg[], const char *src, struct ghash_key *key, const char *head, @@ -103,13 +74,9 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src, u64 const h[][2], const char *head)) { - if (likely(crypto_simd_usable())) { - kernel_neon_begin(); - simd_update(blocks, dg, src, key->h, head); - kernel_neon_end(); - } else { - ghash_do_update(blocks, dg, src, key, head); - } + kernel_neon_begin(); + simd_update(blocks, dg, src, key->h, head); + kernel_neon_end(); } /* avoid hogging the CPU for too long */ @@ -118,61 +85,59 @@ void ghash_do_simd_update(int blocks, u64 dg[], const char *src, static 
int ghash_update(struct shash_desc *desc, const u8 *src, unsigned int len) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + int blocks; - ctx->count += len; + blocks = len / GHASH_BLOCK_SIZE; + len -= blocks * GHASH_BLOCK_SIZE; - if ((partial + len) >= GHASH_BLOCK_SIZE) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); - int blocks; - - if (partial) { - int p = GHASH_BLOCK_SIZE - partial; + do { + int chunk = min(blocks, MAX_BLOCKS); - memcpy(ctx->buf + partial, src, p); - src += p; - len -= p; - } + ghash_do_simd_update(chunk, ctx->digest, src, key, NULL, + pmull_ghash_update_p8); + blocks -= chunk; + src += chunk * GHASH_BLOCK_SIZE; + } while (unlikely(blocks > 0)); + return len; +} - blocks = len / GHASH_BLOCK_SIZE; - len %= GHASH_BLOCK_SIZE; +static int ghash_export(struct shash_desc *desc, void *out) +{ + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + u8 *dst = out; - do { - int chunk = min(blocks, MAX_BLOCKS); + put_unaligned_be64(ctx->digest[1], dst); + put_unaligned_be64(ctx->digest[0], dst + 8); + return 0; +} - ghash_do_simd_update(chunk, ctx->digest, src, key, - partial ? ctx->buf : NULL, - pmull_ghash_update_p8); +static int ghash_import(struct shash_desc *desc, const void *in) +{ + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + const u8 *src = in; - blocks -= chunk; - src += chunk * GHASH_BLOCK_SIZE; - partial = 0; - } while (unlikely(blocks > 0)); - } - if (len) - memcpy(ctx->buf + partial, src, len); + ctx->digest[1] = get_unaligned_be64(src); + ctx->digest[0] = get_unaligned_be64(src + 8); return 0; } -static int ghash_final(struct shash_desc *desc, u8 *dst) +static int ghash_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { - struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; - - if (partial) { - struct ghash_key *key = crypto_shash_ctx(desc->tfm); + struct arm_ghash_desc_ctx *ctx = shash_desc_ctx(desc); + struct ghash_key *key = crypto_shash_ctx(desc->tfm); - memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); + if (len) { + u8 buf[GHASH_BLOCK_SIZE] = {}; - ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL, + memcpy(buf, src, len); + ghash_do_simd_update(1, ctx->digest, src, key, NULL, pmull_ghash_update_p8); + memzero_explicit(buf, sizeof(buf)); } - put_unaligned_be64(ctx->digest[1], dst); - put_unaligned_be64(ctx->digest[0], dst + 8); - - memzero_explicit(ctx, sizeof(*ctx)); - return 0; + return ghash_export(desc, dst); } static void ghash_reflect(u64 h[], const be128 *k) @@ -205,6 +170,7 @@ static struct shash_alg ghash_alg = { .base.cra_name = "ghash", .base.cra_driver_name = "ghash-neon", .base.cra_priority = 150, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = GHASH_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]), .base.cra_module = THIS_MODULE, @@ -212,9 +178,12 @@ static struct shash_alg ghash_alg = { .digestsize = GHASH_DIGEST_SIZE, .init = ghash_init, .update = ghash_update, - .final = ghash_final, + .finup = ghash_finup, .setkey = ghash_setkey, - .descsize = sizeof(struct ghash_desc_ctx), + .export = ghash_export, + .import = ghash_import, + .descsize = sizeof(struct arm_ghash_desc_ctx), + .statesize = sizeof(struct ghash_desc_ctx), }; static int num_rounds(struct crypto_aes_ctx *ctx) diff --git 
a/arch/arm64/crypto/poly1305-armv8.pl b/arch/arm64/crypto/poly1305-armv8.pl deleted file mode 100644 index 22c9069c0650..000000000000 --- a/arch/arm64/crypto/poly1305-armv8.pl +++ /dev/null @@ -1,917 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -# project. -# ==================================================================== -# -# This module implements Poly1305 hash for ARMv8. -# -# June 2015 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc-4.9 NEON -# -# Apple A7 1.86/+5% 0.72 -# Cortex-A53 2.69/+58% 1.47 -# Cortex-A57 2.70/+7% 1.14 -# Denver 1.64/+50% 1.18(*) -# X-Gene 2.13/+68% 2.27 -# Mongoose 1.77/+75% 1.12 -# Kryo 2.70/+55% 1.13 -# ThunderX2 1.17/+95% 1.36 -# -# (*) estimate based on resources availability is less than 1.0, -# i.e. measured result is worse than expected, presumably binary -# translator is not almighty; - -$flavour=shift; -$output=shift; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open STDOUT,"| \"$^X\" $xlate $flavour $output"; -} else { - open STDOUT,">$output"; -} - -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); - -my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -.extern OPENSSL_armcap_P -#endif - -.text - -// forward "declarations" are required for Apple -.globl poly1305_blocks -.globl poly1305_emit - -.globl poly1305_init -.type poly1305_init,%function -.align 5 -poly1305_init: - cmp $inp,xzr - stp xzr,xzr,[$ctx] // zero hash value - stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - - csel x0,xzr,x0,eq - b.eq .Lno_key - -#ifndef __KERNEL__ - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] -#endif - - ldp $r0,$r1,[$inp] // load key - mov $s1,#0xfffffffc0fffffff - movk $s1,#0x0fff,lsl#48 -#ifdef __AARCH64EB__ - rev $r0,$r0 // flip bytes - rev $r1,$r1 -#endif - and $r0,$r0,$s1 // &=0ffffffc0fffffff - and $s1,$s1,#-4 - and $r1,$r1,$s1 // &=0ffffffc0ffffffc - mov w#$s1,#-1 - stp $r0,$r1,[$ctx,#32] // save key value - str w#$s1,[$ctx,#48] // impossible key power value - -#ifndef __KERNEL__ - tst w17,#ARMV7_NEON - - adr $d0,.Lpoly1305_blocks - adr $r0,.Lpoly1305_blocks_neon - adr $d1,.Lpoly1305_emit - - csel $d0,$d0,$r0,eq - -# ifdef __ILP32__ - stp w#$d0,w#$d1,[$len] -# else - stp $d0,$d1,[$len] -# endif -#endif - mov x0,#1 -.Lno_key: - ret -.size poly1305_init,.-poly1305_init - -.type poly1305_blocks,%function -.align 5 -poly1305_blocks: -.Lpoly1305_blocks: - ands $len,$len,#-16 - b.eq .Lno_data - - ldp $h0,$h1,[$ctx] // load hash value - ldp $h2,x17,[$ctx,#16] // [along with is_base2_26] - ldp $r0,$r1,[$ctx,#32] // load key value - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp x17,#0 // is_base2_26? 
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - -.Loop: - ldp $t0,$t1,[$inp],#16 // load input - sub $len,$len,#16 -#ifdef __AARCH64EB__ - rev $t0,$t0 - rev $t1,$t1 -#endif - adds $h0,$h0,$t0 // accumulate input - adcs $h1,$h1,$t1 - - mul $d0,$h0,$r0 // h0*r0 - adc $h2,$h2,$padbit - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - cbnz $len,.Loop - - stp $h0,$h1,[$ctx] // store hash value - stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26] - -.Lno_data: - ret -.size poly1305_blocks,.-poly1305_blocks - -.type poly1305_emit,%function -.align 5 -poly1305_emit: -.Lpoly1305_emit: - ldp $h0,$h1,[$ctx] // load hash base 2^64 - ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26] - ldp $t0,$t1,[$nonce] // load nonce - -#ifdef __AARCH64EB__ - lsr $d0,$h0,#32 - mov w#$d1,w#$h0 - lsr $d2,$h1,#32 - mov w15,w#$h1 - lsr x16,$h2,#32 -#else - mov w#$d0,w#$h0 - lsr $d1,$h0,#32 - mov w#$d2,w#$h1 - lsr x15,$h1,#32 - mov w16,w#$h2 -#endif - - add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64 - lsr $d1,$d2,#12 - adds $d0,$d0,$d2,lsl#52 - add $d1,$d1,x15,lsl#14 - adc $d1,$d1,xzr - lsr $d2,x16,#24 - adds $d1,$d1,x16,lsl#40 - adc $d2,$d2,xzr - - cmp $r0,#0 // is_base2_26? - csel $h0,$h0,$d0,eq // choose between radixes - csel $h1,$h1,$d1,eq - csel $h2,$h2,$d2,eq - - adds $d0,$h0,#5 // compare to modulus - adcs $d1,$h1,xzr - adc $d2,$h2,xzr - - tst $d2,#-4 // see if it's carried/borrowed - - csel $h0,$h0,$d0,eq - csel $h1,$h1,$d1,eq - -#ifdef __AARCH64EB__ - ror $t0,$t0,#32 // flip nonce words - ror $t1,$t1,#32 -#endif - adds $h0,$h0,$t0 // accumulate nonce - adc $h1,$h1,$t1 -#ifdef __AARCH64EB__ - rev $h0,$h0 // flip output bytes - rev $h1,$h1 -#endif - stp $h0,$h1,[$mac] // write result - - ret -.size poly1305_emit,.-poly1305_emit -___ -my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8)); -my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13)); -my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18)); -my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23)); -my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); -my ($T0,$T1,$MASK) = map("v$_",(29..31)); - -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow - -$code.=<<___; -.type poly1305_mult,%function -.align 5 -poly1305_mult: - mul $d0,$h0,$r0 // h0*r0 - umulh $d1,$h0,$r0 - - mul $t0,$h1,$s1 // h1*5*r1 - umulh $t1,$h1,$s1 - - adds $d0,$d0,$t0 - mul $t0,$h0,$r1 // h0*r1 - adc $d1,$d1,$t1 - umulh $d2,$h0,$r1 - - adds $d1,$d1,$t0 - mul $t0,$h1,$r0 // h1*r0 - adc $d2,$d2,xzr - umulh $t1,$h1,$r0 - - adds $d1,$d1,$t0 - mul $t0,$h2,$s1 // h2*5*r1 - adc $d2,$d2,$t1 - mul $t1,$h2,$r0 // h2*r0 - - adds $d1,$d1,$t0 - adc $d2,$d2,$t1 - - and $t0,$d2,#-4 // final reduction - and $h2,$d2,#3 - add $t0,$t0,$d2,lsr#2 - adds $h0,$d0,$t0 - adcs $h1,$d1,xzr - adc $h2,$h2,xzr - - ret -.size poly1305_mult,.-poly1305_mult - -.type poly1305_splat,%function -.align 4 -poly1305_splat: - and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x13,$h0,#26,#26 - extr x14,$h1,$h0,#52 
- and x14,x14,#0x03ffffff - ubfx x15,$h1,#14,#26 - extr x16,$h2,$h1,#40 - - str w12,[$ctx,#16*0] // r0 - add w12,w13,w13,lsl#2 // r1*5 - str w13,[$ctx,#16*1] // r1 - add w13,w14,w14,lsl#2 // r2*5 - str w12,[$ctx,#16*2] // s1 - str w14,[$ctx,#16*3] // r2 - add w14,w15,w15,lsl#2 // r3*5 - str w13,[$ctx,#16*4] // s2 - str w15,[$ctx,#16*5] // r3 - add w15,w16,w16,lsl#2 // r4*5 - str w14,[$ctx,#16*6] // s3 - str w16,[$ctx,#16*7] // r4 - str w15,[$ctx,#16*8] // s4 - - ret -.size poly1305_splat,.-poly1305_splat - -#ifdef __KERNEL__ -.globl poly1305_blocks_neon -#endif -.type poly1305_blocks_neon,%function -.align 5 -poly1305_blocks_neon: -.Lpoly1305_blocks_neon: - ldr $is_base2_26,[$ctx,#24] - cmp $len,#128 - b.lo .Lpoly1305_blocks - - .inst 0xd503233f // paciasp - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - cbz $is_base2_26,.Lbase2_64_neon - - ldp w10,w11,[$ctx] // load hash value base 2^26 - ldp w12,w13,[$ctx,#8] - ldr w14,[$ctx,#16] - - tst $len,#31 - b.eq .Leven_neon - - ldp $r0,$r1,[$ctx,#32] // load key value - - add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 - lsr $h1,x12,#12 - adds $h0,$h0,x12,lsl#52 - add $h1,$h1,x13,lsl#14 - adc $h1,$h1,xzr - lsr $h2,x14,#24 - adds $h1,$h1,x14,lsl#40 - adc $d2,$h2,xzr // can be partially reduced... - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - b .Leven_neon - -.align 4 -.Lbase2_64_neon: - ldp $r0,$r1,[$ctx,#32] // load key value - - ldp $h0,$h1,[$ctx] // load hash value base 2^64 - ldr $h2,[$ctx,#16] - - tst $len,#31 - b.eq .Linit_neon - - ldp $d0,$d1,[$inp],#16 // load input - sub $len,$len,#16 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) -#ifdef __AARCH64EB__ - rev $d0,$d0 - rev $d1,$d1 -#endif - adds $h0,$h0,$d0 // accumulate input - adcs $h1,$h1,$d1 - adc $h2,$h2,$padbit - - bl poly1305_mult - -.Linit_neon: - ldr w17,[$ctx,#48] // first table element - and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 - ubfx x11,$h0,#26,#26 - extr x12,$h1,$h0,#52 - and x12,x12,#0x03ffffff - ubfx x13,$h1,#14,#26 - extr x14,$h2,$h1,#40 - - cmp w17,#-1 // is value impossible? 
- b.ne .Leven_neon - - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - - ////////////////////////////////// initialize r^n table - mov $h0,$r0 // r^1 - add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) - mov $h1,$r1 - mov $h2,xzr - add $ctx,$ctx,#48+12 - bl poly1305_splat - - bl poly1305_mult // r^2 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^3 - sub $ctx,$ctx,#4 - bl poly1305_splat - - bl poly1305_mult // r^4 - sub $ctx,$ctx,#4 - bl poly1305_splat - sub $ctx,$ctx,#48 // restore original $ctx - b .Ldo_neon - -.align 4 -.Leven_neon: - fmov ${H0},x10 - fmov ${H1},x11 - fmov ${H2},x12 - fmov ${H3},x13 - fmov ${H4},x14 - -.Ldo_neon: - ldp x8,x12,[$inp,#32] // inp[2:3] - subs $len,$len,#64 - ldp x9,x13,[$inp,#48] - add $in2,$inp,#96 - adrp $zeros,.Lzeros - add $zeros,$zeros,#:lo12:.Lzeros - - lsl $padbit,$padbit,#24 - add x15,$ctx,#48 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN23_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN23_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - fmov $IN23_2,x8 - fmov $IN23_3,x10 - fmov $IN23_4,x12 - - ldp x8,x12,[$inp],#16 // inp[0:1] - ldp x9,x13,[$inp],#48 - - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 {$S4},[x15] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - and x5,x9,#0x03ffffff - ubfx x6,x8,#26,#26 - ubfx x7,x9,#26,#26 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - extr x8,x12,x8,#52 - extr x9,x13,x9,#52 - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - fmov $IN01_0,x4 - and x8,x8,#0x03ffffff - and x9,x9,#0x03ffffff - ubfx x10,x12,#14,#26 - ubfx x11,x13,#14,#26 - add x12,$padbit,x12,lsr#40 - add x13,$padbit,x13,lsr#40 - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - fmov $IN01_1,x6 - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - movi $MASK.2d,#-1 - fmov $IN01_2,x8 - fmov $IN01_3,x10 - fmov $IN01_4,x12 - ushr $MASK.2d,$MASK.2d,#38 - - b.ls .Lskip_loop - -.align 4 -.Loop_neon: - //////////////////////////////////////////////////////////////// - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r - // \___________________/ - // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 - // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r - // \___________________/ \____________________/ - // - // Note that we start with inp[2:3]*r^2. This is because it - // doesn't depend on reduction in previous iteration. 
- //////////////////////////////////////////////////////////////// - // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 - // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 - // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 - // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 - // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 - - subs $len,$len,#64 - umull $ACC4,$IN23_0,${R4}[2] - csel $in2,$zeros,$in2,lo - umull $ACC3,$IN23_0,${R3}[2] - umull $ACC2,$IN23_0,${R2}[2] - ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) - umull $ACC1,$IN23_0,${R1}[2] - ldp x9,x13,[$in2],#48 - umull $ACC0,$IN23_0,${R0}[2] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - umlal $ACC4,$IN23_1,${R3}[2] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC3,$IN23_1,${R2}[2] - and x5,x9,#0x03ffffff - umlal $ACC2,$IN23_1,${R1}[2] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN23_1,${R0}[2] - ubfx x7,x9,#26,#26 - umlal $ACC0,$IN23_1,${S4}[2] - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - - umlal $ACC4,$IN23_2,${R2}[2] - extr x8,x12,x8,#52 - umlal $ACC3,$IN23_2,${R1}[2] - extr x9,x13,x9,#52 - umlal $ACC2,$IN23_2,${R0}[2] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC1,$IN23_2,${S4}[2] - fmov $IN23_0,x4 - umlal $ACC0,$IN23_2,${S3}[2] - and x8,x8,#0x03ffffff - - umlal $ACC4,$IN23_3,${R1}[2] - and x9,x9,#0x03ffffff - umlal $ACC3,$IN23_3,${R0}[2] - ubfx x10,x12,#14,#26 - umlal $ACC2,$IN23_3,${S4}[2] - ubfx x11,x13,#14,#26 - umlal $ACC1,$IN23_3,${S3}[2] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC0,$IN23_3,${S2}[2] - fmov $IN23_1,x6 - - add $IN01_2,$IN01_2,$H2 - add x12,$padbit,x12,lsr#40 - umlal $ACC4,$IN23_4,${R0}[2] - add x13,$padbit,x13,lsr#40 - umlal $ACC3,$IN23_4,${S4}[2] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal $ACC2,$IN23_4,${S3}[2] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN23_4,${S2}[2] - fmov $IN23_2,x8 - umlal $ACC0,$IN23_4,${S1}[2] - fmov $IN23_3,x10 - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4 and accumulate - - add $IN01_0,$IN01_0,$H0 - fmov $IN23_4,x12 - umlal $ACC3,$IN01_2,${R1}[0] - ldp x8,x12,[$inp],#16 // inp[0:1] - umlal $ACC0,$IN01_2,${S3}[0] - ldp x9,x13,[$inp],#48 - umlal $ACC4,$IN01_2,${R2}[0] - umlal $ACC1,$IN01_2,${S4}[0] - umlal $ACC2,$IN01_2,${R0}[0] -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 -#endif - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3}[0] - umlal $ACC4,$IN01_0,${R4}[0] - and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 - umlal $ACC2,$IN01_0,${R2}[0] - and x5,x9,#0x03ffffff - umlal $ACC0,$IN01_0,${R0}[0] - ubfx x6,x8,#26,#26 - umlal $ACC1,$IN01_0,${R1}[0] - ubfx x7,x9,#26,#26 - - add $IN01_3,$IN01_3,$H3 - add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 - umlal $ACC3,$IN01_1,${R2}[0] - extr x8,x12,x8,#52 - umlal $ACC4,$IN01_1,${R3}[0] - extr x9,x13,x9,#52 - umlal $ACC0,$IN01_1,${S4}[0] - add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 - umlal $ACC2,$IN01_1,${R1}[0] - fmov $IN01_0,x4 - umlal $ACC1,$IN01_1,${R0}[0] - and x8,x8,#0x03ffffff - - add $IN01_4,$IN01_4,$H4 - and x9,x9,#0x03ffffff - umlal $ACC3,$IN01_3,${R0}[0] - ubfx x10,x12,#14,#26 - umlal $ACC0,$IN01_3,${S2}[0] - ubfx x11,x13,#14,#26 - umlal $ACC4,$IN01_3,${R1}[0] - add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 - umlal $ACC1,$IN01_3,${S3}[0] - fmov $IN01_1,x6 - umlal $ACC2,$IN01_3,${S4}[0] - add x12,$padbit,x12,lsr#40 - - umlal $ACC3,$IN01_4,${S4}[0] - add x13,$padbit,x13,lsr#40 - umlal $ACC0,$IN01_4,${S1}[0] - add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 - umlal 
$ACC4,$IN01_4,${R0}[0] - add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 - umlal $ACC1,$IN01_4,${S2}[0] - fmov $IN01_2,x8 - umlal $ACC2,$IN01_4,${S3}[0] - fmov $IN01_3,x10 - fmov $IN01_4,x12 - - ///////////////////////////////////////////////////////////////// - // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein - // and P. Schwabe - // - // [see discussion in poly1305-armv4 module] - - ushr $T0.2d,$ACC3,#26 - xtn $H3,$ACC3 - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - bic $H3,#0xfc,lsl#24 // &=0x03ffffff - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - xtn $H4,$ACC4 - ushr $T1.2d,$ACC1,#26 - xtn $H1,$ACC1 - bic $H4,#0xfc,lsl#24 - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - shrn $T1.2s,$ACC2,#26 - xtn $H2,$ACC2 - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - bic $H1,#0xfc,lsl#24 - add $H3,$H3,$T1.2s // h2 -> h3 - bic $H2,#0xfc,lsl#24 - - shrn $T0.2s,$ACC0,#26 - xtn $H0,$ACC0 - ushr $T1.2s,$H3,#26 - bic $H3,#0xfc,lsl#24 - bic $H0,#0xfc,lsl#24 - add $H1,$H1,$T0.2s // h0 -> h1 - add $H4,$H4,$T1.2s // h3 -> h4 - - b.hi .Loop_neon - -.Lskip_loop: - dup $IN23_2,${IN23_2}[0] - add $IN01_2,$IN01_2,$H2 - - //////////////////////////////////////////////////////////////// - // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 - - adds $len,$len,#32 - b.ne .Long_tail - - dup $IN23_2,${IN01_2}[0] - add $IN23_0,$IN01_0,$H0 - add $IN23_3,$IN01_3,$H3 - add $IN23_1,$IN01_1,$H1 - add $IN23_4,$IN01_4,$H4 - -.Long_tail: - dup $IN23_0,${IN23_0}[0] - umull2 $ACC0,$IN23_2,${S3} - umull2 $ACC3,$IN23_2,${R1} - umull2 $ACC4,$IN23_2,${R2} - umull2 $ACC2,$IN23_2,${R0} - umull2 $ACC1,$IN23_2,${S4} - - dup $IN23_1,${IN23_1}[0] - umlal2 $ACC0,$IN23_0,${R0} - umlal2 $ACC2,$IN23_0,${R2} - umlal2 $ACC3,$IN23_0,${R3} - umlal2 $ACC4,$IN23_0,${R4} - umlal2 $ACC1,$IN23_0,${R1} - - dup $IN23_3,${IN23_3}[0] - umlal2 $ACC0,$IN23_1,${S4} - umlal2 $ACC3,$IN23_1,${R2} - umlal2 $ACC2,$IN23_1,${R1} - umlal2 $ACC4,$IN23_1,${R3} - umlal2 $ACC1,$IN23_1,${R0} - - dup $IN23_4,${IN23_4}[0] - umlal2 $ACC3,$IN23_3,${R0} - umlal2 $ACC4,$IN23_3,${R1} - umlal2 $ACC0,$IN23_3,${S2} - umlal2 $ACC1,$IN23_3,${S3} - umlal2 $ACC2,$IN23_3,${S4} - - umlal2 $ACC3,$IN23_4,${S4} - umlal2 $ACC0,$IN23_4,${S1} - umlal2 $ACC4,$IN23_4,${R0} - umlal2 $ACC1,$IN23_4,${S2} - umlal2 $ACC2,$IN23_4,${S3} - - b.eq .Lshort_tail - - //////////////////////////////////////////////////////////////// - // (hash+inp[0:1])*r^4:r^3 and accumulate - - add $IN01_0,$IN01_0,$H0 - umlal $ACC3,$IN01_2,${R1} - umlal $ACC0,$IN01_2,${S3} - umlal $ACC4,$IN01_2,${R2} - umlal $ACC1,$IN01_2,${S4} - umlal $ACC2,$IN01_2,${R0} - - add $IN01_1,$IN01_1,$H1 - umlal $ACC3,$IN01_0,${R3} - umlal $ACC0,$IN01_0,${R0} - umlal $ACC4,$IN01_0,${R4} - umlal $ACC1,$IN01_0,${R1} - umlal $ACC2,$IN01_0,${R2} - - add $IN01_3,$IN01_3,$H3 - umlal $ACC3,$IN01_1,${R2} - umlal $ACC0,$IN01_1,${S4} - umlal $ACC4,$IN01_1,${R3} - umlal $ACC1,$IN01_1,${R0} - umlal $ACC2,$IN01_1,${R1} - - add $IN01_4,$IN01_4,$H4 - umlal $ACC3,$IN01_3,${R0} - umlal $ACC0,$IN01_3,${S2} - umlal $ACC4,$IN01_3,${R1} - umlal $ACC1,$IN01_3,${S3} - umlal $ACC2,$IN01_3,${S4} - - umlal $ACC3,$IN01_4,${S4} - umlal $ACC0,$IN01_4,${S1} - umlal $ACC4,$IN01_4,${R0} - umlal $ACC1,$IN01_4,${S2} - umlal $ACC2,$IN01_4,${S3} - -.Lshort_tail: - //////////////////////////////////////////////////////////////// - // horizontal add - - addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements - addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] - addp 
$ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] - addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] - addp $ACC2,$ACC2,$ACC2 - ldr x30,[sp,#8] - - //////////////////////////////////////////////////////////////// - // lazy reduction, but without narrowing - - ushr $T0.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - ushr $T1.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - - add $ACC4,$ACC4,$T0.2d // h3 -> h4 - add $ACC1,$ACC1,$T1.2d // h0 -> h1 - - ushr $T0.2d,$ACC4,#26 - and $ACC4,$ACC4,$MASK.2d - ushr $T1.2d,$ACC1,#26 - and $ACC1,$ACC1,$MASK.2d - add $ACC2,$ACC2,$T1.2d // h1 -> h2 - - add $ACC0,$ACC0,$T0.2d - shl $T0.2d,$T0.2d,#2 - ushr $T1.2d,$ACC2,#26 - and $ACC2,$ACC2,$MASK.2d - add $ACC0,$ACC0,$T0.2d // h4 -> h0 - add $ACC3,$ACC3,$T1.2d // h2 -> h3 - - ushr $T0.2d,$ACC0,#26 - and $ACC0,$ACC0,$MASK.2d - ushr $T1.2d,$ACC3,#26 - and $ACC3,$ACC3,$MASK.2d - add $ACC1,$ACC1,$T0.2d // h0 -> h1 - add $ACC4,$ACC4,$T1.2d // h3 -> h4 - - //////////////////////////////////////////////////////////////// - // write the result, can be partially reduced - - st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 - mov x4,#1 - st1 {$ACC4}[0],[$ctx] - str x4,[$ctx,#8] // set is_base2_26 - - ldr x29,[sp],#80 - .inst 0xd50323bf // autiasp - ret -.size poly1305_blocks_neon,.-poly1305_blocks_neon - -.pushsection .rodata -.align 5 -.Lzeros: -.long 0,0,0,0,0,0,0,0 -.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" -.popsection - -.align 2 -#if !defined(__KERNEL__) && !defined(_WIN64) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -___ - -foreach (split("\n",$code)) { - s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or - s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or - (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or - (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or - (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or - (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or - (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1)); - - s/\.[124]([sd])\[/.$1\[/; - s/w#x([0-9]+)/w$1/g; - - print $_,"\n"; -} -close STDOUT; diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c deleted file mode 100644 index 18883ea438f3..000000000000 --- a/arch/arm64/crypto/poly1305-glue.c +++ /dev/null @@ -1,232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64 - * - * Copyright (C) 2019 Linaro Ltd. 
<ard.biesheuvel@linaro.org> - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> -#include <crypto/algapi.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/poly1305.h> -#include <crypto/internal/simd.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/jump_label.h> -#include <linux/module.h> - -asmlinkage void poly1305_init_arm64(void *state, const u8 *key); -asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit); -asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce); - -static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); - -void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE]) -{ - poly1305_init_arm64(&dctx->h, key); - dctx->s[0] = get_unaligned_le32(key + 16); - dctx->s[1] = get_unaligned_le32(key + 20); - dctx->s[2] = get_unaligned_le32(key + 24); - dctx->s[3] = get_unaligned_le32(key + 28); - dctx->buflen = 0; -} -EXPORT_SYMBOL(poly1305_init_arch); - -static int neon_poly1305_init(struct shash_desc *desc) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - dctx->buflen = 0; - dctx->rset = 0; - dctx->sset = false; - - return 0; -} - -static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, - u32 len, u32 hibit, bool do_neon) -{ - if (unlikely(!dctx->sset)) { - if (!dctx->rset) { - poly1305_init_arm64(&dctx->h, src); - src += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - dctx->rset = 1; - } - if (len >= POLY1305_BLOCK_SIZE) { - dctx->s[0] = get_unaligned_le32(src + 0); - dctx->s[1] = get_unaligned_le32(src + 4); - dctx->s[2] = get_unaligned_le32(src + 8); - dctx->s[3] = get_unaligned_le32(src + 12); - src += POLY1305_BLOCK_SIZE; - len -= POLY1305_BLOCK_SIZE; - dctx->sset = true; - } - if (len < POLY1305_BLOCK_SIZE) - return; - } - - len &= ~(POLY1305_BLOCK_SIZE - 1); - - if (static_branch_likely(&have_neon) && likely(do_neon)) - poly1305_blocks_neon(&dctx->h, src, len, hibit); - else - poly1305_blocks(&dctx->h, src, len, hibit); -} - -static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx, - const u8 *src, u32 len, bool do_neon) -{ - if (unlikely(dctx->buflen)) { - u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen); - - memcpy(dctx->buf + dctx->buflen, src, bytes); - src += bytes; - len -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - neon_poly1305_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE, 1, false); - dctx->buflen = 0; - } - } - - if (likely(len >= POLY1305_BLOCK_SIZE)) { - neon_poly1305_blocks(dctx, src, len, 1, do_neon); - src += round_down(len, POLY1305_BLOCK_SIZE); - len %= POLY1305_BLOCK_SIZE; - } - - if (unlikely(len)) { - dctx->buflen = len; - memcpy(dctx->buf, src, len); - } -} - -static int neon_poly1305_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - bool do_neon = crypto_simd_usable() && srclen > 128; - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (static_branch_likely(&have_neon) && do_neon) - kernel_neon_begin(); - neon_poly1305_do_update(dctx, src, srclen, do_neon); - if (static_branch_likely(&have_neon) && do_neon) - kernel_neon_end(); - return 0; -} - -void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, - unsigned int nbytes) -{ - if (unlikely(dctx->buflen)) { - u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen); - - memcpy(dctx->buf + 
dctx->buflen, src, bytes); - src += bytes; - nbytes -= bytes; - dctx->buflen += bytes; - - if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1); - dctx->buflen = 0; - } - } - - if (likely(nbytes >= POLY1305_BLOCK_SIZE)) { - unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE); - - if (static_branch_likely(&have_neon) && crypto_simd_usable()) { - do { - unsigned int todo = min_t(unsigned int, len, SZ_4K); - - kernel_neon_begin(); - poly1305_blocks_neon(&dctx->h, src, todo, 1); - kernel_neon_end(); - - len -= todo; - src += todo; - } while (len); - } else { - poly1305_blocks(&dctx->h, src, len, 1); - src += len; - } - nbytes %= POLY1305_BLOCK_SIZE; - } - - if (unlikely(nbytes)) { - dctx->buflen = nbytes; - memcpy(dctx->buf, src, nbytes); - } -} -EXPORT_SYMBOL(poly1305_update_arch); - -void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst) -{ - if (unlikely(dctx->buflen)) { - dctx->buf[dctx->buflen++] = 1; - memset(dctx->buf + dctx->buflen, 0, - POLY1305_BLOCK_SIZE - dctx->buflen); - poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0); - } - - poly1305_emit(&dctx->h, dst, dctx->s); - memzero_explicit(dctx, sizeof(*dctx)); -} -EXPORT_SYMBOL(poly1305_final_arch); - -static int neon_poly1305_final(struct shash_desc *desc, u8 *dst) -{ - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - - if (unlikely(!dctx->sset)) - return -ENOKEY; - - poly1305_final_arch(dctx, dst); - return 0; -} - -static struct shash_alg neon_poly1305_alg = { - .init = neon_poly1305_init, - .update = neon_poly1305_update, - .final = neon_poly1305_final, - .digestsize = POLY1305_DIGEST_SIZE, - .descsize = sizeof(struct poly1305_desc_ctx), - - .base.cra_name = "poly1305", - .base.cra_driver_name = "poly1305-neon", - .base.cra_priority = 200, - .base.cra_blocksize = POLY1305_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}; - -static int __init neon_poly1305_mod_init(void) -{ - if (!cpu_have_named_feature(ASIMD)) - return 0; - - static_branch_enable(&have_neon); - - return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? - crypto_register_shash(&neon_poly1305_alg) : 0; -} - -static void __exit neon_poly1305_mod_exit(void) -{ - if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD)) - crypto_unregister_shash(&neon_poly1305_alg); -} - -module_init(neon_poly1305_mod_init); -module_exit(neon_poly1305_mod_exit); - -MODULE_DESCRIPTION("Poly1305 transform using NEON instructions"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("poly1305"); -MODULE_ALIAS_CRYPTO("poly1305-neon"); diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c index 0a3b5718df85..c4e653688ea0 100644 --- a/arch/arm64/crypto/polyval-ce-glue.c +++ b/arch/arm64/crypto/polyval-ce-glue.c @@ -15,17 +15,15 @@ * ARMv8 Crypto Extensions instructions to implement the finite field operations. 
*/ -#include <crypto/algapi.h> +#include <asm/neon.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/polyval.h> -#include <linux/crypto.h> -#include <linux/init.h> +#include <crypto/utils.h> +#include <linux/cpufeature.h> +#include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/cpufeature.h> -#include <asm/neon.h> -#include <asm/simd.h> +#include <linux/string.h> #define NUM_KEY_POWERS 8 @@ -38,7 +36,6 @@ struct polyval_tfm_ctx { struct polyval_desc_ctx { u8 buffer[POLYVAL_BLOCK_SIZE]; - u32 bytes; }; asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, @@ -48,25 +45,16 @@ asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); static void internal_polyval_update(const struct polyval_tfm_ctx *keys, const u8 *in, size_t nblocks, u8 *accumulator) { - if (likely(crypto_simd_usable())) { - kernel_neon_begin(); - pmull_polyval_update(keys, in, nblocks, accumulator); - kernel_neon_end(); - } else { - polyval_update_non4k(keys->key_powers[NUM_KEY_POWERS-1], in, - nblocks, accumulator); - } + kernel_neon_begin(); + pmull_polyval_update(keys, in, nblocks, accumulator); + kernel_neon_end(); } static void internal_polyval_mul(u8 *op1, const u8 *op2) { - if (likely(crypto_simd_usable())) { - kernel_neon_begin(); - pmull_polyval_mul(op1, op2); - kernel_neon_end(); - } else { - polyval_mul_non4k(op1, op2); - } + kernel_neon_begin(); + pmull_polyval_mul(op1, op2); + kernel_neon_end(); } static int polyval_arm64_setkey(struct crypto_shash *tfm, @@ -103,49 +91,27 @@ static int polyval_arm64_update(struct shash_desc *desc, { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - u8 *pos; unsigned int nblocks; - unsigned int n; - - if (dctx->bytes) { - n = min(srclen, dctx->bytes); - pos = dctx->buffer + POLYVAL_BLOCK_SIZE - dctx->bytes; - - dctx->bytes -= n; - srclen -= n; - while (n--) - *pos++ ^= *src++; - - if (!dctx->bytes) - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - while (srclen >= POLYVAL_BLOCK_SIZE) { + do { /* allow rescheduling every 4K bytes */ nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; internal_polyval_update(tctx, src, nblocks, dctx->buffer); srclen -= nblocks * POLYVAL_BLOCK_SIZE; src += nblocks * POLYVAL_BLOCK_SIZE; - } + } while (srclen >= POLYVAL_BLOCK_SIZE); - if (srclen) { - dctx->bytes = POLYVAL_BLOCK_SIZE - srclen; - pos = dctx->buffer; - while (srclen--) - *pos++ ^= *src++; - } - - return 0; + return srclen; } -static int polyval_arm64_final(struct shash_desc *desc, u8 *dst) +static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *dst) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - if (dctx->bytes) { + if (len) { + crypto_xor(dctx->buffer, src, len); internal_polyval_mul(dctx->buffer, tctx->key_powers[NUM_KEY_POWERS-1]); } @@ -159,13 +125,14 @@ static struct shash_alg polyval_alg = { .digestsize = POLYVAL_DIGEST_SIZE, .init = polyval_arm64_init, .update = polyval_arm64_update, - .final = polyval_arm64_final, + .finup = polyval_arm64_finup, .setkey = polyval_arm64_setkey, .descsize = sizeof(struct polyval_desc_ctx), .base = { .cra_name = "polyval", .cra_driver_name = "polyval-ce", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .cra_blocksize = POLYVAL_BLOCK_SIZE, .cra_ctxsize = sizeof(struct polyval_tfm_ctx), .cra_module = THIS_MODULE, diff 
--git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index cbd14f208f83..65b6980817e5 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -7,14 +7,14 @@ #include <asm/neon.h> #include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <crypto/sha1.h> #include <crypto/sha1_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); @@ -56,79 +56,49 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - if (!crypto_simd_usable()) - return crypto_sha1_update(desc, data, len); - sctx->finalize = 0; - sha1_base_do_update(desc, data, len, sha1_ce_transform); - - return 0; + return sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); } static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE) && len; - - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, data, len, out); + bool finalized = false; /* * Allow the asm code to perform the finalization if there is no * partial data and the input is a round multiple of the block size. */ - sctx->finalize = finalize; - - sha1_base_do_update(desc, data, len, sha1_ce_transform); - if (!finalize) - sha1_base_do_finalize(desc, sha1_ce_transform); - return sha1_base_finish(desc, out); -} - -static int sha1_ce_final(struct shash_desc *desc, u8 *out) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) - return crypto_sha1_finup(desc, NULL, 0, out); - - sctx->finalize = 0; - sha1_base_do_finalize(desc, sha1_ce_transform); + if (len >= SHA1_BLOCK_SIZE) { + unsigned int remain = len - round_down(len, SHA1_BLOCK_SIZE); + + finalized = !remain; + sctx->finalize = finalized; + sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform); + data += len - remain; + len = remain; + } + if (!finalized) { + sctx->finalize = 0; + sha1_base_do_finup(desc, data, len, sha1_ce_transform); + } return sha1_base_finish(desc, out); } -static int sha1_ce_export(struct shash_desc *desc, void *out) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(out, &sctx->sst, sizeof(struct sha1_state)); - return 0; -} - -static int sha1_ce_import(struct shash_desc *desc, const void *in) -{ - struct sha1_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(&sctx->sst, in, sizeof(struct sha1_state)); - sctx->finalize = 0; - return 0; -} - static struct shash_alg alg = { .init = sha1_base_init, .update = sha1_ce_update, - .final = sha1_ce_final, .finup = sha1_ce_finup, - .import = sha1_ce_import, - .export = sha1_ce_export, .descsize = sizeof(struct sha1_ce_state), - .statesize = sizeof(struct sha1_state), + .statesize = SHA1_STATE_SIZE, .digestsize = SHA1_DIGEST_SIZE, .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", .cra_priority = 200, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, } diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S deleted file mode 100644 index fce84d88ddb2..000000000000 --- a/arch/arm64/crypto/sha2-ce-core.S +++ /dev/null 
@@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions - * - * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> - - .text - .arch armv8-a+crypto - - dga .req q20 - dgav .req v20 - dgb .req q21 - dgbv .req v21 - - t0 .req v22 - t1 .req v23 - - dg0q .req q24 - dg0v .req v24 - dg1q .req q25 - dg1v .req v25 - dg2q .req q26 - dg2v .req v26 - - .macro add_only, ev, rc, s0 - mov dg2v.16b, dg0v.16b - .ifeq \ev - add t1.4s, v\s0\().4s, \rc\().4s - sha256h dg0q, dg1q, t0.4s - sha256h2 dg1q, dg2q, t0.4s - .else - .ifnb \s0 - add t0.4s, v\s0\().4s, \rc\().4s - .endif - sha256h dg0q, dg1q, t1.4s - sha256h2 dg1q, dg2q, t1.4s - .endif - .endm - - .macro add_update, ev, rc, s0, s1, s2, s3 - sha256su0 v\s0\().4s, v\s1\().4s - add_only \ev, \rc, \s1 - sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s - .endm - - /* - * The SHA-256 round constants - */ - .section ".rodata", "a" - .align 4 -.Lsha2_rcon: - .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - - /* - * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src, - * int blocks) - */ - .text -SYM_FUNC_START(__sha256_ce_transform) - /* load round constants */ - adr_l x8, .Lsha2_rcon - ld1 { v0.4s- v3.4s}, [x8], #64 - ld1 { v4.4s- v7.4s}, [x8], #64 - ld1 { v8.4s-v11.4s}, [x8], #64 - ld1 {v12.4s-v15.4s}, [x8] - - /* load state */ - ld1 {dgav.4s, dgbv.4s}, [x0] - - /* load sha256_ce_state::finalize */ - ldr_l w4, sha256_ce_offsetof_finalize, x4 - ldr w4, [x0, x4] - - /* load input */ -0: ld1 {v16.4s-v19.4s}, [x1], #64 - sub w2, w2, #1 - -CPU_LE( rev32 v16.16b, v16.16b ) -CPU_LE( rev32 v17.16b, v17.16b ) -CPU_LE( rev32 v18.16b, v18.16b ) -CPU_LE( rev32 v19.16b, v19.16b ) - -1: add t0.4s, v16.4s, v0.4s - mov dg0v.16b, dgav.16b - mov dg1v.16b, dgbv.16b - - add_update 0, v1, 16, 17, 18, 19 - add_update 1, v2, 17, 18, 19, 16 - add_update 0, v3, 18, 19, 16, 17 - add_update 1, v4, 19, 16, 17, 18 - - add_update 0, v5, 16, 17, 18, 19 - add_update 1, v6, 17, 18, 19, 16 - add_update 0, v7, 18, 19, 16, 17 - add_update 1, v8, 19, 16, 17, 18 - - add_update 0, v9, 16, 17, 18, 19 - add_update 1, v10, 17, 18, 19, 16 - add_update 0, v11, 18, 19, 16, 17 - add_update 1, v12, 19, 16, 17, 18 - - add_only 0, v13, 17 - add_only 1, v14, 18 - add_only 0, v15, 19 - add_only 1 - - /* update state */ - add dgav.4s, dgav.4s, dg0v.4s - add dgbv.4s, dgbv.4s, dg1v.4s - - /* handled all input blocks? */ - cbz w2, 2f - cond_yield 3f, x5, x6 - b 0b - - /* - * Final block: add padding and total bit count. 
- * Skip if the input size was not a round multiple of the block size, - * the padding is handled by the C code in that case. - */ -2: cbz x4, 3f - ldr_l w4, sha256_ce_offsetof_count, x4 - ldr x4, [x0, x4] - movi v17.2d, #0 - mov x8, #0x80000000 - movi v18.2d, #0 - ror x7, x4, #29 // ror(lsl(x4, 3), 32) - fmov d16, x8 - mov x4, #0 - mov v19.d[0], xzr - mov v19.d[1], x7 - b 1b - - /* store new state */ -3: st1 {dgav.4s, dgbv.4s}, [x0] - mov w0, w2 - ret -SYM_FUNC_END(__sha256_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c deleted file mode 100644 index 6b4866a88ded..000000000000 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions - * - * Copyright (C) 2014 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> - */ - -#include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/cpufeature.h> -#include <linux/crypto.h> -#include <linux/module.h> - -MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha256"); - -struct sha256_ce_state { - struct sha256_state sst; - u32 finalize; -}; - -extern const u32 sha256_ce_offsetof_count; -extern const u32 sha256_ce_offsetof_finalize; - -asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src, - int blocks); - -static void sha256_ce_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - while (blocks) { - int rem; - - kernel_neon_begin(); - rem = __sha256_ce_transform(container_of(sst, - struct sha256_ce_state, - sst), src, blocks); - kernel_neon_end(); - src += (blocks - rem) * SHA256_BLOCK_SIZE; - blocks = rem; - } -} - -const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state, - sst.count); -const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state, - finalize); - -asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks); - -static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - sha256_block_data_order(sst->state, src, blocks); -} - -static int sha256_ce_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) - return sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - - sctx->finalize = 0; - sha256_base_do_update(desc, data, len, sha256_ce_transform); - - return 0; -} - -static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE) && len; - - if (!crypto_simd_usable()) { - if (len) - sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - sha256_base_do_finalize(desc, sha256_arm64_transform); - return sha256_base_finish(desc, out); - } - - /* - * Allow the asm code to perform the finalization if there is no - * partial data and the input is a round multiple of the block size. 
- */ - sctx->finalize = finalize; - - sha256_base_do_update(desc, data, len, sha256_ce_transform); - if (!finalize) - sha256_base_do_finalize(desc, sha256_ce_transform); - return sha256_base_finish(desc, out); -} - -static int sha256_ce_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) { - sha256_base_do_finalize(desc, sha256_arm64_transform); - return sha256_base_finish(desc, out); - } - - sctx->finalize = 0; - sha256_base_do_finalize(desc, sha256_ce_transform); - return sha256_base_finish(desc, out); -} - -static int sha256_ce_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - sha256_base_init(desc); - return sha256_ce_finup(desc, data, len, out); -} - -static int sha256_ce_export(struct shash_desc *desc, void *out) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(out, &sctx->sst, sizeof(struct sha256_state)); - return 0; -} - -static int sha256_ce_import(struct shash_desc *desc, const void *in) -{ - struct sha256_ce_state *sctx = shash_desc_ctx(desc); - - memcpy(&sctx->sst, in, sizeof(struct sha256_state)); - sctx->finalize = 0; - return 0; -} - -static struct shash_alg algs[] = { { - .init = sha224_base_init, - .update = sha256_ce_update, - .final = sha256_ce_final, - .finup = sha256_ce_finup, - .export = sha256_ce_export, - .import = sha256_ce_import, - .descsize = sizeof(struct sha256_ce_state), - .statesize = sizeof(struct sha256_state), - .digestsize = SHA224_DIGEST_SIZE, - .base = { - .cra_name = "sha224", - .cra_driver_name = "sha224-ce", - .cra_priority = 200, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}, { - .init = sha256_base_init, - .update = sha256_ce_update, - .final = sha256_ce_final, - .finup = sha256_ce_finup, - .digest = sha256_ce_digest, - .export = sha256_ce_export, - .import = sha256_ce_import, - .descsize = sizeof(struct sha256_ce_state), - .statesize = sizeof(struct sha256_state), - .digestsize = SHA256_DIGEST_SIZE, - .base = { - .cra_name = "sha256", - .cra_driver_name = "sha256-ce", - .cra_priority = 200, - .cra_blocksize = SHA256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -} }; - -static int __init sha2_ce_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha2_ce_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA2, sha2_ce_mod_init); -module_exit(sha2_ce_mod_fini); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c deleted file mode 100644 index 35356987cc1e..000000000000 --- a/arch/arm64/crypto/sha256-glue.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Linux/arm64 port of the OpenSSL SHA256 implementation for AArch64 - * - * Copyright (c) 2016 Linaro Ltd. 
<ard.biesheuvel@linaro.org> - */ - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> -#include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> -#include <crypto/sha2.h> -#include <crypto/sha256_base.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/types.h> - -MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash for arm64"); -MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha224"); -MODULE_ALIAS_CRYPTO("sha256"); - -asmlinkage void sha256_block_data_order(u32 *digest, const void *data, - unsigned int num_blks); -EXPORT_SYMBOL(sha256_block_data_order); - -static void sha256_arm64_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - sha256_block_data_order(sst->state, src, blocks); -} - -asmlinkage void sha256_block_neon(u32 *digest, const void *data, - unsigned int num_blks); - -static void sha256_neon_transform(struct sha256_state *sst, u8 const *src, - int blocks) -{ - sha256_block_neon(sst->state, src, blocks); -} - -static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - return sha256_base_do_update(desc, data, len, sha256_arm64_transform); -} - -static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (len) - sha256_base_do_update(desc, data, len, sha256_arm64_transform); - sha256_base_do_finalize(desc, sha256_arm64_transform); - - return sha256_base_finish(desc, out); -} - -static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out) -{ - return crypto_sha256_arm64_finup(desc, NULL, 0, out); -} - -static struct shash_alg algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = crypto_sha256_arm64_update, - .final = crypto_sha256_arm64_final, - .finup = crypto_sha256_arm64_finup, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha256", - .base.cra_driver_name = "sha256-arm64", - .base.cra_priority = 125, - .base.cra_blocksize = SHA256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = crypto_sha256_arm64_update, - .final = crypto_sha256_arm64_final, - .finup = crypto_sha256_arm64_finup, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-arm64", - .base.cra_priority = 125, - .base.cra_blocksize = SHA224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int sha256_update_neon(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - if (!crypto_simd_usable()) - return sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - - while (len > 0) { - unsigned int chunk = len; - - /* - * Don't hog the CPU for the entire time it takes to process all - * input when running on a preemptible kernel, but process the - * data block by block instead. 
- */ - if (IS_ENABLED(CONFIG_PREEMPTION) && - chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE) - chunk = SHA256_BLOCK_SIZE - - sctx->count % SHA256_BLOCK_SIZE; - - kernel_neon_begin(); - sha256_base_do_update(desc, data, chunk, sha256_neon_transform); - kernel_neon_end(); - data += chunk; - len -= chunk; - } - return 0; -} - -static int sha256_finup_neon(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - if (!crypto_simd_usable()) { - if (len) - sha256_base_do_update(desc, data, len, - sha256_arm64_transform); - sha256_base_do_finalize(desc, sha256_arm64_transform); - } else { - if (len) - sha256_update_neon(desc, data, len); - kernel_neon_begin(); - sha256_base_do_finalize(desc, sha256_neon_transform); - kernel_neon_end(); - } - return sha256_base_finish(desc, out); -} - -static int sha256_final_neon(struct shash_desc *desc, u8 *out) -{ - return sha256_finup_neon(desc, NULL, 0, out); -} - -static struct shash_alg neon_algs[] = { { - .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_base_init, - .update = sha256_update_neon, - .final = sha256_final_neon, - .finup = sha256_finup_neon, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha256", - .base.cra_driver_name = "sha256-arm64-neon", - .base.cra_priority = 150, - .base.cra_blocksize = SHA256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_base_init, - .update = sha256_update_neon, - .final = sha256_final_neon, - .finup = sha256_finup_neon, - .descsize = sizeof(struct sha256_state), - .base.cra_name = "sha224", - .base.cra_driver_name = "sha224-arm64-neon", - .base.cra_priority = 150, - .base.cra_blocksize = SHA224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha256_mod_init(void) -{ - int ret = crypto_register_shashes(algs, ARRAY_SIZE(algs)); - if (ret) - return ret; - - if (cpu_have_named_feature(ASIMD)) { - ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs)); - if (ret) - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); - } - return ret; -} - -static void __exit sha256_mod_fini(void) -{ - if (cpu_have_named_feature(ASIMD)) - crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs)); - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(sha256_mod_init); -module_exit(sha256_mod_fini); diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index 5662c3ac49e9..b4f1001046c9 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -12,13 +12,13 @@ #include <asm/hwcap.h> #include <asm/neon.h> #include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sha3.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> +#include <linux/string.h> +#include <linux/unaligned.h> MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); @@ -35,74 +35,55 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha3_state *sctx = shash_desc_ctx(desc); - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - - if (!crypto_simd_usable()) - return crypto_sha3_update(desc, data, len); - - if ((sctx->partial + len) >= sctx->rsiz) { - int blocks; - - if (sctx->partial) { - int p = sctx->rsiz - sctx->partial; - - memcpy(sctx->buf + sctx->partial, data, p); - kernel_neon_begin(); - 
sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); - kernel_neon_end(); - - data += p; - len -= p; - sctx->partial = 0; - } - - blocks = len / sctx->rsiz; - len %= sctx->rsiz; - - while (blocks) { - int rem; - - kernel_neon_begin(); - rem = sha3_ce_transform(sctx->st, data, blocks, - digest_size); - kernel_neon_end(); - data += (blocks - rem) * sctx->rsiz; - blocks = rem; - } - } - - if (len) { - memcpy(sctx->buf + sctx->partial, data, len); - sctx->partial += len; - } - return 0; + struct crypto_shash *tfm = desc->tfm; + unsigned int bs, ds; + int blocks; + + ds = crypto_shash_digestsize(tfm); + bs = crypto_shash_blocksize(tfm); + blocks = len / bs; + len -= blocks * bs; + do { + int rem; + + kernel_neon_begin(); + rem = sha3_ce_transform(sctx->st, data, blocks, ds); + kernel_neon_end(); + data += (blocks - rem) * bs; + blocks = rem; + } while (blocks); + return len; } -static int sha3_final(struct shash_desc *desc, u8 *out) +static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, + u8 *out) { struct sha3_state *sctx = shash_desc_ctx(desc); - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); + struct crypto_shash *tfm = desc->tfm; __le64 *digest = (__le64 *)out; + u8 block[SHA3_224_BLOCK_SIZE]; + unsigned int bs, ds; int i; - if (!crypto_simd_usable()) - return crypto_sha3_final(desc, out); + ds = crypto_shash_digestsize(tfm); + bs = crypto_shash_blocksize(tfm); + memcpy(block, src, len); - sctx->buf[sctx->partial++] = 0x06; - memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial); - sctx->buf[sctx->rsiz - 1] |= 0x80; + block[len++] = 0x06; + memset(block + len, 0, bs - len); + block[bs - 1] |= 0x80; kernel_neon_begin(); - sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size); + sha3_ce_transform(sctx->st, block, 1, ds); kernel_neon_end(); + memzero_explicit(block , sizeof(block)); - for (i = 0; i < digest_size / 8; i++) + for (i = 0; i < ds / 8; i++) put_unaligned_le64(sctx->st[i], digest++); - if (digest_size & 4) + if (ds & 4) put_unaligned_le32(sctx->st[i], (__le32 *)digest); - memzero_explicit(sctx, sizeof(*sctx)); return 0; } @@ -110,10 +91,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_224_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-224", .base.cra_driver_name = "sha3-224-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_224_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -121,10 +103,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_256_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-256", .base.cra_driver_name = "sha3-256-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_256_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, @@ -132,10 +115,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_384_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-384", .base.cra_driver_name = "sha3-384-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority 
= 200, @@ -143,10 +127,11 @@ static struct shash_alg algs[] = { { .digestsize = SHA3_512_DIGEST_SIZE, .init = crypto_sha3_init, .update = sha3_update, - .final = sha3_final, - .descsize = sizeof(struct sha3_state), + .finup = sha3_finup, + .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-512", .base.cra_driver_name = "sha3-512-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, .base.cra_blocksize = SHA3_512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sha512-armv8.pl b/arch/arm64/crypto/sha512-armv8.pl deleted file mode 100644 index 35ec9ae99fe1..000000000000 --- a/arch/arm64/crypto/sha512-armv8.pl +++ /dev/null @@ -1,786 +0,0 @@ -#! /usr/bin/env perl -# SPDX-License-Identifier: GPL-2.0 - -# This code is taken from the OpenSSL project but the author (Andy Polyakov) -# has relicensed it under the GPLv2. Therefore this program is free software; -# you can redistribute it and/or modify it under the terms of the GNU General -# Public License version 2 as published by the Free Software Foundation. -# -# The original headers, including the original license headers, are -# included below for completeness. - -# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. -# -# Licensed under the OpenSSL license (the "License"). You may not use -# this file except in compliance with the License. You can obtain a copy -# in the file LICENSE in the source distribution or at -# https://www.openssl.org/source/license.html - -# ==================================================================== -# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL -# project. The module is, however, dual licensed under OpenSSL and -# CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. -# ==================================================================== -# -# SHA256/512 for ARMv8. -# -# Performance in cycles per processed byte and improvement coefficient -# over code generated with "default" compiler: -# -# SHA256-hw SHA256(*) SHA512 -# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -# Denver 2.01 10.5 (+26%) 6.70 (+8%) -# X-Gene 20.0 (+100%) 12.8 (+300%(***)) -# Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -# -# (*) Software SHA256 results are of lesser relevance, presented -# mostly for informational purposes. -# (**) The result is a trade-off: it's possible to improve it by -# 10% (or by 1 cycle per round), but at the cost of 20% loss -# on Cortex-A53 (or by 4 cycles per round). -# (***) Super-impressive coefficients over gcc-generated code are -# indication of some compiler "pathology", most notably code -# generated with -mgeneral-regs-only is significantly faster -# and the gap is only 40-90%. -# -# October 2016. -# -# Originally it was reckoned that it makes no sense to implement NEON -# version of SHA256 for 64-bit processors. This is because performance -# improvement on most wide-spread Cortex-A5x processors was observed -# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was -# observed that 32-bit NEON SHA256 performs significantly better than -# 64-bit scalar version on *some* of the more recent processors. As -# result 64-bit NEON version of SHA256 was added to provide best -# all-round performance. For example it executes ~30% faster on X-Gene -# and Mongoose. 
[For reference, NEON version of SHA512 is bound to -# deliver much less improvement, likely *negative* on Cortex-A5x. -# Which is why NEON support is limited to SHA256.] - -$output=pop; -$flavour=pop; - -if ($flavour && $flavour ne "void") { - $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; - ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or - ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or - die "can't locate arm-xlate.pl"; - - open OUT,"| \"$^X\" $xlate $flavour $output"; - *STDOUT=*OUT; -} else { - open STDOUT,">$output"; -} - -if ($output =~ /512/) { - $BITS=512; - $SZ=8; - @Sigma0=(28,34,39); - @Sigma1=(14,18,41); - @sigma0=(1, 8, 7); - @sigma1=(19,61, 6); - $rounds=80; - $reg_t="x"; -} else { - $BITS=256; - $SZ=4; - @Sigma0=( 2,13,22); - @Sigma1=( 6,11,25); - @sigma0=( 7,18, 3); - @sigma1=(17,19,10); - $rounds=64; - $reg_t="w"; -} - -$func="sha${BITS}_block_data_order"; - -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); - -@X=map("$reg_t$_",(3..15,0..2)); -@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); -($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28)); - -sub BODY_00_xx { -my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; -my $j=($i+1)&15; -my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]); - $T0=@X[$i+3] if ($i<11); - -$code.=<<___ if ($i<16); -#ifndef __AARCH64EB__ - rev @X[$i],@X[$i] // $i -#endif -___ -$code.=<<___ if ($i<13 && ($i&1)); - ldp @X[$i+1],@X[$i+2],[$inp],#2*$SZ -___ -$code.=<<___ if ($i==13); - ldp @X[14],@X[15],[$inp] -___ -$code.=<<___ if ($i>=14); - ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] -___ -$code.=<<___ if ($i>0 && $i<16); - add $a,$a,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=11); - str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] -___ -# While ARMv8 specifies merged rotate-n-logical operation such as -# 'eor x,y,z,ror#n', it was found to negatively affect performance -# on Apple A7. The reason seems to be that it requires even 'y' to -# be available earlier. This means that such merged instruction is -# not necessarily best choice on critical path... On the other hand -# Cortex-A5x handles merged instructions much better than disjoint -# rotate and logical... See (**) footnote above. 
-$code.=<<___ if ($i<15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - eor $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]` - and $t1,$f,$e - bic $t2,$g,$e - add $h,$h,@X[$i&15] // h+=X[i] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$T0,ror#$Sigma1[1] // Sigma1(e) - ror $T0,$a,#$Sigma0[0] - add $h,$h,$t1 // h+=Ch(e,f,g) - eor $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]` - add $h,$h,$t0 // h+=Sigma1(e) - and $t3,$t3,$t2 // (b^c)&=(a^b) - add $d,$d,$h // d+=h - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$t1,ror#$Sigma0[1] // Sigma0(a) - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - //add $h,$h,$t1 // h+=Sigma0(a) -___ -$code.=<<___ if ($i>=15); - ror $t0,$e,#$Sigma1[0] - add $h,$h,$t2 // h+=K[i] - ror $T1,@X[($j+1)&15],#$sigma0[0] - and $t1,$f,$e - ror $T2,@X[($j+14)&15],#$sigma1[0] - bic $t2,$g,$e - ror $T0,$a,#$Sigma0[0] - add $h,$h,@X[$i&15] // h+=X[i] - eor $t0,$t0,$e,ror#$Sigma1[1] - eor $T1,$T1,@X[($j+1)&15],ror#$sigma0[1] - orr $t1,$t1,$t2 // Ch(e,f,g) - eor $t2,$a,$b // a^b, b^c in next round - eor $t0,$t0,$e,ror#$Sigma1[2] // Sigma1(e) - eor $T0,$T0,$a,ror#$Sigma0[1] - add $h,$h,$t1 // h+=Ch(e,f,g) - and $t3,$t3,$t2 // (b^c)&=(a^b) - eor $T2,$T2,@X[($j+14)&15],ror#$sigma1[1] - eor $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2] // sigma0(X[i+1]) - add $h,$h,$t0 // h+=Sigma1(e) - eor $t3,$t3,$b // Maj(a,b,c) - eor $t1,$T0,$a,ror#$Sigma0[2] // Sigma0(a) - eor $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2] // sigma1(X[i+14]) - add @X[$j],@X[$j],@X[($j+9)&15] - add $d,$d,$h // d+=h - add $h,$h,$t3 // h+=Maj(a,b,c) - ldr $t3,[$Ktbl],#$SZ // *K++, $t2 in next round - add @X[$j],@X[$j],$T1 - add $h,$h,$t1 // h+=Sigma0(a) - add @X[$j],@X[$j],$T2 -___ - ($t2,$t3)=($t3,$t2); -} - -$code.=<<___; -#ifndef __KERNEL__ -# include "arm_arch.h" -#endif - -.text - -.extern OPENSSL_armcap_P -.globl $func -.type $func,%function -.align 6 -$func: -___ -$code.=<<___ if ($SZ==4); -#ifndef __KERNEL__ -# ifdef __ILP32__ - ldrsw x16,.LOPENSSL_armcap_P -# else - ldr x16,.LOPENSSL_armcap_P -# endif - adr x17,.LOPENSSL_armcap_P - add x16,x16,x17 - ldr w16,[x16] - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry - tst w16,#ARMV7_NEON - b.ne .Lneon_entry -#endif -___ -$code.=<<___; - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*$SZ - - ldp $A,$B,[$ctx] // load context - ldp $C,$D,[$ctx,#2*$SZ] - ldp $E,$F,[$ctx,#4*$SZ] - add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input - ldp $G,$H,[$ctx,#6*$SZ] - adr $Ktbl,.LK$BITS - stp $ctx,$num,[x29,#96] - -.Loop: - ldp @X[0],@X[1],[$inp],#2*$SZ - ldr $t2,[$Ktbl],#$SZ // *K++ - eor $t3,$B,$C // magic seed - str $inp,[x29,#112] -___ -for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=".Loop_16_xx:\n"; -for (;$i<32;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } -$code.=<<___; - cbnz $t2,.Loop_16_xx - - ldp $ctx,$num,[x29,#96] - ldr $inp,[x29,#112] - sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind - - ldp @X[0],@X[1],[$ctx] - ldp @X[2],@X[3],[$ctx,#2*$SZ] - add $inp,$inp,#14*$SZ // advance input pointer - ldp @X[4],@X[5],[$ctx,#4*$SZ] - add $A,$A,@X[0] - ldp @X[6],@X[7],[$ctx,#6*$SZ] - add $B,$B,@X[1] - add $C,$C,@X[2] - add $D,$D,@X[3] - stp $A,$B,[$ctx] - add $E,$E,@X[4] - add $F,$F,@X[5] - stp $C,$D,[$ctx,#2*$SZ] - add $G,$G,@X[6] - add $H,$H,@X[7] - cmp $inp,$num - stp $E,$F,[$ctx,#4*$SZ] - stp $G,$H,[$ctx,#6*$SZ] - b.ne .Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*$SZ - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - ret -.size $func,.-$func - -.align 6 -.type .LK$BITS,%object -.LK$BITS: -___ -$code.=<<___ if ($SZ==8); - .quad 0x428a2f98d728ae22,0x7137449123ef65cd - .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc - .quad 0x3956c25bf348b538,0x59f111f1b605d019 - .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 - .quad 0xd807aa98a3030242,0x12835b0145706fbe - .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 - .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 - .quad 0x9bdc06a725c71235,0xc19bf174cf692694 - .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 - .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 - .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 - .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 - .quad 0x983e5152ee66dfab,0xa831c66d2db43210 - .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 - .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 - .quad 0x06ca6351e003826f,0x142929670a0e6e70 - .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 - .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df - .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 - .quad 0x81c2c92e47edaee6,0x92722c851482353b - .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 - .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 - .quad 0xd192e819d6ef5218,0xd69906245565a910 - .quad 0xf40e35855771202a,0x106aa07032bbd1b8 - .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 - .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 - .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb - .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 - .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 - .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec - .quad 0x90befffa23631e28,0xa4506cebde82bde9 - .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b - .quad 0xca273eceea26619c,0xd186b8c721c0c207 - .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 - .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 - .quad 0x113f9804bef90dae,0x1b710b35131c471b - .quad 0x28db77f523047d84,0x32caab7b40c72493 - .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c - .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a - .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - .quad 0 // terminator -___ -$code.=<<___ if ($SZ==4); - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - .long 0 //terminator -___ -$code.=<<___; -.size .LK$BITS,.-.LK$BITS -#ifndef __KERNEL__ -.align 3 -.LOPENSSL_armcap_P: -# ifdef __ILP32__ - .long OPENSSL_armcap_P-. -# else - .quad OPENSSL_armcap_P-. -# endif -#endif -.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" -.align 2 -___ - -if ($SZ==4) { -my $Ktbl="x3"; - -my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); -my @MSG=map("v$_.16b",(4..7)); -my ($W0,$W1)=("v16.4s","v17.4s"); -my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); - -$code.=<<___; -#ifndef __KERNEL__ -.type sha256_block_armv8,%function -.align 6 -sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1.32 {$ABCD,$EFGH},[$ctx] - adr $Ktbl,.LK256 - -.Loop_hw: - ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 - ld1.32 {$W0},[$Ktbl],#16 - rev32 @MSG[0],@MSG[0] - rev32 @MSG[1],@MSG[1] - rev32 @MSG[2],@MSG[2] - rev32 @MSG[3],@MSG[3] - orr $ABCD_SAVE,$ABCD,$ABCD // offload - orr $EFGH_SAVE,$EFGH,$EFGH -___ -for($i=0;$i<12;$i++) { -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - sha256su0 @MSG[0],@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - sha256su1 @MSG[0],@MSG[2],@MSG[3] -___ - ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); -} -$code.=<<___; - ld1.32 {$W1},[$Ktbl],#16 - add.i32 $W0,$W0,@MSG[0] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - ld1.32 {$W0},[$Ktbl],#16 - add.i32 $W1,$W1,@MSG[1] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - ld1.32 {$W1},[$Ktbl] - add.i32 $W0,$W0,@MSG[2] - sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W0 - sha256h2 $EFGH,$abcd,$W0 - - add.i32 $W1,$W1,@MSG[3] - orr $abcd,$ABCD,$ABCD - sha256h $ABCD,$EFGH,$W1 - sha256h2 $EFGH,$abcd,$W1 - - add.i32 $ABCD,$ABCD,$ABCD_SAVE - add.i32 $EFGH,$EFGH,$EFGH_SAVE - - cbnz $num,.Loop_hw - - st1.32 {$ABCD,$EFGH},[$ctx] - - ldr x29,[sp],#16 - ret -.size sha256_block_armv8,.-sha256_block_armv8 -#endif -___ -} - -if ($SZ==4) { ######################################### NEON stuff # -# You'll surely note a lot of similarities with sha256-armv4 module, -# and of course it's not a coincidence. sha256-armv4 was used as -# initial template, but was adapted for ARMv8 instruction set and -# extensively re-tuned for all-round performance. 
- -my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); -my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; -my @X = map("q$_",(0..3)); -my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); -my $j=0; - -sub AUTOLOAD() # thunk [simplified] x86-style perlasm -{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; - my $arg = pop; - $arg = "#$arg" if ($arg*1 eq $arg); - $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; -} - -sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } -sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } -sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } - -sub Xupdate() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T2,$T0,$sigma0[0]); - eval(shift(@insns)); - &ushr_32 ($T1,$T0,$sigma0[2]); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] - eval(shift(@insns)); - &sli_32 ($T2,$T0,32-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T0,$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T2); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T3,$T0,32-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T4,$T7,$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T4,$T7,32-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T5,$T7,$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T3,$T7,$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &sli_u32 ($T3,$T7,32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T4); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &ushr_32 ($T6,@X[0],$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T7,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T6,@X[0],32-$sigma1[0]); - eval(shift(@insns)); - &ushr_32 ($T5,@X[0],$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T6); - eval(shift(@insns)); - eval(shift(@insns)); - &sli_32 ($T5,@X[0],32-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl], #16"); - eval(shift(@insns)); - &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - &eor_8 ($T5,$T5,$T5); - eval(shift(@insns)); - eval(shift(@insns)); - &mov (&Dhi($T5), &Dlo($T7)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - while($#insns>=1) { eval(shift(@insns)); } - &st1_32 ("{$T0}","[$Xfer], #16"); - eval(shift(@insns)); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub 
Xpreload() -{ use integer; - my $body = shift; - my @insns = (&$body,&$body,&$body,&$body); - my ($a,$b,$c,$d,$e,$f,$g,$h); - - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_8 ("{@X[0]}","[$inp],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - &ld1_32 ("{$T0}","[$Ktbl],#16"); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &rev32 (@X[0],@X[0]); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &add_32 ($T0,$T0,@X[0]); - foreach (@insns) { eval; } # remaining instructions - &st1_32 ("{$T0}","[$Xfer], #16"); - - push(@X,shift(@X)); # "rotate" X[] -} - -sub body_00_15 () { - ( - '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. - '&add ($h,$h,$t1)', # h+=X[i]+K[i] - '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past - '&and ($t1,$f,$e)', - '&bic ($t4,$g,$e)', - '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', - '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past - '&orr ($t1,$t1,$t4)', # Ch(e,f,g) - '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) - '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', - '&add ($h,$h,$t1)', # h+=Ch(e,f,g) - '&ror ($t0,$t0,"#$Sigma1[0]")', - '&eor ($t2,$a,$b)', # a^b, b^c in next round - '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) - '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. - '&ldr ($t1,"[$Ktbl]") if ($j==15);'. - '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) - '&ror ($t4,$t4,"#$Sigma0[0]")', - '&add ($d,$d,$h)', # d+=h - '&eor ($t3,$t3,$b)', # Maj(a,b,c) - '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' - ) -} - -$code.=<<___; -#ifdef __KERNEL__ -.globl sha256_block_neon -#endif -.type sha256_block_neon,%function -.align 4 -sha256_block_neon: -.Lneon_entry: - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr $Ktbl,.LK256 - add $num,$inp,$num,lsl#6 // len to point at the end of inp - - ld1.8 {@X[0]},[$inp], #16 - ld1.8 {@X[1]},[$inp], #16 - ld1.8 {@X[2]},[$inp], #16 - ld1.8 {@X[3]},[$inp], #16 - ld1.32 {$T0},[$Ktbl], #16 - ld1.32 {$T1},[$Ktbl], #16 - ld1.32 {$T2},[$Ktbl], #16 - ld1.32 {$T3},[$Ktbl], #16 - rev32 @X[0],@X[0] // yes, even on - rev32 @X[1],@X[1] // big-endian - rev32 @X[2],@X[2] - rev32 @X[3],@X[3] - mov $Xfer,sp - add.32 $T0,$T0,@X[0] - add.32 $T1,$T1,@X[1] - add.32 $T2,$T2,@X[2] - st1.32 {$T0-$T1},[$Xfer], #32 - add.32 $T3,$T3,@X[3] - st1.32 {$T2-$T3},[$Xfer] - sub $Xfer,$Xfer,#32 - - ldp $A,$B,[$ctx] - ldp $C,$D,[$ctx,#8] - ldp $E,$F,[$ctx,#16] - ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] - mov $t2,wzr - eor $t3,$B,$C - mov $t4,wzr - b .L_00_48 - -.align 4 -.L_00_48: -___ - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); - &Xupdate(\&body_00_15); -$code.=<<___; - cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] - sub $Xfer,$Xfer,#64 - bne .L_00_48 - - sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp -___ - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); - &Xpreload(\&body_00_15); -$code.=<<___; - add $A,$A,$t4 // h+=Sigma0(a) from the past - ldp $t0,$t1,[$ctx,#0] - add $A,$A,$t2 // h+=Maj(a,b,c) from the past - ldp $t2,$t3,[$ctx,#8] - add $A,$A,$t0 // accumulate - add $B,$B,$t1 - ldp $t0,$t1,[$ctx,#16] - add $C,$C,$t2 - add $D,$D,$t3 - ldp $t2,$t3,[$ctx,#24] - add $E,$E,$t0 - add $F,$F,$t1 - ldr $t1,[sp,#0] - stp $A,$B,[$ctx,#0] - add $G,$G,$t2 - mov $t2,wzr - stp $C,$D,[$ctx,#8] - add $H,$H,$t3 - stp $E,$F,[$ctx,#16] - eor $t3,$B,$C - stp $G,$H,[$ctx,#24] - mov $t4,wzr - mov $Xfer,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size sha256_block_neon,.-sha256_block_neon -___ -} - -$code.=<<___; -#ifndef __KERNEL__ -.comm OPENSSL_armcap_P,4,4 -#endif -___ - -{ my %opcode = ( - "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, - "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); - - sub unsha256 { - my ($mnemonic,$arg)=@_; - - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o - && - sprintf ".inst\t0x%08x\t//%s %s", - $opcode{$mnemonic}|$1|($2<<5)|($3<<16), - $mnemonic,$arg; - } -} - -open SELF,$0; -while(<SELF>) { - next if (/^#!/); - last if (!s/^#/\/\// and !/^$/); - print; -} -close SELF; - -foreach(split("\n",$code)) { - - s/\`([^\`]*)\`/eval($1)/ge; - - s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; - - s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers - - s/\.[ui]?8(\s)/$1/; - s/\.\w?32\b// and s/\.16b/\.4s/g; - m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; - - print $_,"\n"; -} - -close STDOUT; diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index 071f64293227..6fb3001fa2c9 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -10,14 +10,11 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions"); @@ -29,12 +26,10 @@ MODULE_ALIAS_CRYPTO("sha512"); asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src, 
int blocks); -asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); - static void sha512_ce_transform(struct sha512_state *sst, u8 const *src, int blocks) { - while (blocks) { + do { int rem; kernel_neon_begin(); @@ -42,67 +37,47 @@ static void sha512_ce_transform(struct sha512_state *sst, u8 const *src, kernel_neon_end(); src += (blocks - rem) * SHA512_BLOCK_SIZE; blocks = rem; - } -} - -static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, - int blocks) -{ - sha512_block_data_order(sst->state, src, blocks); + } while (blocks); } static int sha512_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform - : sha512_arm64_transform; - - sha512_base_do_update(desc, data, len, fn); - return 0; + return sha512_base_do_update_blocks(desc, data, len, + sha512_ce_transform); } static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform - : sha512_arm64_transform; - - sha512_base_do_update(desc, data, len, fn); - sha512_base_do_finalize(desc, fn); - return sha512_base_finish(desc, out); -} - -static int sha512_ce_final(struct shash_desc *desc, u8 *out) -{ - sha512_block_fn *fn = crypto_simd_usable() ? sha512_ce_transform - : sha512_arm64_transform; - - sha512_base_do_finalize(desc, fn); + sha512_base_do_finup(desc, data, len, sha512_ce_transform); return sha512_base_finish(desc, out); } static struct shash_alg algs[] = { { .init = sha384_base_init, .update = sha512_ce_update, - .final = sha512_ce_final, .finup = sha512_ce_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .digestsize = SHA384_DIGEST_SIZE, .base.cra_name = "sha384", .base.cra_driver_name = "sha384-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { .init = sha512_base_init, .update = sha512_ce_update, - .final = sha512_ce_final, .finup = sha512_ce_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .digestsize = SHA512_DIGEST_SIZE, .base.cra_name = "sha512", .base.cra_driver_name = "sha512-ce", .base.cra_priority = 200, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 62f129dea83d..15aa9d8b7b2c 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -6,11 +6,10 @@ */ #include <crypto/internal/hash.h> -#include <linux/types.h> -#include <linux/string.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> -#include <asm/neon.h> +#include <linux/kernel.h> +#include <linux/module.h> MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash for arm64"); MODULE_AUTHOR("Andy Polyakov <appro@openssl.org>"); @@ -19,59 +18,53 @@ MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha512"); -asmlinkage void sha512_block_data_order(u64 *digest, const void *data, - unsigned int num_blks); -EXPORT_SYMBOL(sha512_block_data_order); +asmlinkage void sha512_blocks_arch(u64 *digest, const void *data, + unsigned int num_blks); static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src, int blocks) { - sha512_block_data_order(sst->state, src, blocks); + 
sha512_blocks_arch(sst->state, src, blocks); } static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - return sha512_base_do_update(desc, data, len, sha512_arm64_transform); + return sha512_base_do_update_blocks(desc, data, len, + sha512_arm64_transform); } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (len) - sha512_base_do_update(desc, data, len, sha512_arm64_transform); - sha512_base_do_finalize(desc, sha512_arm64_transform); - + sha512_base_do_finup(desc, data, len, sha512_arm64_transform); return sha512_base_finish(desc, out); } -static int sha512_final(struct shash_desc *desc, u8 *out) -{ - return sha512_finup(desc, NULL, 0, out); -} - static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_update, - .final = sha512_final, .finup = sha512_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base.cra_name = "sha512", .base.cra_driver_name = "sha512-arm64", .base.cra_priority = 150, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA512_BLOCK_SIZE, .base.cra_module = THIS_MODULE, }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_update, - .final = sha512_final, .finup = sha512_finup, - .descsize = sizeof(struct sha512_state), + .descsize = SHA512_STATE_SIZE, .base.cra_name = "sha384", .base.cra_driver_name = "sha384-arm64", .base.cra_priority = 150, + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SHA384_BLOCK_SIZE, .base.cra_module = THIS_MODULE, } }; diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 1a71788c4cda..eac6f5fa0abe 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -6,14 +6,11 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions"); @@ -26,50 +23,20 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) { - sm3_update(shash_desc_ctx(desc), data, len); - return 0; - } + int remain; kernel_neon_begin(); - sm3_base_do_update(desc, data, len, sm3_ce_transform); + remain = sm3_base_do_update_blocks(desc, data, len, sm3_ce_transform); kernel_neon_end(); - - return 0; -} - -static int sm3_ce_final(struct shash_desc *desc, u8 *out) -{ - if (!crypto_simd_usable()) { - sm3_final(shash_desc_ctx(desc), out); - return 0; - } - - kernel_neon_begin(); - sm3_base_do_finalize(desc, sm3_ce_transform); - kernel_neon_end(); - - return sm3_base_finish(desc, out); + return remain; } static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (len) - sm3_update(sctx, data, len); - sm3_final(sctx, out); - return 0; - } - kernel_neon_begin(); - if (len) - sm3_base_do_update(desc, data, len, sm3_ce_transform); - sm3_base_do_finalize(desc, sm3_ce_transform); + sm3_base_do_finup(desc, data, len, sm3_ce_transform); kernel_neon_end(); - return 
sm3_base_finish(desc, out); } @@ -77,11 +44,12 @@ static struct shash_alg sm3_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = sm3_ce_update, - .final = sm3_ce_final, .finup = sm3_ce_finup, - .descsize = sizeof(struct sm3_state), + .descsize = SM3_STATE_SIZE, .base.cra_name = "sm3", .base.cra_driver_name = "sm3-ce", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 400, diff --git a/arch/arm64/crypto/sm3-neon-glue.c b/arch/arm64/crypto/sm3-neon-glue.c index 8dd71ce79b69..6c4611a503a3 100644 --- a/arch/arm64/crypto/sm3-neon-glue.c +++ b/arch/arm64/crypto/sm3-neon-glue.c @@ -6,14 +6,11 @@ */ #include <asm/neon.h> -#include <asm/simd.h> -#include <linux/unaligned.h> #include <crypto/internal/hash.h> -#include <crypto/internal/simd.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> #include <linux/cpufeature.h> -#include <linux/crypto.h> +#include <linux/kernel.h> #include <linux/module.h> @@ -23,50 +20,20 @@ asmlinkage void sm3_neon_transform(struct sm3_state *sst, u8 const *src, static int sm3_neon_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!crypto_simd_usable()) { - sm3_update(shash_desc_ctx(desc), data, len); - return 0; - } + int remain; kernel_neon_begin(); - sm3_base_do_update(desc, data, len, sm3_neon_transform); + remain = sm3_base_do_update_blocks(desc, data, len, sm3_neon_transform); kernel_neon_end(); - - return 0; -} - -static int sm3_neon_final(struct shash_desc *desc, u8 *out) -{ - if (!crypto_simd_usable()) { - sm3_final(shash_desc_ctx(desc), out); - return 0; - } - - kernel_neon_begin(); - sm3_base_do_finalize(desc, sm3_neon_transform); - kernel_neon_end(); - - return sm3_base_finish(desc, out); + return remain; } static int sm3_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!crypto_simd_usable()) { - struct sm3_state *sctx = shash_desc_ctx(desc); - - if (len) - sm3_update(sctx, data, len); - sm3_final(sctx, out); - return 0; - } - kernel_neon_begin(); - if (len) - sm3_base_do_update(desc, data, len, sm3_neon_transform); - sm3_base_do_finalize(desc, sm3_neon_transform); + sm3_base_do_finup(desc, data, len, sm3_neon_transform); kernel_neon_end(); - return sm3_base_finish(desc, out); } @@ -74,11 +41,12 @@ static struct shash_alg sm3_alg = { .digestsize = SM3_DIGEST_SIZE, .init = sm3_base_init, .update = sm3_neon_update, - .final = sm3_neon_final, .finup = sm3_neon_finup, - .descsize = sizeof(struct sm3_state), + .descsize = SM3_STATE_SIZE, .base.cra_name = "sm3", .base.cra_driver_name = "sm3-neon", + .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINUP_MAX, .base.cra_blocksize = SM3_BLOCK_SIZE, .base.cra_module = THIS_MODULE, .base.cra_priority = 200, diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 43741bed874e..7a60e7b559dc 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -8,19 +8,18 @@ * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com> */ -#include <linux/module.h> -#include <linux/crypto.h> -#include <linux/kernel.h> -#include <linux/cpufeature.h> #include <asm/neon.h> -#include <asm/simd.h> #include <crypto/b128ops.h> -#include <crypto/internal/simd.h> -#include <crypto/internal/skcipher.h> #include <crypto/internal/hash.h> +#include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> -#include <crypto/xts.h> #include <crypto/sm4.h> 
+#include <crypto/utils.h> +#include <crypto/xts.h> +#include <linux/cpufeature.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> #define BYTES2BLKS(nbytes) ((nbytes) >> 4) @@ -64,7 +63,6 @@ struct sm4_mac_tfm_ctx { }; struct sm4_mac_desc_ctx { - unsigned int len; u8 digest[SM4_BLOCK_SIZE]; }; @@ -591,8 +589,6 @@ static int sm4_mac_init(struct shash_desc *desc) struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); memset(ctx->digest, 0, SM4_BLOCK_SIZE); - ctx->len = 0; - return 0; } @@ -601,87 +597,50 @@ static int sm4_mac_update(struct shash_desc *desc, const u8 *p, { struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); - unsigned int l, nblocks; - - if (len == 0) - return 0; - - if (ctx->len || ctx->len + len < SM4_BLOCK_SIZE) { - l = min(len, SM4_BLOCK_SIZE - ctx->len); - - crypto_xor(ctx->digest + ctx->len, p, l); - ctx->len += l; - len -= l; - p += l; - } - - if (len && (ctx->len % SM4_BLOCK_SIZE) == 0) { - kernel_neon_begin(); - - if (len < SM4_BLOCK_SIZE && ctx->len == SM4_BLOCK_SIZE) { - sm4_ce_crypt_block(tctx->key.rkey_enc, - ctx->digest, ctx->digest); - ctx->len = 0; - } else { - nblocks = len / SM4_BLOCK_SIZE; - len %= SM4_BLOCK_SIZE; + unsigned int nblocks = len / SM4_BLOCK_SIZE; - sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, - nblocks, (ctx->len == SM4_BLOCK_SIZE), - (len != 0)); - - p += nblocks * SM4_BLOCK_SIZE; - - if (len == 0) - ctx->len = SM4_BLOCK_SIZE; - } - - kernel_neon_end(); - - if (len) { - crypto_xor(ctx->digest, p, len); - ctx->len = len; - } - } - - return 0; + len %= SM4_BLOCK_SIZE; + kernel_neon_begin(); + sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, p, + nblocks, false, true); + kernel_neon_end(); + return len; } -static int sm4_cmac_final(struct shash_desc *desc, u8 *out) +static int sm4_cmac_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); const u8 *consts = tctx->consts; - if (ctx->len != SM4_BLOCK_SIZE) { - ctx->digest[ctx->len] ^= 0x80; + crypto_xor(ctx->digest, src, len); + if (len != SM4_BLOCK_SIZE) { + ctx->digest[len] ^= 0x80; consts += SM4_BLOCK_SIZE; } - kernel_neon_begin(); sm4_ce_mac_update(tctx->key.rkey_enc, ctx->digest, consts, 1, false, true); kernel_neon_end(); - memcpy(out, ctx->digest, SM4_BLOCK_SIZE); - return 0; } -static int sm4_cbcmac_final(struct shash_desc *desc, u8 *out) +static int sm4_cbcmac_finup(struct shash_desc *desc, const u8 *src, + unsigned int len, u8 *out) { struct sm4_mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct sm4_mac_desc_ctx *ctx = shash_desc_ctx(desc); - if (ctx->len) { + if (len) { + crypto_xor(ctx->digest, src, len); kernel_neon_begin(); sm4_ce_crypt_block(tctx->key.rkey_enc, ctx->digest, ctx->digest); kernel_neon_end(); } - memcpy(out, ctx->digest, SM4_BLOCK_SIZE); - return 0; } @@ -691,6 +650,8 @@ static struct shash_alg sm4_mac_algs[] = { .cra_name = "cmac(sm4)", .cra_driver_name = "cmac-sm4-ce", .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) + SM4_BLOCK_SIZE * 2, @@ -699,7 +660,7 @@ static struct shash_alg sm4_mac_algs[] = { .digestsize = SM4_BLOCK_SIZE, .init = sm4_mac_init, .update = sm4_mac_update, - .final = sm4_cmac_final, + .finup = sm4_cmac_finup, .setkey = sm4_cmac_setkey, .descsize = sizeof(struct 
sm4_mac_desc_ctx), }, { @@ -707,6 +668,8 @@ static struct shash_alg sm4_mac_algs[] = { .cra_name = "xcbc(sm4)", .cra_driver_name = "xcbc-sm4-ce", .cra_priority = 400, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY | + CRYPTO_AHASH_ALG_FINAL_NONZERO, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx) + SM4_BLOCK_SIZE * 2, @@ -715,7 +678,7 @@ static struct shash_alg sm4_mac_algs[] = { .digestsize = SM4_BLOCK_SIZE, .init = sm4_mac_init, .update = sm4_mac_update, - .final = sm4_cmac_final, + .finup = sm4_cmac_finup, .setkey = sm4_xcbc_setkey, .descsize = sizeof(struct sm4_mac_desc_ctx), }, { @@ -723,14 +686,15 @@ static struct shash_alg sm4_mac_algs[] = { .cra_name = "cbcmac(sm4)", .cra_driver_name = "cbcmac-sm4-ce", .cra_priority = 400, - .cra_blocksize = 1, + .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, + .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_mac_tfm_ctx), .cra_module = THIS_MODULE, }, .digestsize = SM4_BLOCK_SIZE, .init = sm4_mac_init, .update = sm4_mac_update, - .final = sm4_cbcmac_final, + .finup = sm4_cbcmac_finup, .setkey = sm4_cbcmac_setkey, .descsize = sizeof(struct sm4_mac_desc_ctx), } |
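
Note on the deleted sha512-armv8.pl code above: the NEON Xupdate() routine it removes vectorises the standard SHA-256 message-schedule recurrence (the sigma0/sigma1 steps called out in its comments). The scalar C below states that recurrence as defined in FIPS 180-4 for reference; it is generic SHA-256 arithmetic, not a transcription of the removed assembly, and the helper names are ours.

#include <stdint.h>
#include <stdio.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t sigma0(uint32_t x)	/* small sigma0, FIPS 180-4 */
{
	return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
}

static inline uint32_t sigma1(uint32_t x)	/* small sigma1, FIPS 180-4 */
{
	return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
}

/* expand the 16-word block w[0..15] into the 64-word schedule w[0..63] */
static void sha256_schedule(uint32_t w[64])
{
	for (int t = 16; t < 64; t++)
		w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
}

int main(void)
{
	uint32_t w[64];

	for (int i = 0; i < 16; i++)
		w[i] = (uint32_t)i;
	sha256_schedule(w);
	printf("w[16]=%08x w[63]=%08x\n", (unsigned)w[16], (unsigned)w[63]);
	return 0;
}

The deleted Xupdate() computes the same thing four words at a time (X[0..3] += X[9..12], then += sigma0(X[1..4]), then += sigma1 of the trailing words), feeding the result straight into the round constants table.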
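
The sha512-ce-glue.c, sha512-glue.c and sm3 glue conversions above all have the same shape: the ->final() callbacks and the crypto_simd_usable() fallbacks are dropped, ->update() calls the *_base_do_update_blocks() helper and returns the number of unprocessed tail bytes (visible in the sm3 code, which returns "remain"), ->finup() absorbs that tail via *_base_do_finup(), and CRYPTO_AHASH_ALG_BLOCK_ONLY marks the algorithm so the crypto core, rather than each driver, buffers partial blocks, which is also why descsize shrinks to SHA512_STATE_SIZE. The stand-alone sketch below models only that calling convention; every toy_* name and the toy transform are made up for illustration and are not the kernel helpers.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 16

struct toy_state {
	uint32_t acc;
};

/* stand-in for the assembly block transform: whole blocks only */
static void toy_transform(struct toy_state *st, const uint8_t *src, int blocks)
{
	for (int b = 0; b < blocks; b++)
		for (int i = 0; i < BLOCK_SIZE; i++)
			st->acc = st->acc * 31 + src[b * BLOCK_SIZE + i];
}

/* role of *_base_do_update_blocks(): return the unprocessed tail length */
static unsigned int toy_update_blocks(struct toy_state *st,
				      const uint8_t *data, unsigned int len)
{
	toy_transform(st, data, len / BLOCK_SIZE);
	return len % BLOCK_SIZE;
}

/* role of *_base_do_finup(): pad and absorb the final partial block */
static void toy_finup(struct toy_state *st, const uint8_t *data,
		      unsigned int len)
{
	uint8_t last[BLOCK_SIZE] = { 0 };

	memcpy(last, data, len);
	last[len] = 0x80;	/* toy padding, not SHA-512's length encoding */
	toy_transform(st, last, 1);
}

int main(void)
{
	static const char msg[] = "an input that is not a block multiple";
	unsigned int len = sizeof(msg) - 1;
	struct toy_state st = { 0 };
	unsigned int remain;

	/* in the kernel the crypto core, not the driver, buffers this tail */
	remain = toy_update_blocks(&st, (const uint8_t *)msg, len);
	toy_finup(&st, (const uint8_t *)msg + (len - remain), remain);
	printf("acc=%08x, finup handled a %u-byte tail\n",
	       (unsigned)st.acc, remain);
	return 0;
}

With that contract in place, each glue file only has to supply the block transform, which is why sha512_ce_transform can assume a non-zero block count (the while loop becomes a do/while) and why the arm64-specific partial-block and fallback handling disappears from every driver in this diff.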
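
The sm4-ce-glue.c MAC rework above follows the same idea for the three MAC templates: sm4_mac_update() now only feeds whole blocks to sm4_ce_mac_update() and returns len % SM4_BLOCK_SIZE, while the new finup callbacks XOR the final (possibly partial) block into the CBC-MAC state, apply the 0x80 pad and switch to the second derived constant when the message is not block-aligned (cmac/xcbc), or do one last block encryption only if a tail exists (cbcmac). That is also why the per-request len field can be dropped from sm4_mac_desc_ctx. The rough user-space model below mirrors only that bookkeeping; stub_encrypt() is a throwaway placeholder, not SM4, and all names here are invented for the sketch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLK 16

struct mac_ctx {
	uint8_t key[BLK];		/* stands in for the SM4 round keys */
	uint8_t consts[2 * BLK];	/* two derived constants, as in tctx->consts */
	uint8_t digest[BLK];		/* running CBC-MAC state */
};

static void stub_encrypt(uint8_t blk[BLK], const uint8_t key[BLK])
{
	for (int i = 0; i < BLK; i++)	/* toy permutation, NOT a real cipher */
		blk[i] = (uint8_t)((blk[i] ^ key[i]) + i * 7 + 1);
}

/* like sm4_mac_update(): whole blocks only, report the leftover bytes */
static unsigned int mac_update(struct mac_ctx *ctx, const uint8_t *p,
			       unsigned int len)
{
	for (unsigned int b = 0; b < len / BLK; b++) {
		for (int i = 0; i < BLK; i++)
			ctx->digest[i] ^= p[b * BLK + i];
		stub_encrypt(ctx->digest, ctx->key);
	}
	return len % BLK;
}

/* like sm4_cmac_finup(): pad, pick the constant, do the final encryption */
static void cmac_finup(struct mac_ctx *ctx, const uint8_t *src,
		       unsigned int len, uint8_t out[BLK])
{
	const uint8_t *consts = ctx->consts;	/* block-aligned message */

	for (unsigned int i = 0; i < len; i++)
		ctx->digest[i] ^= src[i];
	if (len != BLK) {
		ctx->digest[len] ^= 0x80;	/* 10* padding */
		consts += BLK;			/* padded final block */
	}
	for (int i = 0; i < BLK; i++)
		ctx->digest[i] ^= consts[i];
	stub_encrypt(ctx->digest, ctx->key);
	memcpy(out, ctx->digest, BLK);
}

int main(void)
{
	static const char msg[] = "not a multiple of 16, so finup pads";
	unsigned int len = sizeof(msg) - 1;
	struct mac_ctx ctx = { .key = { 1, 2, 3 },
			       .consts = { [0] = 0xaa, [BLK] = 0x55 } };
	uint8_t mac[BLK];
	unsigned int remain;

	remain = mac_update(&ctx, (const uint8_t *)msg, len);
	cmac_finup(&ctx, (const uint8_t *)msg + (len - remain), remain, mac);
	for (int i = 0; i < BLK; i++)
		printf("%02x", (unsigned)mac[i]);
	printf("\n");
	return 0;
}

The cbcmac case in the diff is the same minus the constants: if finup receives a non-empty tail it XORs it in and encrypts once more, otherwise the running digest is already the MAC.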