Diffstat (limited to 'lib/crypto')
-rw-r--r--  lib/crypto/Kconfig | 36
-rw-r--r--  lib/crypto/Makefile | 30
-rw-r--r--  lib/crypto/arm/blake2b-neon-core.S | 350
-rw-r--r--  lib/crypto/arm/blake2b.h | 40
-rw-r--r--  lib/crypto/arm/blake2s-core.S | 22
-rw-r--r--  lib/crypto/arm/blake2s.h | 4
-rw-r--r--  lib/crypto/arm/chacha.h | 11
-rw-r--r--  lib/crypto/arm/curve25519.h | 5
-rw-r--r--  lib/crypto/arm/poly1305.h | 6
-rw-r--r--  lib/crypto/arm/sha1-armv7-neon.S | 2
-rw-r--r--  lib/crypto/arm/sha1-ce-core.S | 2
-rw-r--r--  lib/crypto/arm/sha1.h | 13
-rw-r--r--  lib/crypto/arm/sha256-ce.S | 2
-rw-r--r--  lib/crypto/arm/sha256.h | 12
-rw-r--r--  lib/crypto/arm/sha512.h | 5
-rw-r--r--  lib/crypto/arm64/chacha.h | 11
-rw-r--r--  lib/crypto/arm64/poly1305.h | 6
-rw-r--r--  lib/crypto/arm64/polyval-ce-core.S | 359
-rw-r--r--  lib/crypto/arm64/polyval.h | 80
-rw-r--r--  lib/crypto/arm64/sha1-ce-core.S | 2
-rw-r--r--  lib/crypto/arm64/sha1.h | 7
-rw-r--r--  lib/crypto/arm64/sha256-ce.S | 2
-rw-r--r--  lib/crypto/arm64/sha256.h | 19
-rw-r--r--  lib/crypto/arm64/sha3-ce-core.S | 213
-rw-r--r--  lib/crypto/arm64/sha3.h | 59
-rw-r--r--  lib/crypto/arm64/sha512-ce-core.S | 2
-rw-r--r--  lib/crypto/arm64/sha512.h | 8
-rw-r--r--  lib/crypto/blake2b.c | 174
-rw-r--r--  lib/crypto/blake2s.c | 66
-rw-r--r--  lib/crypto/chacha20poly1305.c | 18
-rw-r--r--  lib/crypto/fips.h | 45
-rw-r--r--  lib/crypto/polyval.c | 307
-rw-r--r--  lib/crypto/s390/sha3.h | 151
-rw-r--r--  lib/crypto/sha1.c | 19
-rw-r--r--  lib/crypto/sha256.c | 26
-rw-r--r--  lib/crypto/sha3.c | 411
-rw-r--r--  lib/crypto/sha512.c | 19
-rw-r--r--  lib/crypto/tests/Kconfig | 29
-rw-r--r--  lib/crypto/tests/Makefile | 3
-rw-r--r--  lib/crypto/tests/blake2b-testvecs.h | 342
-rw-r--r--  lib/crypto/tests/blake2b_kunit.c | 133
-rw-r--r--  lib/crypto/tests/blake2s_kunit.c | 39
-rw-r--r--  lib/crypto/tests/polyval-testvecs.h | 186
-rw-r--r--  lib/crypto/tests/polyval_kunit.c | 223
-rw-r--r--  lib/crypto/tests/sha256_kunit.c | 1
-rw-r--r--  lib/crypto/tests/sha3-testvecs.h | 249
-rw-r--r--  lib/crypto/tests/sha3_kunit.c | 422
-rw-r--r--  lib/crypto/x86/blake2s-core.S | 275
-rw-r--r--  lib/crypto/x86/blake2s.h | 22
-rw-r--r--  lib/crypto/x86/polyval-pclmul-avx.S | 319
-rw-r--r--  lib/crypto/x86/polyval.h | 83
51 files changed, 4593 insertions(+), 277 deletions(-)
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 16859c6226dd..a3647352bff6 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -28,6 +28,17 @@ config CRYPTO_LIB_ARC4
config CRYPTO_LIB_GF128MUL
tristate
+config CRYPTO_LIB_BLAKE2B
+ tristate
+ help
+ The BLAKE2b library functions. Select this if your module uses any of
+ the functions from <crypto/blake2b.h>.
+
+config CRYPTO_LIB_BLAKE2B_ARCH
+ bool
+ depends on CRYPTO_LIB_BLAKE2B && !UML
+ default y if ARM && KERNEL_MODE_NEON
+
# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
config CRYPTO_LIB_BLAKE2S_ARCH
@@ -124,6 +135,18 @@ config CRYPTO_LIB_POLY1305_RSIZE
default 9 if ARM || ARM64
default 1
+config CRYPTO_LIB_POLYVAL
+ tristate
+ help
+ The POLYVAL library functions. Select this if your module uses any of
+ the functions from <crypto/polyval.h>.
+
+config CRYPTO_LIB_POLYVAL_ARCH
+ bool
+ depends on CRYPTO_LIB_POLYVAL && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if X86_64
+
config CRYPTO_LIB_CHACHA20POLY1305
tristate
select CRYPTO_LIB_CHACHA
@@ -184,6 +207,19 @@ config CRYPTO_LIB_SHA512_ARCH
default y if SPARC64
default y if X86_64
+config CRYPTO_LIB_SHA3
+ tristate
+ select CRYPTO_LIB_UTILS
+ help
+ The SHA3 library functions. Select this if your module uses any of
+ the functions from <crypto/sha3.h>.
+
+config CRYPTO_LIB_SHA3_ARCH
+ bool
+ depends on CRYPTO_LIB_SHA3 && !UML
+ default y if ARM64 && KERNEL_MODE_NEON
+ default y if S390
+
config CRYPTO_LIB_SM3
tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index d2845b214585..b5346cebbb55 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -31,6 +31,16 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o
################################################################################
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o
+libblake2b-y := blake2b.o
+CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
+ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
+CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
+libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o
+endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+
+################################################################################
+
# blake2s is used by the /dev/random driver which is always builtin
obj-y += blake2s.o
ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
@@ -188,6 +198,16 @@ clean-files += arm/poly1305-core.S \
################################################################################
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
+libpolyval-y := polyval.o
+ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
+CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
+libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
+libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
+endif
+
+################################################################################
+
obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
libsha1-y := sha1.o
ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
@@ -268,6 +288,16 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
################################################################################
+obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
+libsha3-y := sha3.o
+
+ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
+CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
+libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
+endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
+
+################################################################################
+
obj-$(CONFIG_MPILIB) += mpi/
obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o
diff --git a/lib/crypto/arm/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S
new file mode 100644
index 000000000000..b55c37f0b88f
--- /dev/null
+++ b/lib/crypto/arm/blake2b-neon-core.S
@@ -0,0 +1,350 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
+ * processors that have NEON support but not the ARMv8 Crypto Extensions,
+ * typically this BLAKE2b implementation is much faster than the SHA-2 family
+ * and slightly faster than SHA-1.
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+ .text
+ .fpu neon
+
+ // The arguments to blake2b_compress_neon()
+ CTX .req r0
+ DATA .req r1
+ NBLOCKS .req r2
+ INC .req r3
+
+ // Pointers to the rotation tables
+ ROR24_TABLE .req r4
+ ROR16_TABLE .req r5
+
+ // The original stack pointer
+ ORIG_SP .req r6
+
+ // NEON registers which contain the message words of the current block.
+ // M_0-M_3 are occasionally used for other purposes too.
+ M_0 .req d16
+ M_1 .req d17
+ M_2 .req d18
+ M_3 .req d19
+ M_4 .req d20
+ M_5 .req d21
+ M_6 .req d22
+ M_7 .req d23
+ M_8 .req d24
+ M_9 .req d25
+ M_10 .req d26
+ M_11 .req d27
+ M_12 .req d28
+ M_13 .req d29
+ M_14 .req d30
+ M_15 .req d31
+
+ .align 4
+ // Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
+ // instruction. This is the most efficient way to implement these
+ // rotation amounts with NEON. (On Cortex-A53 it's the same speed as
+ // vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
+.Lror24_table:
+ .byte 3, 4, 5, 6, 7, 0, 1, 2
+.Lror16_table:
+ .byte 2, 3, 4, 5, 6, 7, 0, 1
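vtbl.8 sets result byte i to source byte table[i], so on a little-endian 64-bit lane the table { 3, 4, 5, 6, 7, 0, 1, 2 } is exactly ror64(x, 24), and { 2, 3, 4, 5, 6, 7, 0, 1 } is ror64(x, 16). A minimal C sketch of that equivalence (illustrative only; ror24_via_bytes() is not part of the patch, and a little-endian host is assumed):

static u64 ror24_via_bytes(u64 x)	/* needs <linux/types.h> and <linux/string.h> */
{
	static const u8 tbl[8] = { 3, 4, 5, 6, 7, 0, 1, 2 };
	u8 in[8], out[8];
	u64 y;
	int i;

	memcpy(in, &x, 8);		/* little-endian byte order */
	for (i = 0; i < 8; i++)
		out[i] = in[tbl[i]];	/* what vtbl.8 does per byte */
	memcpy(&y, out, 8);
	return y;			/* == ror64(x, 24) */
}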
+ // The BLAKE2b initialization vector
+.Lblake2b_IV:
+ .quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
+ .quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
+ .quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
+ .quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+
+// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
+// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
+// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
+// (M_0-M_3), so that they can be reloaded if they are used as temporary
+// registers. The macro arguments s0-s15 give the order in which the message
+// words are used in this round. 'final' is 1 if this is the final round.
+.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
+ s8, s9, s10, s11, s12, s13, s14, s15, final=0
+
+ // Mix the columns:
+ // (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
+ // (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s0
+ vadd.u64 d1, d1, M_\s2
+ vadd.u64 d2, d2, M_\s4
+ vadd.u64 d3, d3, M_\s6
+
+ // d = ror64(d ^ a, 32);
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vrev64.32 q6, q6
+ vrev64.32 q7, q7
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor q2, q2, q4
+ veor q3, q3, q5
+ vtbl.8 d4, {d4}, M_0
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+ //
+ // M_0 got clobbered above, so we have to reload it if any of the four
+ // message words this step needs happens to be M_0. Otherwise we don't
+ // need to reload it here, as it will just get clobbered again below.
+.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 q0, q0, q2
+ vadd.u64 q1, q1, q3
+ vadd.u64 d0, d0, M_\s1
+ vadd.u64 d1, d1, M_\s3
+ vadd.u64 d2, d2, M_\s5
+ vadd.u64 d3, d3, M_\s7
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor q6, q6, q0
+ veor q7, q7, q1
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 q4, q4, q6
+ vadd.u64 q5, q5, q7
+
+ // b = ror64(b ^ c, 63);
+ //
+ // This rotation amount isn't a multiple of 8, so it has to be
+ // implemented using a pair of shifts, which requires temporary
+ // registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
+ veor q8, q2, q4
+ veor q9, q3, q5
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ vld1.8 {q8-q9}, [sp, :256]
+
+ // Mix the diagonals:
+ // (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
+ // (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
+ //
+ // There are two possible ways to do this: use 'vext' instructions to
+ // shift the rows of the matrix so that the diagonals become columns,
+ // and undo it afterwards; or just use 64-bit operations on 'd'
+ // registers instead of 128-bit operations on 'q' registers. We use the
+ // latter approach, as it performs much better on Cortex-A7.
+
+ // a += b + m[blake2b_sigma[r][2*i + 0]];
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s8
+ vadd.u64 d1, d1, M_\s10
+ vadd.u64 d2, d2, M_\s12
+ vadd.u64 d3, d3, M_\s14
+
+ // d = ror64(d ^ a, 32);
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vrev64.32 d15, d15
+ vrev64.32 d12, d12
+ vrev64.32 d13, d13
+ vrev64.32 d14, d14
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 24);
+ vld1.8 {M_0}, [ROR24_TABLE, :64]
+ veor d5, d5, d10
+ veor d6, d6, d11
+ veor d7, d7, d8
+ veor d4, d4, d9
+ vtbl.8 d5, {d5}, M_0
+ vtbl.8 d6, {d6}, M_0
+ vtbl.8 d7, {d7}, M_0
+ vtbl.8 d4, {d4}, M_0
+
+ // a += b + m[blake2b_sigma[r][2*i + 1]];
+.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
+ vld1.8 {M_0}, [sp, :64]
+.endif
+ vadd.u64 d0, d0, d5
+ vadd.u64 d1, d1, d6
+ vadd.u64 d2, d2, d7
+ vadd.u64 d3, d3, d4
+ vadd.u64 d0, d0, M_\s9
+ vadd.u64 d1, d1, M_\s11
+ vadd.u64 d2, d2, M_\s13
+ vadd.u64 d3, d3, M_\s15
+
+ // d = ror64(d ^ a, 16);
+ vld1.8 {M_0}, [ROR16_TABLE, :64]
+ veor d15, d15, d0
+ veor d12, d12, d1
+ veor d13, d13, d2
+ veor d14, d14, d3
+ vtbl.8 d12, {d12}, M_0
+ vtbl.8 d13, {d13}, M_0
+ vtbl.8 d14, {d14}, M_0
+ vtbl.8 d15, {d15}, M_0
+
+ // c += d;
+ vadd.u64 d10, d10, d15
+ vadd.u64 d11, d11, d12
+ vadd.u64 d8, d8, d13
+ vadd.u64 d9, d9, d14
+
+ // b = ror64(b ^ c, 63);
+ veor d16, d4, d9
+ veor d17, d5, d10
+ veor d18, d6, d11
+ veor d19, d7, d8
+ vshr.u64 q2, q8, #63
+ vshr.u64 q3, q9, #63
+ vsli.u64 q2, q8, #1
+ vsli.u64 q3, q9, #1
+ // Reloading q8-q9 can be skipped on the final round.
+.if ! \final
+ vld1.8 {q8-q9}, [sp, :256]
+.endif
+.endm
+
+//
+// void blake2b_compress_neon(struct blake2b_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2b_ctx are used:
+// u64 h[8]; (inout)
+// u64 t[2]; (inout)
+// u64 f[2]; (in)
+//
+ .align 5
+ENTRY(blake2b_compress_neon)
+ push {r4-r10}
+
+ // Allocate a 32-byte stack buffer that is 32-byte aligned.
+ mov ORIG_SP, sp
+ sub ip, sp, #32
+ bic ip, ip, #31
+ mov sp, ip
+
+ adr ROR24_TABLE, .Lror24_table
+ adr ROR16_TABLE, .Lror16_table
+
+ mov ip, CTX
+ vld1.64 {q0-q1}, [ip]! // Load h[0..3]
+ vld1.64 {q2-q3}, [ip]! // Load h[4..7]
+.Lnext_block:
+ adr r10, .Lblake2b_IV
+ vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
+ vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
+ vmov r7, r8, d28 // Copy t[0] to (r7, r8)
+ vld1.64 {q6-q7}, [r10] // Load IV[4..7]
+ adds r7, r7, INC // Increment counter
+ bcs .Lslow_inc_ctr
+ vmov.i32 d28[0], r7
+ vst1.64 {d28}, [ip] // Update t[0]
+.Linc_ctr_done:
+
+ // Load the next message block and finish initializing the state matrix
+ // 'v'. Fortunately, there are exactly enough NEON registers to fit the
+ // entire state matrix in q0-q7 and the entire message block in q8-15.
+ //
+ // However, _blake2b_round also needs some extra registers for rotates,
+ // so we have to spill some registers. It's better to spill the message
+ // registers than the state registers, as the message doesn't change.
+ // Therefore we store a copy of the first 32 bytes of the message block
+ // (q8-q9) in an aligned buffer on the stack so that they can be
+ // reloaded when needed. (We could just reload directly from the
+ // message buffer, but it's faster to use aligned loads.)
+ vld1.8 {q8-q9}, [DATA]!
+ veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
+ vld1.8 {q10-q11}, [DATA]!
+ veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
+ vld1.8 {q12-q13}, [DATA]!
+ vst1.8 {q8-q9}, [sp, :256]
+ mov ip, CTX
+ vld1.8 {q14-q15}, [DATA]!
+
+ // Execute the rounds. Each round is provided the order in which it
+ // needs to use the message words.
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+ _blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+ _blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+ _blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+ _blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+ _blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+ _blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+ _blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+ _blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+ _blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ _blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
+ final=1
+
+ // Fold the final state matrix into the hash chaining value:
+ //
+ // for (i = 0; i < 8; i++)
+ // h[i] ^= v[i] ^ v[i + 8];
+ //
+ vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
+ veor q0, q0, q4 // v[0..1] ^= v[8..9]
+ veor q1, q1, q5 // v[2..3] ^= v[10..11]
+ vld1.64 {q10-q11}, [ip] // Load old h[4..7]
+ veor q2, q2, q6 // v[4..5] ^= v[12..13]
+ veor q3, q3, q7 // v[6..7] ^= v[14..15]
+ veor q0, q0, q8 // v[0..1] ^= h[0..1]
+ veor q1, q1, q9 // v[2..3] ^= h[2..3]
+ mov ip, CTX
+ subs NBLOCKS, NBLOCKS, #1 // nblocks--
+ vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
+ veor q2, q2, q10 // v[4..5] ^= h[4..5]
+ veor q3, q3, q11 // v[6..7] ^= h[6..7]
+ vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
+
+ // Advance to the next block, if there is one.
+ bne .Lnext_block // nblocks != 0?
+
+ mov sp, ORIG_SP
+ pop {r4-r10}
+ mov pc, lr
+
+.Lslow_inc_ctr:
+ // Handle the case where the counter overflowed its low 32 bits, by
+ // carrying the overflow bit into the full 128-bit counter.
+ vmov r9, r10, d29
+ adcs r8, r8, #0
+ adcs r9, r9, #0
+ adc r10, r10, #0
+ vmov d28, r7, r8
+ vmov d29, r9, r10
+ vst1.64 {q14}, [ip] // Update t[0] and t[1]
+ b .Linc_ctr_done
+ENDPROC(blake2b_compress_neon)
diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h
new file mode 100644
index 000000000000..5c76498521e6
--- /dev/null
+++ b/lib/crypto/arm/blake2b.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2b digest algorithm, NEON accelerated
+ *
+ * Copyright 2020 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+
+static void blake2b_compress(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ if (!static_branch_likely(&have_neon) || !may_use_simd()) {
+ blake2b_compress_generic(ctx, data, nblocks, inc);
+ return;
+ }
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2B_BLOCK_SIZE);
+
+ scoped_ksimd()
+ blake2b_compress_neon(ctx, data, blocks, inc);
+
+ data += blocks * BLAKE2B_BLOCK_SIZE;
+ nblocks -= blocks;
+ } while (nblocks);
+}
+
+#define blake2b_mod_init_arch blake2b_mod_init_arch
+static void blake2b_mod_init_arch(void)
+{
+ if (elf_hwcap & HWCAP_NEON)
+ static_branch_enable(&have_neon);
+}
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
index 293f44fa8f31..933f0558b7cd 100644
--- a/lib/crypto/arm/blake2s-core.S
+++ b/lib/crypto/arm/blake2s-core.S
@@ -115,7 +115,7 @@
// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
+// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
// r14 are free to use. The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
@@ -170,10 +170,10 @@
.endm
//
-// void blake2s_compress(struct blake2s_state *state,
-// const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
//
-// Only the first three fields of struct blake2s_state are used:
+// Only the first three fields of struct blake2s_ctx are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
@@ -183,8 +183,8 @@ ENTRY(blake2s_compress)
push {r0-r2,r4-r11,lr} // keep this an even number
.Lnext_block:
- // r0 is 'state'
- // r1 is 'block'
+ // r0 is 'ctx'
+ // r1 is 'data'
// r3 is 'inc'
// Load and increment the counter t[0..1].
@@ -209,18 +209,18 @@ ENTRY(blake2s_compress)
.Lcopy_block_done:
str r1, [sp, #68] // Update message pointer
- // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
+ // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
// for spilling v[8..9]. Leave v[8..9] in r8-r9.
- mov r14, r0 // r14 = state
+ mov r14, r0 // r14 = ctx
adr r12, .Lblake2s_IV
ldmia r12!, {r8-r9} // load IV[0..1]
__ldrd r0, r1, r14, 40 // load f[0..1]
- ldm r12, {r2-r7} // load IV[3..7]
+ ldm r12, {r2-r7} // load IV[2..7]
eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
- push {r2-r7} // push v[9..15]
+ push {r2-r7} // push v[10..15]
sub sp, sp, #8 // leave space for v[8..9]
// Load h[0..7] == v[0..7].
@@ -275,7 +275,7 @@ ENTRY(blake2s_compress)
// Advance to the next block, if there is one. Note that if there are
// multiple blocks, then 'inc' (the counter increment amount) must be
// 64. So we can simply set it to 64 without re-loading it.
- ldm sp, {r0, r1, r2} // load (state, block, nblocks)
+ ldm sp, {r0, r1, r2} // load (ctx, data, nblocks)
mov r3, #64 // set 'inc'
subs r2, r2, #1 // nblocks--
str r2, [sp, #8]
diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h
index aa7a97139ea7..42c04440c191 100644
--- a/lib/crypto/arm/blake2s.h
+++ b/lib/crypto/arm/blake2s.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* defined in blake2s-core.S */
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, u32 inc);
+void blake2s_compress(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
diff --git a/lib/crypto/arm/chacha.h b/lib/crypto/arm/chacha.h
index 0cae30f8ee5d..836e49088e98 100644
--- a/lib/crypto/arm/chacha.h
+++ b/lib/crypto/arm/chacha.h
@@ -12,7 +12,6 @@
#include <asm/cputype.h>
#include <asm/hwcap.h>
-#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
@@ -68,9 +67,8 @@ static void hchacha_block_arch(const struct chacha_state *state,
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
hchacha_block_arm(state, out, nrounds);
} else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
+ scoped_ksimd()
+ hchacha_block_neon(state, out, nrounds);
}
}
@@ -87,9 +85,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
+ scoped_ksimd()
+ chacha_doneon(state, dst, src, todo, nrounds);
bytes -= todo;
src += todo;
diff --git a/lib/crypto/arm/curve25519.h b/lib/crypto/arm/curve25519.h
index f6d66494eb8f..b1a566885e95 100644
--- a/lib/crypto/arm/curve25519.h
+++ b/lib/crypto/arm/curve25519.h
@@ -25,9 +25,8 @@ static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
const u8 point[CURVE25519_KEY_SIZE])
{
if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
- kernel_neon_begin();
- curve25519_neon(out, scalar, point);
- kernel_neon_end();
+ scoped_ksimd()
+ curve25519_neon(out, scalar, point);
} else {
curve25519_generic(out, scalar, point);
}
diff --git a/lib/crypto/arm/poly1305.h b/lib/crypto/arm/poly1305.h
index 0021cf368307..0fe903d8de55 100644
--- a/lib/crypto/arm/poly1305.h
+++ b/lib/crypto/arm/poly1305.h
@@ -6,7 +6,6 @@
*/
#include <asm/hwcap.h>
-#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
#include <linux/jump_label.h>
@@ -32,9 +31,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
do {
unsigned int todo = min_t(unsigned int, len, SZ_4K);
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
+ scoped_ksimd()
+ poly1305_blocks_neon(state, src, todo, padbit);
len -= todo;
src += todo;
diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S
index 6edba3ab62e8..a0323fa5c58a 100644
--- a/lib/crypto/arm/sha1-armv7-neon.S
+++ b/lib/crypto/arm/sha1-armv7-neon.S
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+/* ARM/NEON accelerated SHA-1 transform function
*
* Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*/
diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S
index 2de40dd25e47..7d6b2631ca8d 100644
--- a/lib/crypto/arm/sha1-ce-core.S
+++ b/lib/crypto/arm/sha1-ce-core.S
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h
index 29f8bcad0447..3e2d8c7cab9f 100644
--- a/lib/crypto/arm/sha1.h
+++ b/lib/crypto/arm/sha1.h
@@ -4,7 +4,6 @@
*
* Copyright 2025 Google LLC
*/
-#include <asm/neon.h>
#include <asm/simd.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -22,12 +21,12 @@ static void sha1_blocks(struct sha1_block_state *state,
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_neon) && likely(may_use_simd())) {
- kernel_neon_begin();
- if (static_branch_likely(&have_ce))
- sha1_ce_transform(state, data, nblocks);
- else
- sha1_transform_neon(state, data, nblocks);
- kernel_neon_end();
+ scoped_ksimd() {
+ if (static_branch_likely(&have_ce))
+ sha1_ce_transform(state, data, nblocks);
+ else
+ sha1_transform_neon(state, data, nblocks);
+ }
} else {
sha1_block_data_order(state, data, nblocks);
}
diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S
index 7481ac8e6c0d..144ee805f64a 100644
--- a/lib/crypto/arm/sha256-ce.S
+++ b/lib/crypto/arm/sha256-ce.S
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ * SHA-224/256 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
diff --git a/lib/crypto/arm/sha256.h b/lib/crypto/arm/sha256.h
index 7556457b3094..ae7e52dd6e3b 100644
--- a/lib/crypto/arm/sha256.h
+++ b/lib/crypto/arm/sha256.h
@@ -22,12 +22,12 @@ static void sha256_blocks(struct sha256_block_state *state,
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_neon) && likely(may_use_simd())) {
- kernel_neon_begin();
- if (static_branch_likely(&have_ce))
- sha256_ce_transform(state, data, nblocks);
- else
- sha256_block_data_order_neon(state, data, nblocks);
- kernel_neon_end();
+ scoped_ksimd() {
+ if (static_branch_likely(&have_ce))
+ sha256_ce_transform(state, data, nblocks);
+ else
+ sha256_block_data_order_neon(state, data, nblocks);
+ }
} else {
sha256_block_data_order(state, data, nblocks);
}
diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h
index d1b485dd275d..ed9bd81d6d78 100644
--- a/lib/crypto/arm/sha512.h
+++ b/lib/crypto/arm/sha512.h
@@ -19,9 +19,8 @@ static void sha512_blocks(struct sha512_block_state *state,
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_neon) && likely(may_use_simd())) {
- kernel_neon_begin();
- sha512_block_data_order_neon(state, data, nblocks);
- kernel_neon_end();
+ scoped_ksimd()
+ sha512_block_data_order_neon(state, data, nblocks);
} else {
sha512_block_data_order(state, data, nblocks);
}
diff --git a/lib/crypto/arm64/chacha.h b/lib/crypto/arm64/chacha.h
index ba6c22d46086..ca8c6a8b0578 100644
--- a/lib/crypto/arm64/chacha.h
+++ b/lib/crypto/arm64/chacha.h
@@ -23,7 +23,6 @@
#include <linux/kernel.h>
#include <asm/hwcap.h>
-#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
@@ -65,9 +64,8 @@ static void hchacha_block_arch(const struct chacha_state *state,
if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
hchacha_block_generic(state, out, nrounds);
} else {
- kernel_neon_begin();
- hchacha_block_neon(state, out, nrounds);
- kernel_neon_end();
+ scoped_ksimd()
+ hchacha_block_neon(state, out, nrounds);
}
}
@@ -81,9 +79,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
- kernel_neon_begin();
- chacha_doneon(state, dst, src, todo, nrounds);
- kernel_neon_end();
+ scoped_ksimd()
+ chacha_doneon(state, dst, src, todo, nrounds);
bytes -= todo;
src += todo;
diff --git a/lib/crypto/arm64/poly1305.h b/lib/crypto/arm64/poly1305.h
index aed5921ccd9a..b77669767cd6 100644
--- a/lib/crypto/arm64/poly1305.h
+++ b/lib/crypto/arm64/poly1305.h
@@ -6,7 +6,6 @@
*/
#include <asm/hwcap.h>
-#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
#include <linux/jump_label.h>
@@ -31,9 +30,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
do {
unsigned int todo = min_t(unsigned int, len, SZ_4K);
- kernel_neon_begin();
- poly1305_blocks_neon(state, src, todo, padbit);
- kernel_neon_end();
+ scoped_ksimd()
+ poly1305_blocks_neon(state, src, todo, padbit);
len -= todo;
src += todo;
diff --git a/lib/crypto/arm64/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S
new file mode 100644
index 000000000000..7c731a044d02
--- /dev/null
+++ b/lib/crypto/arm64/polyval-ce-core.S
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Implementation of POLYVAL using ARMv8 Crypto Extensions.
+ *
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
+ * It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
+ * ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
+ * finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
+ */
+
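In rough C terms, one 8-block stride therefore looks like the sketch below. Here poly256, xor_elem(), poly_mul_acc() and montgomery_reduce() are illustrative stand-ins for the NEON macros that follow, not real kernel helpers:

struct poly256 { u64 w[4]; };	/* unreduced 256-bit polynomial sum */

static void polyval_stride_sketch(struct polyval_elem *acc,
				  const struct polyval_elem h_pow[8], /* h^8 .. h^1 */
				  struct polyval_elem m[8])
{
	struct poly256 p = { };
	int i;

	xor_elem(&m[0], acc);				/* fold acc into m_0 */
	for (i = 0; i < 8; i++)
		poly_mul_acc(&p, &h_pow[i], &m[i]);	/* p += h^(8-i) * m_i, plain polynomial product */
	montgomery_reduce(acc, &p);			/* a single reduction mod g(x) for all 8 products */
}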
+#include <linux/linkage.h>
+#define STRIDE_BLOCKS 8
+
+ACCUMULATOR .req x0
+KEY_POWERS .req x1
+MSG .req x2
+BLOCKS_LEFT .req x3
+KEY_START .req x10
+EXTRA_BYTES .req x11
+TMP .req x13
+
+M0 .req v0
+M1 .req v1
+M2 .req v2
+M3 .req v3
+M4 .req v4
+M5 .req v5
+M6 .req v6
+M7 .req v7
+KEY8 .req v8
+KEY7 .req v9
+KEY6 .req v10
+KEY5 .req v11
+KEY4 .req v12
+KEY3 .req v13
+KEY2 .req v14
+KEY1 .req v15
+PL .req v16
+PH .req v17
+TMP_V .req v18
+LO .req v20
+MI .req v21
+HI .req v22
+SUM .req v23
+GSTAR .req v24
+
+ .text
+
+ .arch armv8-a+crypto
+ .align 4
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+/*
+ * Computes the product of two 128-bit polynomials in X and Y and XORs the
+ * components of the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += (X_0 + X_1) * (Y_0 + Y_1)
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * Karatsuba multiplication is used instead of Schoolbook multiplication because
+ * it was found to be slightly faster on ARM64 CPUs.
+ *
+ */
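Writing X = X_1*x^64 + X_0 and Y = Y_1*x^64 + Y_0, the Karatsuba identity used here (over GF(2), where addition is XOR) is:

	LO = X_0 * Y_0
	HI = X_1 * Y_1
	MI = (X_0 + X_1) * (Y_0 + Y_1)
	X * Y = HI*x^128 + (LO + MI + HI)*x^64 + LO

since the cross terms X_0*Y_1 + X_1*Y_0 equal MI + LO + HI. Three 64x64 carry-less multiplies thus replace the four a schoolbook product would need.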
+.macro karatsuba1 X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 v28.1q, X.2d, Y.2d
+ pmull v29.1q, X.1d, Y.1d
+ pmull v27.1q, v25.1d, v26.1d
+ eor HI.16b, HI.16b, v28.16b
+ eor LO.16b, LO.16b, v29.16b
+ eor MI.16b, MI.16b, v27.16b
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
+ * them.
+ */
+.macro karatsuba1_store X Y
+ X .req \X
+ Y .req \Y
+ ext v25.16b, X.16b, X.16b, #8
+ ext v26.16b, Y.16b, Y.16b, #8
+ eor v25.16b, v25.16b, X.16b
+ eor v26.16b, v26.16b, Y.16b
+ pmull2 HI.1q, X.2d, Y.2d
+ pmull LO.1q, X.1d, Y.1d
+ pmull MI.1q, v25.1d, v26.1d
+ .unreq X
+ .unreq Y
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] =
+ * [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ */
+.macro karatsuba2
+ // v4 = [HI_1 + MI_1 : HI_0 + MI_0]
+ eor v4.16b, HI.16b, MI.16b
+ // v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
+ eor v4.16b, v4.16b, LO.16b
+ // v5 = [HI_0 : LO_1]
+ ext v5.16b, LO.16b, HI.16b, #8
+ // v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
+ eor v4.16b, v4.16b, v5.16b
+ // HI = [HI_0 : HI_1]
+ ext HI.16b, HI.16b, HI.16b, #8
+ // LO = [LO_0 : LO_1]
+ ext LO.16b, LO.16b, LO.16b, #8
+ // PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
+ ext PH.16b, v4.16b, HI.16b, #8
+ // PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
+ ext PL.16b, LO.16b, v4.16b, #8
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right
+ * shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
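A scalar sketch of the same reduction, where clmul64() is a hypothetical 64x64 -> 128-bit carry-less multiply standing in for pmull/pmull2, and p0..p3 are the 64-bit words of PL:PH, least significant first:

struct u128 { u64 lo, hi; };

static struct u128 polyval_reduce_sketch(u64 p0, u64 p1, u64 p2, u64 p3)
{
	const u64 gstar = 0xc200000000000000ULL;	/* x^63 + x^62 + x^57 */
	struct u128 t = clmul64(p0, gstar);		/* T = g*(x) * P_0 */
	u64 mid = p1 ^ t.lo;				/* P_1 + T_0 */
	struct u128 v = clmul64(mid, gstar);		/* V = g*(x) * (P_1 + T_0) */
	struct u128 r;

	r.lo = p2 ^ p0 ^ t.hi ^ v.lo;			/* P_2 + P_0 + T_1 + V_0 */
	r.hi = p3 ^ mid ^ v.hi;				/* P_3 + P_1 + T_0 + V_1 */
	return r;
}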
+.macro montgomery_reduction dest
+ DEST .req \dest
+ // TMP_V = T_1 : T_0 = P_0 * g*(x)
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ // TMP_V = T_0 : T_1
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ // TMP_V = P_1 + T_0 : P_0 + T_1
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ // PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ eor PH.16b, PH.16b, TMP_V.16b
+ // TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ eor DEST.16b, PH.16b, TMP_V.16b
+ .unreq DEST
+.endm
+
+/*
+ * Compute Polyval on 8 blocks.
+ *
+ * If reduce is set, also computes the Montgomery reduction of the
+ * previous full_stride call and XORs with the first message block.
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ *
+ * Sets PL, PH.
+ */
+.macro full_stride reduce
+ eor LO.16b, LO.16b, LO.16b
+ eor MI.16b, MI.16b, MI.16b
+ eor HI.16b, HI.16b, HI.16b
+
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
+
+ karatsuba1 M7 KEY1
+ .if \reduce
+ pmull TMP_V.1q, PL.1d, GSTAR.1d
+ .endif
+
+ karatsuba1 M6 KEY2
+ .if \reduce
+ ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
+ .endif
+
+ karatsuba1 M5 KEY3
+ .if \reduce
+ eor TMP_V.16b, PL.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M4 KEY4
+ .if \reduce
+ eor PH.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M3 KEY5
+ .if \reduce
+ pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
+ .endif
+
+ karatsuba1 M2 KEY6
+ .if \reduce
+ eor SUM.16b, PH.16b, TMP_V.16b
+ .endif
+
+ karatsuba1 M1 KEY7
+ eor M0.16b, M0.16b, SUM.16b
+
+ karatsuba1 M0 KEY8
+ karatsuba2
+.endm
+
+/*
+ * Handle any extra blocks after full_stride loop.
+ */
+.macro partial_stride
+ add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
+ sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
+ ld1 {KEY1.16b}, [KEY_POWERS], #16
+
+ ld1 {TMP_V.16b}, [MSG], #16
+ eor SUM.16b, SUM.16b, TMP_V.16b
+ karatsuba1_store KEY1 SUM
+ sub BLOCKS_LEFT, BLOCKS_LEFT, #1
+
+ tst BLOCKS_LEFT, #4
+ beq .Lpartial4BlocksDone
+ ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+ karatsuba1 M2 KEY6
+ karatsuba1 M3 KEY5
+.Lpartial4BlocksDone:
+ tst BLOCKS_LEFT, #2
+ beq .Lpartial2BlocksDone
+ ld1 {M0.16b, M1.16b}, [MSG], #32
+ ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
+ karatsuba1 M0 KEY8
+ karatsuba1 M1 KEY7
+.Lpartial2BlocksDone:
+ tst BLOCKS_LEFT, #1
+ beq .LpartialDone
+ ld1 {M0.16b}, [MSG], #16
+ ld1 {KEY8.16b}, [KEY_POWERS], #16
+ karatsuba1 M0 KEY8
+.LpartialDone:
+ karatsuba2
+ montgomery_reduction SUM
+.endm
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pmull(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pmull)
+ adr TMP, .Lgstar
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {v0.16b}, [x0]
+ ld1 {v1.16b}, [x1]
+ karatsuba1_store v0 v1
+ karatsuba2
+ montgomery_reduction SUM
+ st1 {SUM.16b}, [x0]
+ ret
+SYM_FUNC_END(polyval_mul_pmull)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * x0 - pointer to accumulator
+ * x1 - pointer to precomputed key powers h^8 ... h^1
+ * x2 - pointer to message blocks
+ * x3 - number of blocks to hash
+ *
+ * void polyval_blocks_pmull(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pmull)
+ adr TMP, .Lgstar
+ mov KEY_START, KEY_POWERS
+ ld1 {GSTAR.2d}, [TMP]
+ ld1 {SUM.16b}, [ACCUMULATOR]
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExit
+ ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
+ ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
+ full_stride 0
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ blt .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ bge .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
+ beq .LskipPartial
+ partial_stride
+.LskipPartial:
+ st1 {SUM.16b}, [ACCUMULATOR]
+ ret
+SYM_FUNC_END(polyval_blocks_pmull)
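The evaluation formula in the comment above is just a Horner-style loop. A scalar equivalent, for reference only: xor_block() is a hypothetical helper that XORs one 16-byte block into *acc, and polyval_mul_generic() is assumed to have the signature used elsewhere in this series:

static void polyval_blocks_ref(struct polyval_elem *acc,
			       const struct polyval_elem *h,
			       const u8 *data, size_t nblocks)
{
	for (size_t i = 0; i < nblocks; i++) {
		xor_block(acc, data + i * POLYVAL_BLOCK_SIZE);	/* acc += m_i */
		polyval_mul_generic(acc, h);			/* acc *= h */
	}
}

Unrolling the loop gives h^n*(acc + m_0) + h^(n-1)*m_1 + ... + h*m_{n-1}, which matches the expression documented for polyval_blocks_pmull().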
diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h
new file mode 100644
index 000000000000..a39763395e9b
--- /dev/null
+++ b/lib/crypto/arm64/polyval.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, arm64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
+
+asmlinkage void polyval_mul_pmull(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ scoped_ksimd() {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pmull(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ scoped_ksimd()
+ polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pmull) && may_use_simd()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ scoped_ksimd()
+ polyval_blocks_pmull(acc, key, data, n);
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(PMULL))
+ static_branch_enable(&have_pmull);
+}
diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S
index 21efbbafd7d6..8fbd4767f0f0 100644
--- a/lib/crypto/arm64/sha1-ce-core.S
+++ b/lib/crypto/arm64/sha1-ce-core.S
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
+ * SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
diff --git a/lib/crypto/arm64/sha1.h b/lib/crypto/arm64/sha1.h
index aaef4ebfc5e3..bc7071f1be09 100644
--- a/lib/crypto/arm64/sha1.h
+++ b/lib/crypto/arm64/sha1.h
@@ -4,7 +4,6 @@
*
* Copyright 2025 Google LLC
*/
-#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
@@ -20,9 +19,9 @@ static void sha1_blocks(struct sha1_block_state *state,
do {
size_t rem;
- kernel_neon_begin();
- rem = __sha1_ce_transform(state, data, nblocks);
- kernel_neon_end();
+ scoped_ksimd()
+ rem = __sha1_ce_transform(state, data, nblocks);
+
data += (nblocks - rem) * SHA1_BLOCK_SIZE;
nblocks = rem;
} while (nblocks);
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
index 410174ba5237..e4bfe42a61a9 100644
--- a/lib/crypto/arm64/sha256-ce.S
+++ b/lib/crypto/arm64/sha256-ce.S
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
+ * Core SHA-224/SHA-256 transform using v8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
diff --git a/lib/crypto/arm64/sha256.h b/lib/crypto/arm64/sha256.h
index 80d06df27d3a..568dff0f276a 100644
--- a/lib/crypto/arm64/sha256.h
+++ b/lib/crypto/arm64/sha256.h
@@ -4,7 +4,6 @@
*
* Copyright 2025 Google LLC
*/
-#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
@@ -27,17 +26,16 @@ static void sha256_blocks(struct sha256_block_state *state,
do {
size_t rem;
- kernel_neon_begin();
- rem = __sha256_ce_transform(state,
- data, nblocks);
- kernel_neon_end();
+ scoped_ksimd()
+ rem = __sha256_ce_transform(state, data,
+ nblocks);
+
data += (nblocks - rem) * SHA256_BLOCK_SIZE;
nblocks = rem;
} while (nblocks);
} else {
- kernel_neon_begin();
- sha256_block_neon(state, data, nblocks);
- kernel_neon_end();
+ scoped_ksimd()
+ sha256_block_neon(state, data, nblocks);
}
} else {
sha256_block_data_order(state, data, nblocks);
@@ -66,9 +64,8 @@ static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
len <= 65536 && likely(may_use_simd())) {
- kernel_neon_begin();
- sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
- kernel_neon_end();
+ scoped_ksimd()
+ sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
return true;
diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S
new file mode 100644
index 000000000000..ace90b506490
--- /dev/null
+++ b/lib/crypto/arm64/sha3-ce-core.S
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Core SHA-3 transform using v8.2 Crypto Extensions
+ *
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+ .set .Lv\b\().2d, \b
+ .set .Lv\b\().16b, \b
+ .endr
+
+ /*
+ * ARMv8.2 Crypto Extensions instructions
+ */
+ .macro eor3, rd, rn, rm, ra
+ .inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro rax1, rd, rn, rm
+ .inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
+ .endm
+
+ .macro bcax, rd, rn, rm, ra
+ .inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
+ .endm
+
+ .macro xar, rd, rn, rm, imm6
+ .inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
+ .endm
+
+ /*
+ * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ * size_t nblocks, size_t block_size)
+ *
+ * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
+ * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
+ */
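These are the standard Keccak rates: with a 200-byte (1600-bit) state, block_size = 200 - 2*d bytes, where d is the digest (or security-level) size in bytes. For example, SHA3-512: 200 - 2*64 = 72; SHA3-384: 200 - 2*48 = 104; SHA3-256 and SHAKE256: 200 - 2*32 = 136; SHA3-224: 200 - 2*28 = 144; SHAKE128: 200 - 2*16 = 168.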
+ .text
+SYM_FUNC_START(sha3_ce_transform)
+ /* load state */
+ add x8, x0, #32
+ ld1 { v0.1d- v3.1d}, [x0]
+ ld1 { v4.1d- v7.1d}, [x8], #32
+ ld1 { v8.1d-v11.1d}, [x8], #32
+ ld1 {v12.1d-v15.1d}, [x8], #32
+ ld1 {v16.1d-v19.1d}, [x8], #32
+ ld1 {v20.1d-v23.1d}, [x8], #32
+ ld1 {v24.1d}, [x8]
+
+0: sub x2, x2, #1
+ mov w8, #24
+ adr_l x9, .Lsha3_rcon
+
+ /* load input */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ ld1 {v29.8b}, [x1], #8
+ eor v0.8b, v0.8b, v25.8b
+ eor v1.8b, v1.8b, v26.8b
+ eor v2.8b, v2.8b, v27.8b
+ eor v3.8b, v3.8b, v28.8b
+ eor v4.8b, v4.8b, v29.8b
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v5.8b, v5.8b, v25.8b
+ eor v6.8b, v6.8b, v26.8b
+ eor v7.8b, v7.8b, v27.8b
+ eor v8.8b, v8.8b, v28.8b
+ cmp x3, #72
+ b.eq 3f /* SHA3-512 (block_size=72)? */
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v9.8b, v9.8b, v25.8b
+ eor v10.8b, v10.8b, v26.8b
+ eor v11.8b, v11.8b, v27.8b
+ eor v12.8b, v12.8b, v28.8b
+ cmp x3, #104
+ b.eq 3f /* SHA3-384 (block_size=104)? */
+
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v13.8b, v13.8b, v25.8b
+ eor v14.8b, v14.8b, v26.8b
+ eor v15.8b, v15.8b, v27.8b
+ eor v16.8b, v16.8b, v28.8b
+ cmp x3, #144
+ b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */
+ b.eq 2f /* SHA3-224 (block_size=144)? */
+
+ /* SHAKE128 (block_size=168) */
+ ld1 {v25.8b-v28.8b}, [x1], #32
+ eor v17.8b, v17.8b, v25.8b
+ eor v18.8b, v18.8b, v26.8b
+ eor v19.8b, v19.8b, v27.8b
+ eor v20.8b, v20.8b, v28.8b
+ b 3f
+2:
+ /* SHA3-224 (block_size=144) */
+ ld1 {v25.8b}, [x1], #8
+ eor v17.8b, v17.8b, v25.8b
+
+3: sub w8, w8, #1
+
+ eor3 v29.16b, v4.16b, v9.16b, v14.16b
+ eor3 v26.16b, v1.16b, v6.16b, v11.16b
+ eor3 v28.16b, v3.16b, v8.16b, v13.16b
+ eor3 v25.16b, v0.16b, v5.16b, v10.16b
+ eor3 v27.16b, v2.16b, v7.16b, v12.16b
+ eor3 v29.16b, v29.16b, v19.16b, v24.16b
+ eor3 v26.16b, v26.16b, v16.16b, v21.16b
+ eor3 v28.16b, v28.16b, v18.16b, v23.16b
+ eor3 v25.16b, v25.16b, v15.16b, v20.16b
+ eor3 v27.16b, v27.16b, v17.16b, v22.16b
+
+ rax1 v30.2d, v29.2d, v26.2d // bc[0]
+ rax1 v26.2d, v26.2d, v28.2d // bc[2]
+ rax1 v28.2d, v28.2d, v25.2d // bc[4]
+ rax1 v25.2d, v25.2d, v27.2d // bc[1]
+ rax1 v27.2d, v27.2d, v29.2d // bc[3]
+
+ eor v0.16b, v0.16b, v30.16b
+ xar v29.2d, v1.2d, v25.2d, (64 - 1)
+ xar v1.2d, v6.2d, v25.2d, (64 - 44)
+ xar v6.2d, v9.2d, v28.2d, (64 - 20)
+ xar v9.2d, v22.2d, v26.2d, (64 - 61)
+ xar v22.2d, v14.2d, v28.2d, (64 - 39)
+ xar v14.2d, v20.2d, v30.2d, (64 - 18)
+ xar v31.2d, v2.2d, v26.2d, (64 - 62)
+ xar v2.2d, v12.2d, v26.2d, (64 - 43)
+ xar v12.2d, v13.2d, v27.2d, (64 - 25)
+ xar v13.2d, v19.2d, v28.2d, (64 - 8)
+ xar v19.2d, v23.2d, v27.2d, (64 - 56)
+ xar v23.2d, v15.2d, v30.2d, (64 - 41)
+ xar v15.2d, v4.2d, v28.2d, (64 - 27)
+ xar v28.2d, v24.2d, v28.2d, (64 - 14)
+ xar v24.2d, v21.2d, v25.2d, (64 - 2)
+ xar v8.2d, v8.2d, v27.2d, (64 - 55)
+ xar v4.2d, v16.2d, v25.2d, (64 - 45)
+ xar v16.2d, v5.2d, v30.2d, (64 - 36)
+ xar v5.2d, v3.2d, v27.2d, (64 - 28)
+ xar v27.2d, v18.2d, v27.2d, (64 - 21)
+ xar v3.2d, v17.2d, v26.2d, (64 - 15)
+ xar v25.2d, v11.2d, v25.2d, (64 - 10)
+ xar v26.2d, v7.2d, v26.2d, (64 - 6)
+ xar v30.2d, v10.2d, v30.2d, (64 - 3)
+
+ bcax v20.16b, v31.16b, v22.16b, v8.16b
+ bcax v21.16b, v8.16b, v23.16b, v22.16b
+ bcax v22.16b, v22.16b, v24.16b, v23.16b
+ bcax v23.16b, v23.16b, v31.16b, v24.16b
+ bcax v24.16b, v24.16b, v8.16b, v31.16b
+
+ ld1r {v31.2d}, [x9], #8
+
+ bcax v17.16b, v25.16b, v19.16b, v3.16b
+ bcax v18.16b, v3.16b, v15.16b, v19.16b
+ bcax v19.16b, v19.16b, v16.16b, v15.16b
+ bcax v15.16b, v15.16b, v25.16b, v16.16b
+ bcax v16.16b, v16.16b, v3.16b, v25.16b
+
+ bcax v10.16b, v29.16b, v12.16b, v26.16b
+ bcax v11.16b, v26.16b, v13.16b, v12.16b
+ bcax v12.16b, v12.16b, v14.16b, v13.16b
+ bcax v13.16b, v13.16b, v29.16b, v14.16b
+ bcax v14.16b, v14.16b, v26.16b, v29.16b
+
+ bcax v7.16b, v30.16b, v9.16b, v4.16b
+ bcax v8.16b, v4.16b, v5.16b, v9.16b
+ bcax v9.16b, v9.16b, v6.16b, v5.16b
+ bcax v5.16b, v5.16b, v30.16b, v6.16b
+ bcax v6.16b, v6.16b, v4.16b, v30.16b
+
+ bcax v3.16b, v27.16b, v0.16b, v28.16b
+ bcax v4.16b, v28.16b, v1.16b, v0.16b
+ bcax v0.16b, v0.16b, v2.16b, v1.16b
+ bcax v1.16b, v1.16b, v27.16b, v2.16b
+ bcax v2.16b, v2.16b, v28.16b, v27.16b
+
+ eor v0.16b, v0.16b, v31.16b
+
+ cbnz w8, 3b
+ cond_yield 4f, x8, x9
+ cbnz x2, 0b
+
+ /* save state */
+4: st1 { v0.1d- v3.1d}, [x0], #32
+ st1 { v4.1d- v7.1d}, [x0], #32
+ st1 { v8.1d-v11.1d}, [x0], #32
+ st1 {v12.1d-v15.1d}, [x0], #32
+ st1 {v16.1d-v19.1d}, [x0], #32
+ st1 {v20.1d-v23.1d}, [x0], #32
+ st1 {v24.1d}, [x0]
+ mov x0, x2
+ ret
+SYM_FUNC_END(sha3_ce_transform)
+
+ .section ".rodata", "a"
+ .align 8
+.Lsha3_rcon:
+ .quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
+ .quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
+ .quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
+ .quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
+ .quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
+ .quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
+ .quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
+ .quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h
new file mode 100644
index 000000000000..b602f1b3b282
--- /dev/null
+++ b/lib/crypto/arm64/sha3.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/simd.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+
+asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+ do {
+ size_t rem;
+
+ scoped_ksimd()
+ rem = sha3_ce_transform(state, data, nblocks,
+ block_size);
+ data += (nblocks - rem) * block_size;
+ nblocks = rem;
+ } while (nblocks);
+ } else {
+ sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+ }
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+ if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
+ /*
+ * Passing zeroes into sha3_ce_transform() gives the plain
+ * Keccak-f permutation, which is what we want here. Any
+ * supported block size may be used. Use SHA3_512_BLOCK_SIZE
+ * since it's the shortest.
+ */
+ static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+ scoped_ksimd()
+ sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
+ } else {
+ sha3_keccakf_generic(state);
+ }
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+ if (cpu_have_named_feature(SHA3))
+ static_branch_enable(&have_sha3);
+}
diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S
index 22f1ded89bc8..ffd51acfd1ee 100644
--- a/lib/crypto/arm64/sha512-ce-core.S
+++ b/lib/crypto/arm64/sha512-ce-core.S
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
+ * Core SHA-384/SHA-512 transform using v8 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
diff --git a/lib/crypto/arm64/sha512.h b/lib/crypto/arm64/sha512.h
index ddb0d256f73a..7eb7ef04d268 100644
--- a/lib/crypto/arm64/sha512.h
+++ b/lib/crypto/arm64/sha512.h
@@ -4,7 +4,7 @@
*
* Copyright 2025 Google LLC
*/
-#include <asm/neon.h>
+
#include <asm/simd.h>
#include <linux/cpufeature.h>
@@ -24,9 +24,9 @@ static void sha512_blocks(struct sha512_block_state *state,
do {
size_t rem;
- kernel_neon_begin();
- rem = __sha512_ce_transform(state, data, nblocks);
- kernel_neon_end();
+ scoped_ksimd()
+ rem = __sha512_ce_transform(state, data, nblocks);
+
data += (nblocks - rem) * SHA512_BLOCK_SIZE;
nblocks = rem;
} while (nblocks);
diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c
new file mode 100644
index 000000000000..09c6d65d8a6e
--- /dev/null
+++ b/lib/crypto/blake2b.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright 2025 Google LLC
+ *
+ * This is an implementation of the BLAKE2b hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ */
+
+#include <crypto/blake2b.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+static const u8 blake2b_sigma[12][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+};
+
+static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc)
+{
+ ctx->t[0] += inc;
+ ctx->t[1] += (ctx->t[0] < inc);
+}
+
+static void __maybe_unused
+blake2b_compress_generic(struct blake2b_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
+{
+ u64 m[16];
+ u64 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2b_increment_counter(ctx, inc);
+ memcpy(m, data, BLAKE2B_BLOCK_SIZE);
+ le64_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, ctx->h, 64);
+ v[ 8] = BLAKE2B_IV0;
+ v[ 9] = BLAKE2B_IV1;
+ v[10] = BLAKE2B_IV2;
+ v[11] = BLAKE2B_IV3;
+ v[12] = BLAKE2B_IV4 ^ ctx->t[0];
+ v[13] = BLAKE2B_IV5 ^ ctx->t[1];
+ v[14] = BLAKE2B_IV6 ^ ctx->f[0];
+ v[15] = BLAKE2B_IV7 ^ ctx->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2b_sigma[r][2 * i + 0]]; \
+ d = ror64(d ^ a, 32); \
+ c += d; \
+ b = ror64(b ^ c, 24); \
+ a += b + m[blake2b_sigma[r][2 * i + 1]]; \
+ d = ror64(d ^ a, 16); \
+ c += d; \
+ b = ror64(b ^ c, 63); \
+} while (0)
+
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+ ROUND(10);
+ ROUND(11);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ ctx->h[i] ^= v[i] ^ v[i + 8];
+
+ data += BLAKE2B_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
+#include "blake2b.h" /* $(SRCARCH)/blake2b.h */
+#else
+#define blake2b_compress blake2b_compress_generic
+#endif
+
+static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx)
+{
+ ctx->f[0] = -1;
+}
+
+void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen;
+
+ if (unlikely(!inlen))
+ return;
+ if (inlen > fill) {
+ memcpy(ctx->buf + ctx->buflen, in, fill);
+ blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE);
+ ctx->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2B_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE);
+
+ blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE);
+ in += BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(ctx->buf + ctx->buflen, in, inlen);
+ ctx->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2b_update);
+
+void blake2b_final(struct blake2b_ctx *ctx, u8 *out)
+{
+ WARN_ON(IS_ENABLED(DEBUG) && !out);
+ blake2b_set_lastblock(ctx);
+ memset(ctx->buf + ctx->buflen, 0,
+ BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */
+ blake2b_compress(ctx, ctx->buf, 1, ctx->buflen);
+ cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h));
+ memcpy(out, ctx->h, ctx->outlen);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL(blake2b_final);
+
+#ifdef blake2b_mod_init_arch
+static int __init blake2b_mod_init(void)
+{
+ blake2b_mod_init_arch();
+ return 0;
+}
+subsys_initcall(blake2b_mod_init);
+
+static void __exit blake2b_mod_exit(void)
+{
+}
+module_exit(blake2b_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("BLAKE2b hash function");
+MODULE_LICENSE("GPL");
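For orientation, the calling convention exported by this new file is the same one exercised by blake2b_kunit.c further down in this diff. A minimal usage sketch follows; the wrapper function name and buffer handling are illustrative only, not part of the patch:

/* Sketch: one-shot and incremental use of the BLAKE2b library API. */
#include <crypto/blake2b.h>

static void blake2b_usage_sketch(const u8 *msg, size_t msg_len,
				 const u8 *key, size_t key_len)
{
	struct blake2b_ctx ctx;
	u8 digest[BLAKE2B_HASH_SIZE];

	/* One-shot, unkeyed, full-length digest. */
	blake2b(NULL, 0, msg, msg_len, digest, sizeof(digest));

	/* Incremental, keyed; key_len may range from 0 to BLAKE2B_KEY_SIZE. */
	blake2b_init_key(&ctx, sizeof(digest), key, key_len);
	blake2b_update(&ctx, msg, msg_len);
	blake2b_final(&ctx, digest);	/* final also zeroizes the context */
}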
diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
index 5638ed9d882d..6182c21ed943 100644
--- a/lib/crypto/blake2s.c
+++ b/lib/crypto/blake2s.c
@@ -29,16 +29,15 @@ static const u8 blake2s_sigma[10][16] = {
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
};
-static inline void blake2s_increment_counter(struct blake2s_state *state,
- const u32 inc)
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
{
- state->t[0] += inc;
- state->t[1] += (state->t[0] < inc);
+ ctx->t[0] += inc;
+ ctx->t[1] += (ctx->t[0] < inc);
}
static void __maybe_unused
-blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+blake2s_compress_generic(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
{
u32 m[16];
u32 v[16];
@@ -48,18 +47,18 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
while (nblocks > 0) {
- blake2s_increment_counter(state, inc);
- memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+ blake2s_increment_counter(ctx, inc);
+ memcpy(m, data, BLAKE2S_BLOCK_SIZE);
le32_to_cpu_array(m, ARRAY_SIZE(m));
- memcpy(v, state->h, 32);
+ memcpy(v, ctx->h, 32);
v[ 8] = BLAKE2S_IV0;
v[ 9] = BLAKE2S_IV1;
v[10] = BLAKE2S_IV2;
v[11] = BLAKE2S_IV3;
- v[12] = BLAKE2S_IV4 ^ state->t[0];
- v[13] = BLAKE2S_IV5 ^ state->t[1];
- v[14] = BLAKE2S_IV6 ^ state->f[0];
- v[15] = BLAKE2S_IV7 ^ state->f[1];
+ v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+ v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+ v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+ v[15] = BLAKE2S_IV7 ^ ctx->f[1];
#define G(r, i, a, b, c, d) do { \
a += b + m[blake2s_sigma[r][2 * i + 0]]; \
@@ -97,9 +96,9 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
#undef ROUND
for (i = 0; i < 8; ++i)
- state->h[i] ^= v[i] ^ v[i + 8];
+ ctx->h[i] ^= v[i] ^ v[i + 8];
- block += BLAKE2S_BLOCK_SIZE;
+ data += BLAKE2S_BLOCK_SIZE;
--nblocks;
}
}
@@ -110,45 +109,46 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
#define blake2s_compress blake2s_compress_generic
#endif
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
{
- state->f[0] = -1;
+ ctx->f[0] = -1;
}
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
{
- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+ const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
if (unlikely(!inlen))
return;
if (inlen > fill) {
- memcpy(state->buf + state->buflen, in, fill);
- blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
- state->buflen = 0;
+ memcpy(ctx->buf + ctx->buflen, in, fill);
+ blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+ ctx->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
- blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+
+ blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
- memcpy(state->buf + state->buflen, in, inlen);
- state->buflen += inlen;
+ memcpy(ctx->buf + ctx->buflen, in, inlen);
+ ctx->buflen += inlen;
}
EXPORT_SYMBOL(blake2s_update);
-void blake2s_final(struct blake2s_state *state, u8 *out)
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
{
WARN_ON(IS_ENABLED(DEBUG) && !out);
- blake2s_set_lastblock(state);
- memset(state->buf + state->buflen, 0,
- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
- blake2s_compress(state, state->buf, 1, state->buflen);
- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
- memcpy(out, state->h, state->outlen);
- memzero_explicit(state, sizeof(*state));
+ blake2s_set_lastblock(ctx);
+ memset(ctx->buf + ctx->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+ blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+ cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+ memcpy(out, ctx->h, ctx->outlen);
+ memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(blake2s_final);
diff --git a/lib/crypto/chacha20poly1305.c b/lib/crypto/chacha20poly1305.c
index 0b49d6aedefd..212ce33562af 100644
--- a/lib/crypto/chacha20poly1305.c
+++ b/lib/crypto/chacha20poly1305.c
@@ -89,7 +89,7 @@ __chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
u32 k[CHACHA_KEY_WORDS];
@@ -111,8 +111,8 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt);
void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
@@ -170,7 +170,7 @@ __chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
bool chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
u32 k[CHACHA_KEY_WORDS];
@@ -195,8 +195,8 @@ EXPORT_SYMBOL(chacha20poly1305_decrypt);
bool xchacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len,
const u8 *ad, const size_t ad_len,
- const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE],
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
struct chacha_state chacha_state;
@@ -211,7 +211,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
const size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE],
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE],
int encrypt)
{
const u8 *pad0 = page_address(ZERO_PAGE(0));
@@ -335,7 +335,7 @@ bool chacha20poly1305_crypt_sg_inplace(struct scatterlist *src,
bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
return chacha20poly1305_crypt_sg_inplace(src, src_len, ad, ad_len,
nonce, key, 1);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(chacha20poly1305_encrypt_sg_inplace);
bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t src_len,
const u8 *ad, const size_t ad_len,
const u64 nonce,
- const u8 key[CHACHA20POLY1305_KEY_SIZE])
+ const u8 key[at_least CHACHA20POLY1305_KEY_SIZE])
{
if (unlikely(src_len < POLY1305_DIGEST_SIZE))
return false;
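For context on the annotated key parameter, the call pattern itself is unchanged: encryption appends a 16-byte Poly1305 tag to the ciphertext and decryption verifies it before returning. Below is a minimal round-trip sketch, assuming the CHACHA20POLY1305_AUTHTAG_SIZE constant from <crypto/chacha20poly1305.h> and illustrative buffer names:

/* Sketch: seal a buffer and open it again (decrypting in place). */
#include <crypto/chacha20poly1305.h>

static bool chacha20poly1305_roundtrip_sketch(u8 *ct, const u8 *pt, size_t pt_len,
					      const u8 *ad, size_t ad_len, u64 nonce,
					      const u8 key[CHACHA20POLY1305_KEY_SIZE])
{
	/* ct must have room for pt_len + CHACHA20POLY1305_AUTHTAG_SIZE bytes. */
	chacha20poly1305_encrypt(ct, pt, pt_len, ad, ad_len, nonce, key);

	/* Returns false if the authentication tag does not verify. */
	return chacha20poly1305_decrypt(ct, ct,
					pt_len + CHACHA20POLY1305_AUTHTAG_SIZE,
					ad, ad_len, nonce, key);
}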
diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h
new file mode 100644
index 000000000000..023410c2e0db
--- /dev/null
+++ b/lib/crypto/fips.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: gen-fips-testvecs.py */
+
+#include <linux/fips.h>
+
+static const u8 fips_test_data[] __initconst __maybe_unused = {
+ 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
+};
+
+static const u8 fips_test_key[] __initconst __maybe_unused = {
+ 0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+ 0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00,
+};
+
+static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = {
+ 0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b,
+ 0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55,
+ 0x99, 0xbf, 0x86, 0x78,
+};
+
+static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = {
+ 0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f,
+ 0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43,
+ 0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf,
+ 0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93,
+};
+
+static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = {
+ 0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea,
+ 0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8,
+ 0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02,
+ 0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d,
+ 0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98,
+ 0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51,
+ 0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81,
+ 0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b,
+};
+
+static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = {
+ 0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1,
+ 0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31,
+ 0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00,
+ 0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82,
+};
diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c
new file mode 100644
index 000000000000..5796275f574a
--- /dev/null
+++ b/lib/crypto/polyval.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * POLYVAL library functions
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <crypto/polyval.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+
+/*
+ * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
+ * interprets the message as the coefficients of a polynomial in GF(2^128) and
+ * evaluates that polynomial at a secret point. POLYVAL has a simple
+ * mathematical relationship with GHASH, but it uses a better field convention
+ * which makes it easier and faster to implement.
+ *
+ * POLYVAL is not a cryptographic hash function, and it should be used only by
+ * algorithms that are specifically designed to use it.
+ *
+ * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
+ * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452).
+ *
+ * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
+ * (https://eprint.iacr.org/2021/1441.pdf).
+ *
+ * This file provides a library API for POLYVAL. This API can delegate to
+ * either a generic implementation or an architecture-optimized implementation.
+ *
+ * For the generic implementation, we don't use the traditional table approach
+ * to GF(2^128) multiplication. That approach is not constant-time and requires
+ * a lot of memory. Instead, we use a different approach that emulates
+ * carryless multiplication with standard multiplications, spreading the data
+ * bits apart with "holes" so that the carries spill harmlessly. This
+ * approach is borrowed from BoringSSL, which in turn credits BearSSL's
+ * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
+ * "holes" trick and a presentation by Shay Gueron
+ * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
+ * 256-bit => 128-bit reduction algorithm.
+ */
+
+#ifdef CONFIG_ARCH_SUPPORTS_INT128
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+ /*
+ * With 64-bit multiplicands and one term every 4 bits, there would be
+ * up to 64 / 4 = 16 one bits per column when each multiplication is
+ * written out as a series of additions in the schoolbook manner.
+ * Unfortunately, that doesn't work since the value 16 is 1 too large to
+ * fit in 4 bits. Carries would sometimes overflow into the next term.
+ *
+ * Using one term every 5 bits would work. However, that would cost
+ * 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
+ *
+ * Instead, mask off 4 bits from one multiplicand, giving a max of 15
+ * one bits per column. Then handle those 4 bits separately.
+ */
+ u64 a0 = a & 0x1111111111111110;
+ u64 a1 = a & 0x2222222222222220;
+ u64 a2 = a & 0x4444444444444440;
+ u64 a3 = a & 0x8888888888888880;
+
+ u64 b0 = b & 0x1111111111111111;
+ u64 b1 = b & 0x2222222222222222;
+ u64 b2 = b & 0x4444444444444444;
+ u64 b3 = b & 0x8888888888888888;
+
+ /* Multiply the high 60 bits of @a by @b. */
+ u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
+ (a2 * (u128)b2) ^ (a3 * (u128)b1);
+ u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
+ (a2 * (u128)b3) ^ (a3 * (u128)b2);
+ u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
+ (a2 * (u128)b0) ^ (a3 * (u128)b3);
+ u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
+ (a2 * (u128)b1) ^ (a3 * (u128)b0);
+
+ /* Multiply the low 4 bits of @a by @b. */
+ u64 e0 = -(a & 1) & b;
+ u64 e1 = -((a >> 1) & 1) & b;
+ u64 e2 = -((a >> 2) & 1) & b;
+ u64 e3 = -((a >> 3) & 1) & b;
+ u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+ u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+ /* Add all the intermediate products together. */
+ *out_lo = (((u64)c0) & 0x1111111111111111) ^
+ (((u64)c1) & 0x2222222222222222) ^
+ (((u64)c2) & 0x4444444444444444) ^
+ (((u64)c3) & 0x8888888888888888) ^ extra_lo;
+ *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
+ (((u64)(c1 >> 64)) & 0x2222222222222222) ^
+ (((u64)(c2 >> 64)) & 0x4444444444444444) ^
+ (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+}
+
+#else /* CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Do a 32 x 32 => 64 bit carryless multiplication. */
+static u64 clmul32(u32 a, u32 b)
+{
+ /*
+ * With 32-bit multiplicands and one term every 4 bits, there are up to
+ * 32 / 4 = 8 one bits per column when each multiplication is written
+ * out as a series of additions in the schoolbook manner. The value 8
+ * fits in 4 bits, so the carries don't overflow into the next term.
+ */
+ u32 a0 = a & 0x11111111;
+ u32 a1 = a & 0x22222222;
+ u32 a2 = a & 0x44444444;
+ u32 a3 = a & 0x88888888;
+
+ u32 b0 = b & 0x11111111;
+ u32 b1 = b & 0x22222222;
+ u32 b2 = b & 0x44444444;
+ u32 b3 = b & 0x88888888;
+
+ u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
+ (a2 * (u64)b2) ^ (a3 * (u64)b1);
+ u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
+ (a2 * (u64)b3) ^ (a3 * (u64)b2);
+ u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
+ (a2 * (u64)b0) ^ (a3 * (u64)b3);
+ u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
+ (a2 * (u64)b1) ^ (a3 * (u64)b0);
+
+ /* Add all the intermediate products together. */
+ return (c0 & 0x1111111111111111) ^
+ (c1 & 0x2222222222222222) ^
+ (c2 & 0x4444444444444444) ^
+ (c3 & 0x8888888888888888);
+}
+
+/* Do a 64 x 64 => 128 bit carryless multiplication. */
+static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
+{
+ u32 a_lo = (u32)a;
+ u32 a_hi = a >> 32;
+ u32 b_lo = (u32)b;
+ u32 b_hi = b >> 32;
+
+ /* Karatsuba multiplication */
+ u64 lo = clmul32(a_lo, b_lo);
+ u64 hi = clmul32(a_hi, b_hi);
+ u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
+
+ *out_lo = lo ^ (mi << 32);
+ *out_hi = hi ^ (mi >> 32);
+}
+#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
+
+/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
+static void __maybe_unused
+polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
+{
+ u64 c0, c1, c2, c3, mi0, mi1;
+
+ /*
+ * Carryless-multiply @a by @b using Karatsuba multiplication. Store
+ * the 256-bit product in @c0 (low) through @c3 (high).
+ */
+ clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
+ clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
+ clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
+ &mi0, &mi1);
+ mi0 ^= c0 ^ c2;
+ mi1 ^= c1 ^ c3;
+ c1 ^= mi0;
+ c2 ^= mi1;
+
+ /*
+ * Cancel out the low 128 bits of the product by adding multiples of
+ * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
+ * of which cancels out 64 bits. Note that we break G(x) into three
+ * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
+ */
+
+ /*
+ * First, add G(x) times c0 as follows:
+ *
+ * (c0, c1, c2) = (0,
+ * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
+ * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
+ */
+ c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
+ c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
+
+ /*
+ * Second, add G(x) times the new c1:
+ *
+ * (c1, c2, c3) = (0,
+ * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
+ * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
+ */
+ c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
+ c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
+
+ /* Return (c2, c3). This implicitly multiplies by x^-128. */
+ a->lo = cpu_to_le64(c2);
+ a->hi = cpu_to_le64(c3);
+}
+
+static void __maybe_unused
+polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
+ const u8 *data, size_t nblocks)
+{
+ do {
+ acc->lo ^= get_unaligned((__le64 *)data);
+ acc->hi ^= get_unaligned((__le64 *)(data + 8));
+ polyval_mul_generic(acc, key);
+ data += POLYVAL_BLOCK_SIZE;
+ } while (--nblocks);
+}
+
+/* Include the arch-optimized implementation of POLYVAL, if one is available. */
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+#include "polyval.h" /* $(SRCARCH)/polyval.h */
+void polyval_preparekey(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ polyval_preparekey_arch(key, raw_key);
+}
+EXPORT_SYMBOL_GPL(polyval_preparekey);
+#endif /* Else, polyval_preparekey() is an inline function. */
+
+/*
+ * polyval_mul_generic() and polyval_blocks_generic() take the key as a
+ * polyval_elem rather than a polyval_key, so that arch-optimized
+ * implementations with a different key format can use it as a fallback (if they
+ * have H^1 stored somewhere in their struct). Thus, the following dispatch
+ * code is needed to pass the appropriate key argument.
+ */
+
+static void polyval_mul(struct polyval_ctx *ctx)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+ polyval_mul_arch(&ctx->acc, ctx->key);
+#else
+ polyval_mul_generic(&ctx->acc, &ctx->key->h);
+#endif
+}
+
+static void polyval_blocks(struct polyval_ctx *ctx,
+ const u8 *data, size_t nblocks)
+{
+#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
+ polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
+#else
+ polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
+#endif
+}
+
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
+{
+ if (unlikely(ctx->partial)) {
+ size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
+
+ len -= n;
+ while (n--)
+ ctx->acc.bytes[ctx->partial++] ^= *data++;
+ if (ctx->partial < POLYVAL_BLOCK_SIZE)
+ return;
+ polyval_mul(ctx);
+ }
+ if (len >= POLYVAL_BLOCK_SIZE) {
+ size_t nblocks = len / POLYVAL_BLOCK_SIZE;
+
+ polyval_blocks(ctx, data, nblocks);
+ data += len & ~(POLYVAL_BLOCK_SIZE - 1);
+ len &= POLYVAL_BLOCK_SIZE - 1;
+ }
+ for (size_t i = 0; i < len; i++)
+ ctx->acc.bytes[i] ^= data[i];
+ ctx->partial = len;
+}
+EXPORT_SYMBOL_GPL(polyval_update);
+
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
+{
+ if (unlikely(ctx->partial))
+ polyval_mul(ctx);
+ memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
+ memzero_explicit(ctx, sizeof(*ctx));
+}
+EXPORT_SYMBOL_GPL(polyval_final);
+
+#ifdef polyval_mod_init_arch
+static int __init polyval_mod_init(void)
+{
+ polyval_mod_init_arch();
+ return 0;
+}
+subsys_initcall(polyval_mod_init);
+
+static void __exit polyval_mod_exit(void)
+{
+}
+module_exit(polyval_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
+MODULE_LICENSE("GPL");
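The correctness of the masking scheme in clmul32()/clmul64() is easiest to see by comparing it against the schoolbook definition of a carryless multiply. The following standalone userspace program (an illustration, not part of the patch) reproduces the 32-bit variant from this file and checks it against a bit-by-bit reference:

/* Standalone illustration; builds with any C99 compiler. */
#include <stdint.h>
#include <stdio.h>

/* Same "holes" scheme as clmul32() in polyval.c. */
static uint64_t clmul32_holes(uint32_t a, uint32_t b)
{
	uint32_t a0 = a & 0x11111111, a1 = a & 0x22222222;
	uint32_t a2 = a & 0x44444444, a3 = a & 0x88888888;
	uint32_t b0 = b & 0x11111111, b1 = b & 0x22222222;
	uint32_t b2 = b & 0x44444444, b3 = b & 0x88888888;

	/* Each column holds at most 8 one bits, so carries stay in the holes. */
	uint64_t c0 = ((uint64_t)a0 * b0) ^ ((uint64_t)a1 * b3) ^
		      ((uint64_t)a2 * b2) ^ ((uint64_t)a3 * b1);
	uint64_t c1 = ((uint64_t)a0 * b1) ^ ((uint64_t)a1 * b0) ^
		      ((uint64_t)a2 * b3) ^ ((uint64_t)a3 * b2);
	uint64_t c2 = ((uint64_t)a0 * b2) ^ ((uint64_t)a1 * b1) ^
		      ((uint64_t)a2 * b0) ^ ((uint64_t)a3 * b3);
	uint64_t c3 = ((uint64_t)a0 * b3) ^ ((uint64_t)a1 * b2) ^
		      ((uint64_t)a2 * b1) ^ ((uint64_t)a3 * b0);

	return (c0 & 0x1111111111111111ULL) ^ (c1 & 0x2222222222222222ULL) ^
	       (c2 & 0x4444444444444444ULL) ^ (c3 & 0x8888888888888888ULL);
}

/* Schoolbook carryless multiply: XOR in b shifted by each set bit of a. */
static uint64_t clmul32_ref(uint32_t a, uint32_t b)
{
	uint64_t r = 0;

	for (int i = 0; i < 32; i++)
		if ((a >> i) & 1)
			r ^= (uint64_t)b << i;
	return r;
}

int main(void)
{
	uint32_t a = 0xdeadbeef, b = 0x12345678;

	printf("holes: %016llx\nref:   %016llx\n",
	       (unsigned long long)clmul32_holes(a, b),
	       (unsigned long long)clmul32_ref(a, b));
	return clmul32_holes(a, b) != clmul32_ref(a, b);
}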
diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h
new file mode 100644
index 000000000000..85471404775a
--- /dev/null
+++ b/lib/crypto/s390/sha3.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <linux/cpufeature.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim);
+
+static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ if (static_branch_likely(&have_sha3)) {
+ /*
+ * Note that KIMD assumes little-endian order of the state
+ * words. sha3_state already uses that order, though, so
+ * there's no need for a byteswap.
+ */
+ switch (block_size) {
+ case SHA3_224_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_224, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_256_BLOCK_SIZE:
+ /*
+ * This case handles both SHA3-256 and SHAKE256, since
+ * they have the same block size.
+ */
+ cpacf_kimd(CPACF_KIMD_SHA3_256, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_384_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_384, state,
+ data, nblocks * block_size);
+ return;
+ case SHA3_512_BLOCK_SIZE:
+ cpacf_kimd(CPACF_KIMD_SHA3_512, state,
+ data, nblocks * block_size);
+ return;
+ }
+ }
+ sha3_absorb_blocks_generic(state, data, nblocks, block_size);
+}
+
+static void sha3_keccakf(struct sha3_state *state)
+{
+ if (static_branch_likely(&have_sha3)) {
+ /*
+ * Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain
+ * Keccak-f permutation, which is what we want here. Use
+ * SHA3-512 since it has the smallest block size.
+ */
+ static const u8 zeroes[SHA3_512_BLOCK_SIZE];
+
+ cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes));
+ } else {
+ sha3_keccakf_generic(state);
+ }
+}
+
+static inline bool s390_sha3(int func, const u8 *in, size_t in_len,
+ u8 *out, size_t out_len)
+{
+ struct sha3_state state;
+
+ if (!static_branch_likely(&have_sha3))
+ return false;
+
+ if (static_branch_likely(&have_sha3_init_optim))
+ func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP;
+ else
+ memset(&state, 0, sizeof(state));
+
+ cpacf_klmd(func, &state, in, in_len);
+
+ if (static_branch_likely(&have_sha3_init_optim))
+ kmsan_unpoison_memory(&state, out_len);
+
+ memcpy(out, &state, out_len);
+ memzero_explicit(&state, sizeof(state));
+ return true;
+}
+
+#define sha3_224_arch sha3_224_arch
+static bool sha3_224_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_224_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len,
+ out, SHA3_224_DIGEST_SIZE);
+}
+
+#define sha3_256_arch sha3_256_arch
+static bool sha3_256_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_256_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len,
+ out, SHA3_256_DIGEST_SIZE);
+}
+
+#define sha3_384_arch sha3_384_arch
+static bool sha3_384_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_384_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len,
+ out, SHA3_384_DIGEST_SIZE);
+}
+
+#define sha3_512_arch sha3_512_arch
+static bool sha3_512_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_512_DIGEST_SIZE])
+{
+ return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len,
+ out, SHA3_512_DIGEST_SIZE);
+}
+
+#define sha3_mod_init_arch sha3_mod_init_arch
+static void sha3_mod_init_arch(void)
+{
+ int num_present = 0;
+ int num_possible = 0;
+
+ if (!cpu_have_feature(S390_CPU_FEATURE_MSA))
+ return;
+ /*
+ * Since all the SHA-3 functions are in Message-Security-Assist
+ * Extension 6, just treat them as all or nothing. This way we need
+ * only one static_key.
+ */
+#define QUERY(opcode, func) \
+ ({ num_present += !!cpacf_query_func(opcode, func); num_possible++; })
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384);
+ QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_384);
+ QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512);
+#undef QUERY
+
+ if (num_present == num_possible) {
+ static_branch_enable(&have_sha3);
+ if (test_facility(86))
+ static_branch_enable(&have_sha3_init_optim);
+ } else if (num_present != 0) {
+ pr_warn("Unsupported combination of SHA-3 facilities\n");
+ }
+}
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 5904e4ae85d2..52788278cd17 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>
+#include "fips.h"
static const struct sha1_block_state sha1_iv = {
.h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
@@ -330,10 +331,26 @@ void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
}
EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);
-#ifdef sha1_mod_init_arch
+#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha1_mod_init(void)
{
+#ifdef sha1_mod_init_arch
sha1_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA1 satisfies the test
+ * requirement for SHA-1 too.
+ */
+ u8 mac[SHA1_DIGEST_SIZE];
+
+ hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0)
+ panic("sha1: FIPS self-test failed\n");
+ }
return 0;
}
subsys_initcall(sha1_mod_init);
diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c
index 881b935418ce..5d6b77e7e141 100644
--- a/lib/crypto/sha256.c
+++ b/lib/crypto/sha256.c
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>
+#include "fips.h"
static const struct sha256_block_state sha224_iv = {
.h = {
@@ -269,8 +270,8 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
EXPORT_SYMBOL(sha256);
/*
- * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
- * doesn't need either HMAC support or interleaved hashing support
+ * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just
+ * need the generic SHA-256 code. Omit all other features from them.
*/
#ifndef __DISABLE_EXPORTS
@@ -477,12 +478,27 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
hmac_sha256_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
-#endif /* !__DISABLE_EXPORTS */
-#ifdef sha256_mod_init_arch
+#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha256_mod_init(void)
{
+#ifdef sha256_mod_init_arch
sha256_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA256 satisfies the
+ * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too.
+ */
+ u8 mac[SHA256_DIGEST_SIZE];
+
+ hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0)
+ panic("sha256: FIPS self-test failed\n");
+ }
return 0;
}
subsys_initcall(sha256_mod_init);
@@ -493,5 +509,7 @@ static void __exit sha256_mod_exit(void)
module_exit(sha256_mod_exit);
#endif
+#endif /* !__DISABLE_EXPORTS */
+
MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
MODULE_LICENSE("GPL");
diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c
new file mode 100644
index 000000000000..32b7074de792
--- /dev/null
+++ b/lib/crypto/sha3.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-3, as specified in
+ * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ * Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ * David Howells <dhowells@redhat.com>
+ *
+ * See also Documentation/crypto/sha3.rst
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/sha3.h>
+#include <crypto/utils.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+#include "fips.h"
+
+/*
+ * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of
+ * stack if the round calculation gets inlined into the loop in
+ * sha3_keccakf_generic(). On the other hand, on 64-bit architectures with
+ * plenty of [64-bit wide] general purpose registers, not inlining it severely
+ * hurts performance. So let's use 64-bitness as a heuristic to decide whether
+ * to inline or not.
+ */
+#ifdef CONFIG_64BIT
+#define SHA3_INLINE inline
+#else
+#define SHA3_INLINE noinline
+#endif
+
+#define SHA3_KECCAK_ROUNDS 24
+
+static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = {
+ 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+ 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+ 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+ 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+ 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+ 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+ 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+ 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+/*
+ * Perform a single round of Keccak mixing.
+ */
+static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round)
+{
+ u64 t[5], tt, bc[5];
+
+ /* Theta */
+ bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
+ bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
+ bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
+ bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
+ bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
+
+ t[0] = bc[4] ^ rol64(bc[1], 1);
+ t[1] = bc[0] ^ rol64(bc[2], 1);
+ t[2] = bc[1] ^ rol64(bc[3], 1);
+ t[3] = bc[2] ^ rol64(bc[4], 1);
+ t[4] = bc[3] ^ rol64(bc[0], 1);
+
+ st[0] ^= t[0];
+
+ /* Rho Pi */
+ tt = st[1];
+ st[ 1] = rol64(st[ 6] ^ t[1], 44);
+ st[ 6] = rol64(st[ 9] ^ t[4], 20);
+ st[ 9] = rol64(st[22] ^ t[2], 61);
+ st[22] = rol64(st[14] ^ t[4], 39);
+ st[14] = rol64(st[20] ^ t[0], 18);
+ st[20] = rol64(st[ 2] ^ t[2], 62);
+ st[ 2] = rol64(st[12] ^ t[2], 43);
+ st[12] = rol64(st[13] ^ t[3], 25);
+ st[13] = rol64(st[19] ^ t[4], 8);
+ st[19] = rol64(st[23] ^ t[3], 56);
+ st[23] = rol64(st[15] ^ t[0], 41);
+ st[15] = rol64(st[ 4] ^ t[4], 27);
+ st[ 4] = rol64(st[24] ^ t[4], 14);
+ st[24] = rol64(st[21] ^ t[1], 2);
+ st[21] = rol64(st[ 8] ^ t[3], 55);
+ st[ 8] = rol64(st[16] ^ t[1], 45);
+ st[16] = rol64(st[ 5] ^ t[0], 36);
+ st[ 5] = rol64(st[ 3] ^ t[3], 28);
+ st[ 3] = rol64(st[18] ^ t[3], 21);
+ st[18] = rol64(st[17] ^ t[2], 15);
+ st[17] = rol64(st[11] ^ t[1], 10);
+ st[11] = rol64(st[ 7] ^ t[2], 6);
+ st[ 7] = rol64(st[10] ^ t[0], 3);
+ st[10] = rol64( tt ^ t[1], 1);
+
+ /* Chi */
+ bc[ 0] = ~st[ 1] & st[ 2];
+ bc[ 1] = ~st[ 2] & st[ 3];
+ bc[ 2] = ~st[ 3] & st[ 4];
+ bc[ 3] = ~st[ 4] & st[ 0];
+ bc[ 4] = ~st[ 0] & st[ 1];
+ st[ 0] ^= bc[ 0];
+ st[ 1] ^= bc[ 1];
+ st[ 2] ^= bc[ 2];
+ st[ 3] ^= bc[ 3];
+ st[ 4] ^= bc[ 4];
+
+ bc[ 0] = ~st[ 6] & st[ 7];
+ bc[ 1] = ~st[ 7] & st[ 8];
+ bc[ 2] = ~st[ 8] & st[ 9];
+ bc[ 3] = ~st[ 9] & st[ 5];
+ bc[ 4] = ~st[ 5] & st[ 6];
+ st[ 5] ^= bc[ 0];
+ st[ 6] ^= bc[ 1];
+ st[ 7] ^= bc[ 2];
+ st[ 8] ^= bc[ 3];
+ st[ 9] ^= bc[ 4];
+
+ bc[ 0] = ~st[11] & st[12];
+ bc[ 1] = ~st[12] & st[13];
+ bc[ 2] = ~st[13] & st[14];
+ bc[ 3] = ~st[14] & st[10];
+ bc[ 4] = ~st[10] & st[11];
+ st[10] ^= bc[ 0];
+ st[11] ^= bc[ 1];
+ st[12] ^= bc[ 2];
+ st[13] ^= bc[ 3];
+ st[14] ^= bc[ 4];
+
+ bc[ 0] = ~st[16] & st[17];
+ bc[ 1] = ~st[17] & st[18];
+ bc[ 2] = ~st[18] & st[19];
+ bc[ 3] = ~st[19] & st[15];
+ bc[ 4] = ~st[15] & st[16];
+ st[15] ^= bc[ 0];
+ st[16] ^= bc[ 1];
+ st[17] ^= bc[ 2];
+ st[18] ^= bc[ 3];
+ st[19] ^= bc[ 4];
+
+ bc[ 0] = ~st[21] & st[22];
+ bc[ 1] = ~st[22] & st[23];
+ bc[ 2] = ~st[23] & st[24];
+ bc[ 3] = ~st[24] & st[20];
+ bc[ 4] = ~st[20] & st[21];
+ st[20] ^= bc[ 0];
+ st[21] ^= bc[ 1];
+ st[22] ^= bc[ 2];
+ st[23] ^= bc[ 3];
+ st[24] ^= bc[ 4];
+
+ /* Iota */
+ st[0] ^= sha3_keccakf_rndc[round];
+}
+
+/* Generic implementation of the Keccak-f[1600] permutation */
+static void sha3_keccakf_generic(struct sha3_state *state)
+{
+ /*
+ * Temporarily convert the state words from little-endian to native-
+ * endian so that they can be operated on. Note that on little-endian
+ * machines this conversion is a no-op and is optimized out.
+ */
+
+ for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+ state->native_words[i] = le64_to_cpu(state->words[i]);
+
+ for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++)
+ sha3_keccakf_one_round_generic(state->native_words, round);
+
+ for (int i = 0; i < ARRAY_SIZE(state->words); i++)
+ state->words[i] = cpu_to_le64(state->native_words[i]);
+}
+
+/*
+ * Generic implementation of absorbing the given nonzero number of full blocks
+ * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size].
+ */
+static void __maybe_unused
+sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data,
+ size_t nblocks, size_t block_size)
+{
+ do {
+ for (size_t i = 0; i < block_size; i += 8)
+ state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]);
+ sha3_keccakf_generic(state);
+ data += block_size;
+ } while (--nblocks);
+}
+
+#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH
+#include "sha3.h" /* $(SRCARCH)/sha3.h */
+#else
+#define sha3_keccakf sha3_keccakf_generic
+#define sha3_absorb_blocks sha3_absorb_blocks_generic
+#endif
+
+void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len)
+{
+ const size_t block_size = ctx->block_size;
+ size_t absorb_offset = ctx->absorb_offset;
+
+ /* Warn if squeezing has already begun. */
+ WARN_ON_ONCE(absorb_offset >= block_size);
+
+ if (absorb_offset && absorb_offset + in_len >= block_size) {
+ crypto_xor(&ctx->state.bytes[absorb_offset], in,
+ block_size - absorb_offset);
+ in += block_size - absorb_offset;
+ in_len -= block_size - absorb_offset;
+ sha3_keccakf(&ctx->state);
+ absorb_offset = 0;
+ }
+
+ if (in_len >= block_size) {
+ size_t nblocks = in_len / block_size;
+
+ sha3_absorb_blocks(&ctx->state, in, nblocks, block_size);
+ in += nblocks * block_size;
+ in_len -= nblocks * block_size;
+ }
+
+ if (in_len) {
+ crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len);
+ absorb_offset += in_len;
+ }
+ ctx->absorb_offset = absorb_offset;
+}
+EXPORT_SYMBOL_GPL(__sha3_update);
+
+void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out)
+{
+ struct __sha3_ctx *ctx = &sha3_ctx->ctx;
+
+ ctx->state.bytes[ctx->absorb_offset] ^= 0x06;
+ ctx->state.bytes[ctx->block_size - 1] ^= 0x80;
+ sha3_keccakf(&ctx->state);
+ memcpy(out, ctx->state.bytes, ctx->digest_size);
+ sha3_zeroize_ctx(sha3_ctx);
+}
+EXPORT_SYMBOL_GPL(sha3_final);
+
+void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len)
+{
+ struct __sha3_ctx *ctx = &shake_ctx->ctx;
+ const size_t block_size = ctx->block_size;
+ size_t squeeze_offset = ctx->squeeze_offset;
+
+ if (ctx->absorb_offset < block_size) {
+ /* First squeeze: */
+
+ /* Add the domain separation suffix and padding. */
+ ctx->state.bytes[ctx->absorb_offset] ^= 0x1f;
+ ctx->state.bytes[block_size - 1] ^= 0x80;
+
+ /* Indicate that squeezing has begun. */
+ ctx->absorb_offset = block_size;
+
+ /*
+ * Indicate that no output is pending yet, i.e. sha3_keccakf()
+ * will need to be called before the first copy.
+ */
+ squeeze_offset = block_size;
+ }
+ while (out_len) {
+ if (squeeze_offset == block_size) {
+ sha3_keccakf(&ctx->state);
+ squeeze_offset = 0;
+ }
+ size_t copy = min(out_len, block_size - squeeze_offset);
+
+ memcpy(out, &ctx->state.bytes[squeeze_offset], copy);
+ out += copy;
+ out_len -= copy;
+ squeeze_offset += copy;
+ }
+ ctx->squeeze_offset = squeeze_offset;
+}
+EXPORT_SYMBOL_GPL(shake_squeeze);
+
+#ifndef sha3_224_arch
+static inline bool sha3_224_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_224_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_256_arch
+static inline bool sha3_256_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_256_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_384_arch
+static inline bool sha3_384_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_384_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+#ifndef sha3_512_arch
+static inline bool sha3_512_arch(const u8 *in, size_t in_len,
+ u8 out[SHA3_512_DIGEST_SIZE])
+{
+ return false;
+}
+#endif
+
+void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_224_arch(in, in_len, out))
+ return;
+ sha3_224_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_224);
+
+void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_256_arch(in, in_len, out))
+ return;
+ sha3_256_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_256);
+
+void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_384_arch(in, in_len, out))
+ return;
+ sha3_384_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_384);
+
+void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE])
+{
+ struct sha3_ctx ctx;
+
+ if (sha3_512_arch(in, in_len, out))
+ return;
+ sha3_512_init(&ctx);
+ sha3_update(&ctx, in, in_len);
+ sha3_final(&ctx, out);
+}
+EXPORT_SYMBOL_GPL(sha3_512);
+
+void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ struct shake_ctx ctx;
+
+ shake128_init(&ctx);
+ shake_update(&ctx, in, in_len);
+ shake_squeeze(&ctx, out, out_len);
+ shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake128);
+
+void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ struct shake_ctx ctx;
+
+ shake256_init(&ctx);
+ shake_update(&ctx, in, in_len);
+ shake_squeeze(&ctx, out, out_len);
+ shake_zeroize_ctx(&ctx);
+}
+EXPORT_SYMBOL_GPL(shake256);
+
+#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
+static int __init sha3_mod_init(void)
+{
+#ifdef sha3_mod_init_arch
+ sha3_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing any SHA-3 algorithm
+ * satisfies the test requirement for all of them.
+ */
+ u8 hash[SHA3_256_DIGEST_SIZE];
+
+ sha3_256(fips_test_data, sizeof(fips_test_data), hash);
+ if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0)
+ panic("sha3: FIPS self-test failed\n");
+ }
+ return 0;
+}
+subsys_initcall(sha3_mod_init);
+
+static void __exit sha3_mod_exit(void)
+{
+}
+module_exit(sha3_mod_exit);
+#endif
+
+MODULE_DESCRIPTION("SHA-3 library functions");
+MODULE_LICENSE("GPL");
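Putting the exported pieces together, here is a minimal usage sketch of the one-shot, incremental, and XOF interfaces defined above; the wrapper name and output lengths are illustrative. Note that shake_squeeze() may be called repeatedly, which is why the context tracks squeeze_offset:

/* Sketch: SHA3-256 one-shot, SHA3-256 incremental, and SHAKE128 as an XOF. */
#include <crypto/sha3.h>

static void sha3_usage_sketch(const u8 *msg, size_t msg_len)
{
	struct sha3_ctx hash_ctx;
	struct shake_ctx xof_ctx;
	u8 digest[SHA3_256_DIGEST_SIZE];
	u8 stream1[32], stream2[64];

	/* One-shot; dispatches to an arch-optimized implementation if present. */
	sha3_256(msg, msg_len, digest);

	/* Incremental SHA3-256. */
	sha3_256_init(&hash_ctx);
	sha3_update(&hash_ctx, msg, msg_len);
	sha3_final(&hash_ctx, digest);	/* also zeroizes hash_ctx */

	/* SHAKE128: squeeze as much output as needed, in pieces. */
	shake128_init(&xof_ctx);
	shake_update(&xof_ctx, msg, msg_len);
	shake_squeeze(&xof_ctx, stream1, sizeof(stream1));
	shake_squeeze(&xof_ctx, stream2, sizeof(stream2));
	shake_zeroize_ctx(&xof_ctx);
}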
diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c
index d8062188be98..605eab51aabd 100644
--- a/lib/crypto/sha512.c
+++ b/lib/crypto/sha512.c
@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>
+#include "fips.h"
static const struct sha512_block_state sha384_iv = {
.h = {
@@ -405,10 +406,26 @@ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
}
EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
-#ifdef sha512_mod_init_arch
+#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha512_mod_init(void)
{
+#ifdef sha512_mod_init_arch
sha512_mod_init_arch();
+#endif
+ if (fips_enabled) {
+ /*
+ * FIPS cryptographic algorithm self-test. As per the FIPS
+ * Implementation Guidance, testing HMAC-SHA512 satisfies the
+ * test requirement for SHA-384, SHA-512, and HMAC-SHA384 too.
+ */
+ u8 mac[SHA512_DIGEST_SIZE];
+
+ hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key),
+ fips_test_data, sizeof(fips_test_data),
+ mac);
+ if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0)
+ panic("sha512: FIPS self-test failed\n");
+ }
return 0;
}
subsys_initcall(sha512_mod_init);
diff --git a/lib/crypto/tests/Kconfig b/lib/crypto/tests/Kconfig
index 578af717e13a..61d435c450bb 100644
--- a/lib/crypto/tests/Kconfig
+++ b/lib/crypto/tests/Kconfig
@@ -1,5 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-or-later
+config CRYPTO_LIB_BLAKE2B_KUNIT_TEST
+ tristate "KUnit tests for BLAKE2b" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_BLAKE2B
+ help
+ KUnit tests for the BLAKE2b cryptographic hash function.
+
config CRYPTO_LIB_BLAKE2S_KUNIT_TEST
tristate "KUnit tests for BLAKE2s" if !KUNIT_ALL_TESTS
depends on KUNIT
@@ -38,6 +47,15 @@ config CRYPTO_LIB_POLY1305_KUNIT_TEST
help
KUnit tests for the Poly1305 library functions.
+config CRYPTO_LIB_POLYVAL_KUNIT_TEST
+ tristate "KUnit tests for POLYVAL" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_POLYVAL
+ help
+ KUnit tests for the POLYVAL library functions.
+
config CRYPTO_LIB_SHA1_KUNIT_TEST
tristate "KUnit tests for SHA-1" if !KUNIT_ALL_TESTS
depends on KUNIT
@@ -72,6 +90,17 @@ config CRYPTO_LIB_SHA512_KUNIT_TEST
KUnit tests for the SHA-384 and SHA-512 cryptographic hash functions
and their corresponding HMACs.
+config CRYPTO_LIB_SHA3_KUNIT_TEST
+ tristate "KUnit tests for SHA-3" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS || CRYPTO_SELFTESTS
+ select CRYPTO_LIB_BENCHMARK_VISIBLE
+ select CRYPTO_LIB_SHA3
+ help
+ KUnit tests for the SHA3 cryptographic hash and XOF functions,
+ including SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128 and
+ SHAKE256.
+
config CRYPTO_LIB_BENCHMARK_VISIBLE
bool
diff --git a/lib/crypto/tests/Makefile b/lib/crypto/tests/Makefile
index a71fad19922b..5109a0651925 100644
--- a/lib/crypto/tests/Makefile
+++ b/lib/crypto/tests/Makefile
@@ -1,9 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-or-later
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2B_KUNIT_TEST) += blake2b_kunit.o
obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_KUNIT_TEST) += blake2s_kunit.o
obj-$(CONFIG_CRYPTO_LIB_CURVE25519_KUNIT_TEST) += curve25519_kunit.o
obj-$(CONFIG_CRYPTO_LIB_MD5_KUNIT_TEST) += md5_kunit.o
obj-$(CONFIG_CRYPTO_LIB_POLY1305_KUNIT_TEST) += poly1305_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_POLYVAL_KUNIT_TEST) += polyval_kunit.o
obj-$(CONFIG_CRYPTO_LIB_SHA1_KUNIT_TEST) += sha1_kunit.o
obj-$(CONFIG_CRYPTO_LIB_SHA256_KUNIT_TEST) += sha224_kunit.o sha256_kunit.o
obj-$(CONFIG_CRYPTO_LIB_SHA512_KUNIT_TEST) += sha384_kunit.o sha512_kunit.o
+obj-$(CONFIG_CRYPTO_LIB_SHA3_KUNIT_TEST) += sha3_kunit.o
diff --git a/lib/crypto/tests/blake2b-testvecs.h b/lib/crypto/tests/blake2b-testvecs.h
new file mode 100644
index 000000000000..9e407dbc219c
--- /dev/null
+++ b/lib/crypto/tests/blake2b-testvecs.h
@@ -0,0 +1,342 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py blake2b */
+
+static const struct {
+ size_t data_len;
+ u8 digest[BLAKE2B_HASH_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x78, 0x6a, 0x02, 0xf7, 0x42, 0x01, 0x59, 0x03,
+ 0xc6, 0xc6, 0xfd, 0x85, 0x25, 0x52, 0xd2, 0x72,
+ 0x91, 0x2f, 0x47, 0x40, 0xe1, 0x58, 0x47, 0x61,
+ 0x8a, 0x86, 0xe2, 0x17, 0xf7, 0x1f, 0x54, 0x19,
+ 0xd2, 0x5e, 0x10, 0x31, 0xaf, 0xee, 0x58, 0x53,
+ 0x13, 0x89, 0x64, 0x44, 0x93, 0x4e, 0xb0, 0x4b,
+ 0x90, 0x3a, 0x68, 0x5b, 0x14, 0x48, 0xb7, 0x55,
+ 0xd5, 0x6f, 0x70, 0x1a, 0xfe, 0x9b, 0xe2, 0xce,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x6f, 0x2e, 0xcc, 0x83, 0x53, 0xa3, 0x20, 0x16,
+ 0x5b, 0xda, 0xd0, 0x04, 0xd3, 0xcb, 0xe4, 0x37,
+ 0x5b, 0xf0, 0x84, 0x36, 0xe1, 0xad, 0x45, 0xcc,
+ 0x4d, 0x7f, 0x09, 0x68, 0xb2, 0x62, 0x93, 0x7f,
+ 0x72, 0x32, 0xe8, 0xa7, 0x2f, 0x1f, 0x6f, 0xc6,
+ 0x14, 0xd6, 0x70, 0xae, 0x0c, 0xf0, 0xf3, 0xce,
+ 0x64, 0x4d, 0x22, 0xdf, 0xc7, 0xa7, 0xf8, 0xa8,
+ 0x18, 0x23, 0xd8, 0x6c, 0xaf, 0x65, 0xa2, 0x54,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x04, 0x13, 0xe2, 0x10, 0xbe, 0x65, 0xde, 0xce,
+ 0x61, 0xa8, 0xe0, 0xd6, 0x35, 0xb1, 0xb8, 0x88,
+ 0xd2, 0xea, 0x45, 0x3a, 0xe1, 0x8d, 0x94, 0xb5,
+ 0x66, 0x06, 0x98, 0x96, 0x39, 0xf8, 0x0e, 0xcb,
+ 0x34, 0xa6, 0xa8, 0x17, 0xfe, 0x56, 0xbc, 0xa9,
+ 0x5e, 0x1b, 0xb1, 0xde, 0x3c, 0xc7, 0x78, 0x4f,
+ 0x39, 0xc6, 0xfc, 0xa8, 0xb3, 0x27, 0x66, 0x3e,
+ 0x4e, 0xb5, 0x5d, 0x08, 0x89, 0xee, 0xd1, 0xe0,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x2b, 0x4a, 0xa3, 0x4e, 0x2b, 0x7a, 0x47, 0x20,
+ 0x30, 0x5b, 0x09, 0x17, 0x3a, 0xf4, 0xcc, 0xf0,
+ 0xf7, 0x7b, 0x97, 0x68, 0x98, 0x9f, 0x4f, 0x09,
+ 0x46, 0x25, 0xe7, 0xd6, 0x53, 0x6b, 0xf9, 0x68,
+ 0x48, 0x12, 0x44, 0x8c, 0x9a, 0xc8, 0xd4, 0x42,
+ 0xeb, 0x2c, 0x5f, 0x41, 0xba, 0x17, 0xd0, 0xc3,
+ 0xad, 0xfd, 0xfb, 0x42, 0x33, 0xcb, 0x08, 0x5d,
+ 0xd2, 0x5c, 0x3d, 0xde, 0x87, 0x4d, 0xd6, 0xe4,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xbf, 0x40, 0xf2, 0x38, 0x44, 0x8e, 0x24, 0x5e,
+ 0xbc, 0x67, 0xbb, 0xf0, 0x10, 0x9a, 0x79, 0xbb,
+ 0x36, 0x55, 0xce, 0xd2, 0xba, 0x04, 0x0d, 0xe8,
+ 0x30, 0x29, 0x5c, 0x2a, 0xa6, 0x3a, 0x4f, 0x37,
+ 0xac, 0x5f, 0xd4, 0x13, 0xa2, 0xf4, 0xfe, 0x80,
+ 0x61, 0xd7, 0x58, 0x66, 0x0c, 0x7f, 0xa2, 0x56,
+ 0x6b, 0x52, 0x7c, 0x22, 0x73, 0x7f, 0x17, 0xaa,
+ 0x91, 0x5a, 0x22, 0x06, 0xd9, 0x00, 0x48, 0x12,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x41, 0x04, 0x65, 0x93, 0x81, 0x9a, 0x20, 0x0a,
+ 0x00, 0x60, 0x00, 0x64, 0x4c, 0x04, 0x3d, 0xe0,
+ 0x6b, 0x17, 0x0c, 0xe1, 0x0e, 0x28, 0x8b, 0xa0,
+ 0x76, 0xd2, 0x79, 0xb0, 0x33, 0x60, 0x61, 0x27,
+ 0xf2, 0x64, 0xf1, 0x8a, 0xe5, 0x3e, 0xaa, 0x37,
+ 0x60, 0xad, 0x2d, 0x75, 0x13, 0xae, 0xd8, 0x9e,
+ 0xec, 0xe0, 0xe4, 0x40, 0x2f, 0x59, 0x44, 0xb0,
+ 0x66, 0x7a, 0x68, 0x38, 0xce, 0x21, 0x99, 0x2a,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x19, 0x6f, 0x9d, 0xc7, 0x87, 0x12, 0x5c, 0xa3,
+ 0xe2, 0xd3, 0xf1, 0x82, 0xec, 0xf3, 0x55, 0x9c,
+ 0x86, 0xd1, 0x6d, 0xde, 0xcf, 0x5b, 0xec, 0x4c,
+ 0x43, 0x25, 0x85, 0x90, 0xef, 0xe8, 0xe3, 0x5f,
+ 0x2c, 0x3a, 0x84, 0x07, 0xb8, 0x55, 0xfd, 0x5e,
+ 0xa4, 0x45, 0xf2, 0xac, 0xe4, 0xbd, 0xc7, 0x96,
+ 0x80, 0x59, 0x3e, 0xc9, 0xb1, 0x60, 0xb1, 0x2b,
+ 0x17, 0x49, 0x7d, 0x3e, 0x7d, 0x4d, 0x70, 0x24,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x73, 0x72, 0xd5, 0x0a, 0x97, 0xb4, 0x7d, 0xdb,
+ 0x05, 0x14, 0x8e, 0x40, 0xc2, 0x9a, 0x8a, 0x74,
+ 0x4b, 0xda, 0x7e, 0xfc, 0x97, 0x57, 0x23, 0x39,
+ 0xdc, 0x57, 0x09, 0x13, 0x24, 0xfc, 0xf3, 0x23,
+ 0x55, 0x48, 0xdd, 0xe5, 0x07, 0x9a, 0x6f, 0x7b,
+ 0x62, 0xea, 0x4d, 0x79, 0xb4, 0xb9, 0xc5, 0x86,
+ 0xc0, 0x34, 0xd6, 0xd2, 0x6c, 0xc3, 0x94, 0xfb,
+ 0x34, 0xd6, 0x62, 0xae, 0xb8, 0x99, 0xf1, 0x38,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x42, 0x3a, 0xe3, 0xa2, 0xae, 0x5a, 0x28, 0xce,
+ 0xf1, 0x3c, 0x97, 0xc2, 0x34, 0xf6, 0xb5, 0x1e,
+ 0xfc, 0x31, 0xb4, 0x04, 0x61, 0xb7, 0x54, 0x0b,
+ 0x0d, 0x1a, 0x22, 0x9c, 0x04, 0x67, 0x5c, 0x4c,
+ 0x75, 0x1b, 0x10, 0x0b, 0x99, 0xe2, 0xb1, 0x5e,
+ 0x5d, 0x4b, 0x7a, 0xe6, 0xf6, 0xb5, 0x62, 0xee,
+ 0x2d, 0x44, 0x57, 0xb2, 0x96, 0x73, 0x5e, 0xb9,
+ 0x6a, 0xb2, 0xb3, 0x16, 0xa3, 0xd9, 0x6a, 0x60,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x50, 0xb9, 0xbe, 0xb2, 0x69, 0x07, 0x45, 0x5b,
+ 0x59, 0xde, 0x8d, 0xbf, 0x08, 0xdc, 0x2e, 0x7f,
+ 0x93, 0x29, 0xc1, 0x91, 0xe8, 0x74, 0x03, 0x89,
+ 0x20, 0xfb, 0xb2, 0x4b, 0xe8, 0x68, 0x6f, 0xe1,
+ 0xb4, 0x30, 0xbe, 0x11, 0x3c, 0x43, 0x19, 0x66,
+ 0x72, 0x78, 0xb7, 0xf4, 0xe9, 0x09, 0x18, 0x4e,
+ 0xae, 0x4a, 0x24, 0xe0, 0x6f, 0x44, 0x02, 0xe3,
+ 0xfd, 0xda, 0xb3, 0x3e, 0x3c, 0x6d, 0x54, 0x2e,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0xd6, 0xf2, 0xa9, 0x61, 0x3f, 0xce, 0x2a, 0x68,
+ 0x19, 0x86, 0xff, 0xd1, 0xee, 0x89, 0x3b, 0xa4,
+ 0x10, 0x9a, 0x91, 0x50, 0x35, 0x48, 0x9e, 0xf5,
+ 0x9c, 0x95, 0xe0, 0xfb, 0x92, 0x0f, 0xa8, 0xf7,
+ 0x6c, 0x43, 0x85, 0xf1, 0x6e, 0x11, 0x4e, 0x67,
+ 0x78, 0xd7, 0x53, 0x25, 0x0c, 0xf8, 0xce, 0x38,
+ 0x74, 0x08, 0xb0, 0x3c, 0x53, 0x20, 0x4d, 0xc4,
+ 0x9a, 0xf5, 0x78, 0xe8, 0x41, 0x8f, 0xed, 0x1f,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0xe8, 0xb2, 0xc5, 0xa7, 0xf5, 0xfa, 0xee, 0xa0,
+ 0x57, 0xba, 0x58, 0xf9, 0x0a, 0xf2, 0x64, 0x16,
+ 0xa8, 0xa6, 0x03, 0x85, 0x3b, 0xb8, 0x6f, 0xca,
+ 0x76, 0xc3, 0xa1, 0x2b, 0xec, 0xef, 0xc4, 0x66,
+ 0x11, 0xdf, 0x03, 0x85, 0x9d, 0x0c, 0x37, 0x7b,
+ 0xa9, 0x7b, 0x44, 0xfb, 0x11, 0x8f, 0x3f, 0x71,
+ 0xcd, 0x81, 0x43, 0x2e, 0x71, 0x5c, 0x54, 0x9f,
+ 0xca, 0x0f, 0x01, 0x91, 0xca, 0xaa, 0x93, 0xe9,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x05, 0x8e, 0x9d, 0xdc, 0xe9, 0x36, 0x3e, 0x73,
+ 0x63, 0x59, 0x69, 0x81, 0x0b, 0x8c, 0xc7, 0x9e,
+ 0xcc, 0xe7, 0x9c, 0x19, 0x54, 0xa7, 0x2f, 0x86,
+ 0xb5, 0xea, 0xae, 0x6d, 0xfe, 0x4e, 0x6e, 0x83,
+ 0x8d, 0x1a, 0x1c, 0x70, 0x3f, 0x34, 0xa1, 0x04,
+ 0x59, 0xd1, 0xbb, 0xaa, 0x58, 0xf7, 0xce, 0xfb,
+ 0x86, 0x66, 0x22, 0xfc, 0x78, 0x74, 0x6e, 0x85,
+ 0xf1, 0x59, 0x7d, 0x9e, 0x1c, 0x3b, 0xc6, 0x65,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x6b, 0x1f, 0x7c, 0x9a, 0x65, 0x7f, 0x09, 0x61,
+ 0xe5, 0x04, 0x9a, 0xf1, 0x4b, 0x36, 0x8e, 0x41,
+ 0x86, 0xcf, 0x86, 0x19, 0xd8, 0xc9, 0x34, 0x70,
+ 0x67, 0xd1, 0x03, 0x72, 0x12, 0xf7, 0x27, 0x92,
+ 0x2e, 0x3d, 0x2b, 0x54, 0x9a, 0x48, 0xa4, 0xc2,
+ 0x61, 0xea, 0x6a, 0xe8, 0xdd, 0x07, 0x41, 0x85,
+ 0x58, 0x6d, 0xcd, 0x12, 0x0d, 0xbc, 0xb1, 0x23,
+ 0xb2, 0xdb, 0x24, 0x1f, 0xc4, 0xa7, 0xae, 0xda,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x50, 0xd8, 0xdc, 0xb2, 0x50, 0x24, 0x7a, 0x49,
+ 0xb1, 0x00, 0x73, 0x16, 0x1f, 0xce, 0xf9, 0xe8,
+ 0x77, 0x0a, 0x27, 0x74, 0xc7, 0xeb, 0xf0, 0x62,
+ 0xb9, 0xf3, 0x24, 0xa6, 0x03, 0x18, 0x40, 0xde,
+ 0x9b, 0x1d, 0xa8, 0xd0, 0xbf, 0x66, 0xa3, 0xc1,
+ 0x31, 0x04, 0x95, 0xc7, 0xc3, 0xb7, 0x11, 0xe2,
+ 0x1e, 0x31, 0x49, 0x98, 0x06, 0xab, 0xf0, 0xe6,
+ 0x5c, 0xac, 0x88, 0x28, 0x0b, 0x3d, 0xb2, 0xc2,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xd4, 0x2b, 0x6b, 0x9e, 0xfc, 0x44, 0xc0, 0x90,
+ 0x64, 0x77, 0x5d, 0xf3, 0x44, 0xb6, 0x92, 0x8f,
+ 0x80, 0xe2, 0xe4, 0x9b, 0xaf, 0x49, 0x04, 0xea,
+ 0x29, 0xf7, 0x4a, 0x33, 0x3f, 0xc7, 0x3b, 0xab,
+ 0xa1, 0x71, 0x7f, 0xa2, 0x8e, 0x03, 0xa0, 0xd6,
+ 0xa7, 0xcd, 0xe0, 0xf8, 0xd7, 0x3b, 0xa4, 0x0d,
+ 0x84, 0x79, 0x12, 0x72, 0x3f, 0x8e, 0x48, 0x35,
+ 0x76, 0x4f, 0x56, 0xe9, 0x21, 0x40, 0x19, 0xbe,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x84, 0xd4, 0xd8, 0x6c, 0x60, 0x3d, 0x6e, 0xfd,
+ 0x84, 0xb7, 0xdf, 0xba, 0x13, 0x5e, 0x07, 0x94,
+ 0x5b, 0x6b, 0x62, 0x1d, 0x82, 0x02, 0xa7, 0xb3,
+ 0x21, 0xdf, 0x42, 0x20, 0x85, 0xa8, 0x6f, 0x30,
+ 0xf7, 0x03, 0xba, 0x66, 0x0e, 0xa6, 0x42, 0x21,
+ 0x37, 0xe8, 0xed, 0x5b, 0x22, 0xf5, 0x4e, 0xa5,
+ 0xe5, 0x80, 0x1b, 0x47, 0xf0, 0x49, 0xb3, 0xe5,
+ 0x6e, 0xd9, 0xd9, 0x95, 0x3d, 0x2e, 0x42, 0x13,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x71, 0x17, 0xab, 0x93, 0xfe, 0x3b, 0xa4, 0xe6,
+ 0xcb, 0xb0, 0xea, 0x95, 0xe7, 0x1a, 0x01, 0xc0,
+ 0x12, 0x33, 0xfe, 0xcc, 0x79, 0x15, 0xae, 0x56,
+ 0xd2, 0x70, 0x44, 0x60, 0x54, 0x42, 0xa8, 0x69,
+ 0x7e, 0xc3, 0x90, 0xa0, 0x0c, 0x63, 0x39, 0xff,
+ 0x55, 0x53, 0xb8, 0x46, 0xef, 0x06, 0xcb, 0xba,
+ 0x73, 0xf4, 0x76, 0x22, 0xf1, 0x60, 0x98, 0xbc,
+ 0xbf, 0x76, 0x95, 0x85, 0x13, 0x1d, 0x11, 0x3b,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x3a, 0xaa, 0x85, 0xa0, 0x8c, 0x8e, 0xe1, 0x9c,
+ 0x9b, 0x43, 0x72, 0x7f, 0x40, 0x88, 0x3b, 0xd1,
+ 0xc4, 0xd8, 0x2b, 0x69, 0xa6, 0x74, 0x47, 0x69,
+ 0x5f, 0x7d, 0xab, 0x75, 0xa9, 0xf9, 0x88, 0x54,
+ 0xce, 0x57, 0xcc, 0x9d, 0xac, 0x13, 0x91, 0xdb,
+ 0x6d, 0x5c, 0xd8, 0xf4, 0x35, 0xc9, 0x30, 0xf0,
+ 0x4b, 0x91, 0x25, 0xab, 0x92, 0xa8, 0xc8, 0x6f,
+ 0xa0, 0xeb, 0x71, 0x56, 0x95, 0xab, 0xfd, 0xd7,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xe1, 0xe9, 0xbe, 0x6c, 0x96, 0xe2, 0xe8, 0xa6,
+ 0x53, 0xcd, 0x79, 0x77, 0x57, 0x51, 0x2f, 0xb2,
+ 0x9f, 0xfc, 0x09, 0xaa, 0x2c, 0xbc, 0x6c, 0x5f,
+ 0xb0, 0xf2, 0x12, 0x39, 0x54, 0xd7, 0x27, 0xf8,
+ 0x33, 0x5d, 0xd4, 0x8a, 0xca, 0xd8, 0x2e, 0xbb,
+ 0x02, 0x82, 0xca, 0x1b, 0x54, 0xfa, 0xd6, 0xf4,
+ 0x49, 0x63, 0xfc, 0xc8, 0x73, 0xd4, 0x26, 0x8d,
+ 0x4f, 0x1c, 0x56, 0xa7, 0xf4, 0x58, 0x6f, 0x51,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0xf2, 0xf6, 0xe1, 0x16, 0x98, 0x69, 0x74, 0x5f,
+ 0x6c, 0xc4, 0x9d, 0x34, 0xa2, 0x84, 0x5d, 0x47,
+ 0xac, 0x39, 0xe0, 0x14, 0x2d, 0x78, 0xfa, 0x27,
+ 0xd5, 0x18, 0xaf, 0x26, 0x89, 0xa4, 0x69, 0xd3,
+ 0x56, 0xde, 0xfe, 0x4b, 0x9f, 0x0c, 0x9d, 0x5a,
+ 0x9a, 0x73, 0x3e, 0x3c, 0x76, 0x4b, 0x96, 0xca,
+ 0x49, 0xda, 0x05, 0x8c, 0x53, 0xbb, 0x85, 0x89,
+ 0x60, 0xc7, 0xe0, 0xb3, 0x51, 0x18, 0xd2, 0xd2,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xfc, 0x5c, 0xcf, 0xbf, 0x29, 0xe3, 0x01, 0xef,
+ 0x4b, 0x40, 0x70, 0x01, 0xca, 0x4d, 0x46, 0xce,
+ 0xa9, 0x95, 0x5d, 0xb4, 0xf1, 0x79, 0x29, 0xdb,
+ 0xac, 0x32, 0x3d, 0xd9, 0x60, 0x9e, 0x6b, 0xb8,
+ 0x28, 0x62, 0xb7, 0x4a, 0xbb, 0x33, 0xb9, 0xd0,
+ 0x83, 0xe0, 0xd7, 0x5a, 0x2d, 0x01, 0x4c, 0x61,
+ 0x9e, 0x7d, 0x2d, 0x2d, 0x60, 0x29, 0x5e, 0x60,
+ 0x10, 0xb7, 0x41, 0x00, 0x3f, 0xe5, 0xf7, 0x52,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0xf8, 0xe5, 0x4b, 0xe5, 0x89, 0xf9, 0x1b, 0x43,
+ 0xbb, 0x65, 0x3d, 0xa0, 0xb4, 0xdc, 0x04, 0x26,
+ 0x68, 0x15, 0xae, 0x4d, 0xd6, 0x03, 0xb7, 0x27,
+ 0x06, 0x8c, 0x2a, 0x82, 0x51, 0x96, 0xbf, 0x83,
+ 0x38, 0x96, 0x21, 0x8a, 0xd9, 0xf9, 0x4e, 0x38,
+ 0xc6, 0xb3, 0xbd, 0xfe, 0xd3, 0x49, 0x90, 0xbc,
+ 0xa1, 0x77, 0xd0, 0xa0, 0x3c, 0x2b, 0x4e, 0x10,
+ 0x34, 0xc3, 0x17, 0x85, 0x3d, 0xec, 0xa8, 0x05,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x38, 0x56, 0xaf, 0x83, 0x68, 0x9c, 0xba, 0xe3,
+ 0xec, 0x51, 0xf5, 0xf4, 0x93, 0x48, 0x1d, 0xe6,
+ 0xad, 0xa8, 0x8c, 0x70, 0x2a, 0xd9, 0xaa, 0x43,
+ 0x04, 0x40, 0x95, 0xc1, 0xe6, 0x8a, 0xf5, 0x01,
+ 0x6b, 0x79, 0xd9, 0xb4, 0xd0, 0x1d, 0x93, 0x26,
+ 0xfe, 0xf5, 0x07, 0x57, 0xda, 0x08, 0x0a, 0x82,
+ 0xc9, 0x17, 0x13, 0x5b, 0x9e, 0x11, 0x96, 0xa5,
+ 0xd0, 0x92, 0xcd, 0xf1, 0xa3, 0x5b, 0x43, 0x21,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+ 0xa4, 0xf8, 0xf6, 0xa1, 0x36, 0x89, 0xc0, 0x2a,
+ 0xc3, 0x42, 0x32, 0x71, 0xe5, 0xea, 0x14, 0x77,
+ 0xf3, 0x99, 0x91, 0x87, 0x49, 0xc2, 0x8d, 0xa5,
+ 0x2f, 0xed, 0x01, 0x35, 0x39, 0x64, 0x09, 0x25,
+ 0xe3, 0xa8, 0x50, 0x97, 0x35, 0x8b, 0xf5, 0x19,
+ 0x1e, 0xd5, 0x9f, 0x03, 0x0b, 0x65, 0x55, 0x0e,
+ 0xa0, 0xb7, 0xda, 0x18, 0x7b, 0x7f, 0x88, 0x55,
+ 0x1f, 0xdb, 0x82, 0x6b, 0x98, 0x90, 0x1c, 0xdd,
+};
+
+static const u8 blake2b_keyed_testvec_consolidated[BLAKE2B_HASH_SIZE] = {
+ 0x2b, 0x89, 0x36, 0x3a, 0x36, 0xe4, 0x18, 0x38,
+ 0xc4, 0x5b, 0x5c, 0xa5, 0x9a, 0xed, 0xf2, 0xee,
+ 0x5a, 0xb6, 0x82, 0x6c, 0x63, 0xf2, 0x29, 0x57,
+ 0xc7, 0xd5, 0x32, 0x27, 0xba, 0x88, 0xb1, 0xab,
+ 0xf2, 0x2a, 0xc1, 0xea, 0xf3, 0x91, 0x89, 0x66,
+ 0x47, 0x1e, 0x5b, 0xc6, 0x98, 0x12, 0xe9, 0x25,
+ 0xbf, 0x72, 0xd2, 0x3f, 0x88, 0x97, 0x17, 0x51,
+ 0xed, 0x96, 0xfb, 0xe9, 0xca, 0x52, 0x42, 0xc9,
+};
diff --git a/lib/crypto/tests/blake2b_kunit.c b/lib/crypto/tests/blake2b_kunit.c
new file mode 100644
index 000000000000..bc0be7da1e76
--- /dev/null
+++ b/lib/crypto/tests/blake2b_kunit.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/blake2b.h>
+#include "blake2b-testvecs.h"
+
+/*
+ * The following are compatibility functions that present BLAKE2b as an unkeyed
+ * hash function that produces hashes of fixed length BLAKE2B_HASH_SIZE, so that
+ * hash-test-template.h can be reused to test it.
+ */
+
+static void blake2b_default(const u8 *data, size_t len,
+ u8 out[BLAKE2B_HASH_SIZE])
+{
+ blake2b(NULL, 0, data, len, out, BLAKE2B_HASH_SIZE);
+}
+
+static void blake2b_init_default(struct blake2b_ctx *ctx)
+{
+ blake2b_init(ctx, BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * Generate the HASH_KUNIT_CASES using hash-test-template.h. These test BLAKE2b
+ * with a key length of 0 and a hash length of BLAKE2B_HASH_SIZE.
+ */
+#define HASH blake2b_default
+#define HASH_CTX blake2b_ctx
+#define HASH_SIZE BLAKE2B_HASH_SIZE
+#define HASH_INIT blake2b_init_default
+#define HASH_UPDATE blake2b_update
+#define HASH_FINAL blake2b_final
+#include "hash-test-template.h"
+
+/*
+ * BLAKE2b specific test case which tests all possible combinations of key
+ * length and hash length.
+ */
+static void test_blake2b_all_key_and_hash_lens(struct kunit *test)
+{
+ const size_t data_len = 100;
+ u8 *data = &test_buf[0];
+ u8 *key = data + data_len;
+ u8 *hash = key + BLAKE2B_KEY_SIZE;
+ struct blake2b_ctx main_ctx;
+ u8 main_hash[BLAKE2B_HASH_SIZE];
+
+ rand_bytes_seeded_from_len(data, data_len);
+ blake2b_init(&main_ctx, BLAKE2B_HASH_SIZE);
+ for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+ rand_bytes_seeded_from_len(key, key_len);
+ for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+ blake2b(key, key_len, data, data_len, hash, out_len);
+ blake2b_update(&main_ctx, hash, out_len);
+ }
+ }
+ blake2b_final(&main_ctx, main_hash);
+ KUNIT_ASSERT_MEMEQ(test, main_hash, blake2b_keyed_testvec_consolidated,
+ BLAKE2B_HASH_SIZE);
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded buffer for all allowed
+ * key lengths. Also tests both blake2b() and blake2b_init_key().
+ */
+static void test_blake2b_with_guarded_key_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int key_len = 0; key_len <= BLAKE2B_KEY_SIZE; key_len++) {
+ u8 key[BLAKE2B_KEY_SIZE];
+ u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
+ u8 hash1[BLAKE2B_HASH_SIZE];
+ u8 hash2[BLAKE2B_HASH_SIZE];
+ struct blake2b_ctx ctx;
+
+ rand_bytes(key, key_len);
+ memcpy(guarded_key, key, key_len);
+
+ blake2b(key, key_len, test_buf, data_len,
+ hash1, BLAKE2B_HASH_SIZE);
+ blake2b(guarded_key, key_len, test_buf, data_len,
+ hash2, BLAKE2B_HASH_SIZE);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+
+ blake2b_init_key(&ctx, BLAKE2B_HASH_SIZE, guarded_key, key_len);
+ blake2b_update(&ctx, test_buf, data_len);
+ blake2b_final(&ctx, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2B_HASH_SIZE);
+ }
+}
+
+/*
+ * BLAKE2b specific test case which tests using a guarded output buffer for all
+ * allowed output lengths.
+ */
+static void test_blake2b_with_guarded_out_buf(struct kunit *test)
+{
+ const size_t data_len = 100;
+
+ rand_bytes(test_buf, data_len);
+ for (int out_len = 1; out_len <= BLAKE2B_HASH_SIZE; out_len++) {
+ u8 hash[BLAKE2B_HASH_SIZE];
+ u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
+
+ blake2b(NULL, 0, test_buf, data_len, hash, out_len);
+ blake2b(NULL, 0, test_buf, data_len, guarded_hash, out_len);
+ KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
+ }
+}
+
+static struct kunit_case blake2b_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_blake2b_all_key_and_hash_lens),
+ KUNIT_CASE(test_blake2b_with_guarded_key_buf),
+ KUNIT_CASE(test_blake2b_with_guarded_out_buf),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite blake2b_test_suite = {
+ .name = "blake2b",
+ .test_cases = blake2b_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(blake2b_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for BLAKE2b");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c
index 057c40132246..6832d9aa7b82 100644
--- a/lib/crypto/tests/blake2s_kunit.c
+++ b/lib/crypto/tests/blake2s_kunit.c
@@ -14,12 +14,12 @@
static void blake2s_default(const u8 *data, size_t len,
u8 out[BLAKE2S_HASH_SIZE])
{
- blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0);
+ blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
}
-static void blake2s_init_default(struct blake2s_state *state)
+static void blake2s_init_default(struct blake2s_ctx *ctx)
{
- blake2s_init(state, BLAKE2S_HASH_SIZE);
+ blake2s_init(ctx, BLAKE2S_HASH_SIZE);
}
/*
@@ -27,7 +27,7 @@ static void blake2s_init_default(struct blake2s_state *state)
* with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
*/
#define HASH blake2s_default
-#define HASH_CTX blake2s_state
+#define HASH_CTX blake2s_ctx
#define HASH_SIZE BLAKE2S_HASH_SIZE
#define HASH_INIT blake2s_init_default
#define HASH_UPDATE blake2s_update
@@ -44,19 +44,19 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
u8 *data = &test_buf[0];
u8 *key = data + data_len;
u8 *hash = key + BLAKE2S_KEY_SIZE;
- struct blake2s_state main_state;
+ struct blake2s_ctx main_ctx;
u8 main_hash[BLAKE2S_HASH_SIZE];
rand_bytes_seeded_from_len(data, data_len);
- blake2s_init(&main_state, BLAKE2S_HASH_SIZE);
+ blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE);
for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
rand_bytes_seeded_from_len(key, key_len);
for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
- blake2s(hash, data, key, out_len, data_len, key_len);
- blake2s_update(&main_state, hash, out_len);
+ blake2s(key, key_len, data, data_len, hash, out_len);
+ blake2s_update(&main_ctx, hash, out_len);
}
}
- blake2s_final(&main_state, main_hash);
+ blake2s_final(&main_ctx, main_hash);
KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
BLAKE2S_HASH_SIZE);
}
@@ -75,21 +75,20 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test)
u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
u8 hash1[BLAKE2S_HASH_SIZE];
u8 hash2[BLAKE2S_HASH_SIZE];
- struct blake2s_state state;
+ struct blake2s_ctx ctx;
rand_bytes(key, key_len);
memcpy(guarded_key, key, key_len);
- blake2s(hash1, test_buf, key,
- BLAKE2S_HASH_SIZE, data_len, key_len);
- blake2s(hash2, test_buf, guarded_key,
- BLAKE2S_HASH_SIZE, data_len, key_len);
+ blake2s(key, key_len, test_buf, data_len,
+ hash1, BLAKE2S_HASH_SIZE);
+ blake2s(guarded_key, key_len, test_buf, data_len,
+ hash2, BLAKE2S_HASH_SIZE);
KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
- blake2s_init_key(&state, BLAKE2S_HASH_SIZE,
- guarded_key, key_len);
- blake2s_update(&state, test_buf, data_len);
- blake2s_final(&state, hash2);
+ blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len);
+ blake2s_update(&ctx, test_buf, data_len);
+ blake2s_final(&ctx, hash2);
KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
}
}
@@ -107,8 +106,8 @@ static void test_blake2s_with_guarded_out_buf(struct kunit *test)
u8 hash[BLAKE2S_HASH_SIZE];
u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
- blake2s(hash, test_buf, NULL, out_len, data_len, 0);
- blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0);
+ blake2s(NULL, 0, test_buf, data_len, hash, out_len);
+ blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len);
KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
}
}
diff --git a/lib/crypto/tests/polyval-testvecs.h b/lib/crypto/tests/polyval-testvecs.h
new file mode 100644
index 000000000000..3d33f60d58bb
--- /dev/null
+++ b/lib/crypto/tests/polyval-testvecs.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py polyval */
+
+static const struct {
+ size_t data_len;
+ u8 digest[POLYVAL_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0xb5, 0x51, 0x69, 0x89, 0xd4, 0x3c, 0x59, 0xca,
+ 0x6a, 0x1c, 0x2a, 0xe9, 0xa1, 0x9c, 0x6c, 0x83,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0xf4, 0x50, 0xaf, 0x07, 0xda, 0x42, 0xa7, 0x41,
+ 0x4d, 0x24, 0x88, 0x87, 0xe3, 0x40, 0x73, 0x7c,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0x9e, 0x88, 0x78, 0x71, 0x4c, 0x55, 0x87, 0xe8,
+ 0xb4, 0x96, 0x3d, 0x56, 0xc8, 0xb2, 0xe1, 0x68,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0x9e, 0x81, 0x37, 0x8f, 0x49, 0xf7, 0xa2, 0xe4,
+ 0x04, 0x45, 0x12, 0x78, 0x45, 0x42, 0x27, 0xad,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0x60, 0x19, 0xd0, 0xa4, 0xf0, 0xde, 0x9e, 0xe7,
+ 0x6a, 0x89, 0x1a, 0xea, 0x80, 0x14, 0xa9, 0xa3,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0x0c, 0xa2, 0x70, 0x4d, 0x7c, 0x89, 0xac, 0x41,
+ 0xc2, 0x9e, 0x0d, 0x07, 0x07, 0x6a, 0x7f, 0xd5,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0x91, 0xd3, 0xa9, 0x5c, 0x79, 0x3d, 0x6b, 0x84,
+ 0x99, 0x54, 0xa7, 0xb4, 0x06, 0x66, 0xfd, 0x1c,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x29, 0x37, 0xb8, 0xe5, 0xd8, 0x27, 0x4d, 0xfb,
+ 0x83, 0x4f, 0x67, 0xf7, 0xf9, 0xc1, 0x0a, 0x9d,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x17, 0xa9, 0x06, 0x2c, 0xf3, 0xe8, 0x2e, 0xa6,
+ 0x6b, 0xb2, 0x1f, 0x5d, 0x94, 0x3c, 0x02, 0xa2,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x7c, 0x80, 0x74, 0xd7, 0xa1, 0x37, 0x30, 0x64,
+ 0x3b, 0xa4, 0xa3, 0x98, 0xde, 0x47, 0x10, 0x23,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x27, 0x3a, 0xcf, 0xf5, 0xaf, 0x9f, 0xd8, 0xd8,
+ 0x2d, 0x6a, 0x91, 0xfb, 0xb8, 0xfa, 0xbe, 0x0c,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x97, 0x6e, 0xc4, 0xbe, 0x6b, 0x15, 0xa6, 0x7c,
+ 0xc4, 0xa2, 0xb8, 0x0a, 0x0e, 0x9c, 0xc7, 0x3a,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x2b, 0xc3, 0x98, 0xba, 0x6e, 0x42, 0xf8, 0x18,
+ 0x85, 0x69, 0x15, 0x37, 0x10, 0x60, 0xe6, 0xac,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x88, 0x21, 0x77, 0x89, 0xd7, 0x93, 0x90, 0xfc,
+ 0xf3, 0xb0, 0xe3, 0xfb, 0x14, 0xe2, 0xcf, 0x74,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0x66, 0x3d, 0x3e, 0x08, 0xa0, 0x49, 0x81, 0x68,
+ 0x3e, 0x3b, 0xc8, 0x80, 0x55, 0xd4, 0x15, 0xe9,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x05, 0xf5, 0x06, 0x66, 0xe7, 0x11, 0x08, 0x84,
+ 0xff, 0x94, 0x50, 0x85, 0x65, 0x95, 0x2a, 0x20,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0xd3, 0xa0, 0x51, 0x69, 0xb5, 0x38, 0xae, 0x1b,
+ 0xe1, 0xa2, 0x89, 0xc6, 0x8d, 0x2b, 0x62, 0x37,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x37, 0x6d, 0x6a, 0x14, 0xdc, 0xa5, 0x37, 0xfc,
+ 0xfe, 0x67, 0x76, 0xb2, 0x64, 0x68, 0x64, 0x05,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xe3, 0x12, 0x0c, 0x58, 0x46, 0x45, 0x27, 0x7a,
+ 0x0e, 0xa2, 0xfa, 0x2c, 0x35, 0x73, 0x6c, 0x94,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x63, 0x0d, 0xa1, 0xbc, 0x6e, 0x3e, 0xd3, 0x1d,
+ 0x28, 0x52, 0xd2, 0xf4, 0x30, 0x2d, 0xff, 0xc4,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xb2, 0x91, 0x49, 0xe2, 0x02, 0x98, 0x00, 0x79,
+ 0x71, 0xb9, 0xd7, 0xd4, 0xb5, 0x94, 0x6d, 0x7d,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x58, 0x96, 0x48, 0x69, 0x05, 0x17, 0xe1, 0x6d,
+ 0xbc, 0xf2, 0x3d, 0x10, 0x96, 0x00, 0x74, 0x58,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0x99, 0x3c, 0xcb, 0x4d, 0x64, 0xc9, 0xa9, 0x41,
+ 0x52, 0x93, 0xfd, 0x65, 0xc4, 0xcc, 0xa5, 0xe5,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[POLYVAL_DIGEST_SIZE] = {
+ 0xdf, 0x68, 0x52, 0x99, 0x92, 0xc3, 0xe8, 0x88,
+ 0x29, 0x13, 0xc8, 0x35, 0x67, 0xa3, 0xd3, 0xad,
+};
+
+static const u8 polyval_allones_hashofhashes[POLYVAL_DIGEST_SIZE] = {
+ 0xd5, 0xf7, 0xfd, 0xb2, 0xa6, 0xef, 0x0b, 0x85,
+ 0x0d, 0x0a, 0x06, 0x10, 0xbc, 0x64, 0x94, 0x73,
+};
diff --git a/lib/crypto/tests/polyval_kunit.c b/lib/crypto/tests/polyval_kunit.c
new file mode 100644
index 000000000000..e59f598c1572
--- /dev/null
+++ b/lib/crypto/tests/polyval_kunit.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2025 Google LLC
+ */
+#include <crypto/polyval.h>
+#include "polyval-testvecs.h"
+
+/*
+ * A fixed key used when presenting POLYVAL as an unkeyed hash function in order
+ * to reuse hash-test-template.h. At the beginning of the test suite, this is
+ * initialized to a key prepared from bytes generated from a fixed seed.
+ */
+static struct polyval_key test_key;
+
+static void polyval_init_withtestkey(struct polyval_ctx *ctx)
+{
+ polyval_init(ctx, &test_key);
+}
+
+static void polyval_withtestkey(const u8 *data, size_t len,
+ u8 out[POLYVAL_BLOCK_SIZE])
+{
+ polyval(&test_key, data, len, out);
+}
+
+/* Generate the HASH_KUNIT_CASES using hash-test-template.h. */
+#define HASH polyval_withtestkey
+#define HASH_CTX polyval_ctx
+#define HASH_SIZE POLYVAL_BLOCK_SIZE
+#define HASH_INIT polyval_init_withtestkey
+#define HASH_UPDATE polyval_update
+#define HASH_FINAL polyval_final
+#include "hash-test-template.h"
+
+/*
+ * Test an example from RFC8452 ("AES-GCM-SIV: Nonce Misuse-Resistant
+ * Authenticated Encryption") to ensure compatibility with that.
+ */
+static void test_polyval_rfc8452_testvec(struct kunit *test)
+{
+ static const u8 raw_key[POLYVAL_BLOCK_SIZE] =
+ "\x31\x07\x28\xd9\x91\x1f\x1f\x38"
+ "\x37\xb2\x43\x16\xc3\xfa\xb9\xa0";
+ static const u8 data[48] =
+ "\x65\x78\x61\x6d\x70\x6c\x65\x00"
+ "\x00\x00\x00\x00\x00\x00\x00\x00"
+ "\x48\x65\x6c\x6c\x6f\x20\x77\x6f"
+ "\x72\x6c\x64\x00\x00\x00\x00\x00"
+ "\x38\x00\x00\x00\x00\x00\x00\x00"
+ "\x58\x00\x00\x00\x00\x00\x00\x00";
+ static const u8 expected_hash[POLYVAL_BLOCK_SIZE] =
+ "\xad\x7f\xcf\x0b\x51\x69\x85\x16"
+ "\x62\x67\x2f\x3c\x5f\x95\x13\x8f";
+ u8 hash[POLYVAL_BLOCK_SIZE];
+ struct polyval_key key;
+
+ polyval_preparekey(&key, raw_key);
+ polyval(&key, data, sizeof(data), hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, expected_hash, sizeof(hash));
+}
+
+/*
+ * Test a key and messages containing all one bits. This is useful to detect
+ * overflow bugs in implementations that emulate carryless multiplication using
+ * a series of standard multiplications with the bits spread out.
+ */
+static void test_polyval_allones_key_and_message(struct kunit *test)
+{
+ struct polyval_key key;
+ struct polyval_ctx hashofhashes_ctx;
+ u8 hash[POLYVAL_BLOCK_SIZE];
+
+ static_assert(TEST_BUF_LEN >= 4096);
+ memset(test_buf, 0xff, 4096);
+
+ polyval_preparekey(&key, test_buf);
+ polyval_init(&hashofhashes_ctx, &key);
+ for (size_t len = 0; len <= 4096; len += 16) {
+ polyval(&key, test_buf, len, hash);
+ polyval_update(&hashofhashes_ctx, hash, sizeof(hash));
+ }
+ polyval_final(&hashofhashes_ctx, hash);
+ KUNIT_ASSERT_MEMEQ(test, hash, polyval_allones_hashofhashes,
+ sizeof(hash));
+}
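+
+/*
+ * For illustration only: a minimal sketch (hypothetical helper, not used by
+ * the tests above) of how a 32x32 -> 64 bit carryless multiplication can be
+ * emulated with ordinary integer multiplications by "spreading out" the bits
+ * of each operand, so that unwanted carries land in positions that are masked
+ * off afterwards.  The all-ones key and message test targets overflow bugs in
+ * exactly this style of implementation.
+ */
+static __maybe_unused u64 clmul32_by_spread_bits(u32 x, u32 y)
+{
+	u64 x0 = x & 0x11111111, x1 = x & 0x22222222;
+	u64 x2 = x & 0x44444444, x3 = x & 0x88888888;
+	u64 y0 = y & 0x11111111, y1 = y & 0x22222222;
+	u64 y2 = y & 0x44444444, y3 = y & 0x88888888;
+	u64 z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
+	u64 z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
+	u64 z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
+	u64 z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);
+
+	/* Keep only the bit positions where each partial sum is valid. */
+	return (z0 & 0x1111111111111111ULL) | (z1 & 0x2222222222222222ULL) |
+	       (z2 & 0x4444444444444444ULL) | (z3 & 0x8888888888888888ULL);
+}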
+
+#define MAX_LEN_FOR_KEY_CHECK 1024
+
+/*
+ * Given two prepared keys which should be identical (but may differ in
+ * alignment and/or whether they are followed by a guard page or not), verify
+ * that they produce consistent results on various data lengths.
+ */
+static void check_key_consistency(struct kunit *test,
+ const struct polyval_key *key1,
+ const struct polyval_key *key2)
+{
+ u8 *data = test_buf;
+ u8 hash1[POLYVAL_BLOCK_SIZE];
+ u8 hash2[POLYVAL_BLOCK_SIZE];
+
+ rand_bytes(data, MAX_LEN_FOR_KEY_CHECK);
+ KUNIT_ASSERT_MEMEQ(test, key1, key2, sizeof(*key1));
+
+ for (int i = 0; i < 100; i++) {
+ size_t len = rand_length(MAX_LEN_FOR_KEY_CHECK);
+
+ polyval(key1, data, len, hash1);
+ polyval(key2, data, len, hash2);
+ KUNIT_ASSERT_MEMEQ(test, hash1, hash2, sizeof(hash1));
+ }
+}
+
+/* Test that no buffer overreads occur on either raw_key or polyval_key. */
+static void test_polyval_with_guarded_key(struct kunit *test)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+ u8 *guarded_raw_key = &test_buf[TEST_BUF_LEN - sizeof(raw_key)];
+ struct polyval_key key1, key2;
+ struct polyval_key *guarded_key =
+ (struct polyval_key *)&test_buf[TEST_BUF_LEN - sizeof(key1)];
+
+ /* Prepare with regular buffers. */
+ rand_bytes(raw_key, sizeof(raw_key));
+ polyval_preparekey(&key1, raw_key);
+
+ /* Prepare with guarded raw_key, then check that it works. */
+ memcpy(guarded_raw_key, raw_key, sizeof(raw_key));
+ polyval_preparekey(&key2, guarded_raw_key);
+ check_key_consistency(test, &key1, &key2);
+
+ /* Prepare guarded polyval_key, then check that it works. */
+ polyval_preparekey(guarded_key, raw_key);
+ check_key_consistency(test, &key1, guarded_key);
+}
+
+/*
+ * Test that polyval_key only needs to be aligned to
+ * __alignof__(struct polyval_key), i.e. 8 bytes. The assembly code may prefer
+ * 16-byte or higher alignment, but it mustn't require it.
+ */
+static void test_polyval_with_minimally_aligned_key(struct kunit *test)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+ struct polyval_key key;
+ struct polyval_key *minaligned_key =
+ (struct polyval_key *)&test_buf[MAX_LEN_FOR_KEY_CHECK +
+ __alignof__(struct polyval_key)];
+
+ KUNIT_ASSERT_TRUE(test, IS_ALIGNED((uintptr_t)minaligned_key,
+ __alignof__(struct polyval_key)));
+ KUNIT_ASSERT_TRUE(test,
+ !IS_ALIGNED((uintptr_t)minaligned_key,
+ 2 * __alignof__(struct polyval_key)));
+
+ rand_bytes(raw_key, sizeof(raw_key));
+ polyval_preparekey(&key, raw_key);
+ polyval_preparekey(minaligned_key, raw_key);
+ check_key_consistency(test, &key, minaligned_key);
+}
+
+struct polyval_irq_test_state {
+ struct polyval_key expected_key;
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+};
+
+static bool polyval_irq_test_func(void *state_)
+{
+ struct polyval_irq_test_state *state = state_;
+ struct polyval_key key;
+
+ polyval_preparekey(&key, state->raw_key);
+ return memcmp(&key, &state->expected_key, sizeof(key)) == 0;
+}
+
+/*
+ * Test that polyval_preparekey() produces the same output regardless of whether
+ * FPU or vector registers are usable when it is called.
+ */
+static void test_polyval_preparekey_in_irqs(struct kunit *test)
+{
+ struct polyval_irq_test_state state;
+
+ rand_bytes(state.raw_key, sizeof(state.raw_key));
+ polyval_preparekey(&state.expected_key, state.raw_key);
+ kunit_run_irq_test(test, polyval_irq_test_func, 20000, &state);
+}
+
+static int polyval_suite_init(struct kunit_suite *suite)
+{
+ u8 raw_key[POLYVAL_BLOCK_SIZE];
+
+ rand_bytes_seeded_from_len(raw_key, sizeof(raw_key));
+ polyval_preparekey(&test_key, raw_key);
+ return hash_suite_init(suite);
+}
+
+static void polyval_suite_exit(struct kunit_suite *suite)
+{
+ hash_suite_exit(suite);
+}
+
+static struct kunit_case polyval_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_polyval_rfc8452_testvec),
+ KUNIT_CASE(test_polyval_allones_key_and_message),
+ KUNIT_CASE(test_polyval_with_guarded_key),
+ KUNIT_CASE(test_polyval_with_minimally_aligned_key),
+ KUNIT_CASE(test_polyval_preparekey_in_irqs),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite polyval_test_suite = {
+ .name = "polyval",
+ .test_cases = polyval_test_cases,
+ .suite_init = polyval_suite_init,
+ .suite_exit = polyval_suite_exit,
+};
+kunit_test_suite(polyval_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for POLYVAL");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/tests/sha256_kunit.c b/lib/crypto/tests/sha256_kunit.c
index dcedfca06df6..5dccdee79693 100644
--- a/lib/crypto/tests/sha256_kunit.c
+++ b/lib/crypto/tests/sha256_kunit.c
@@ -68,6 +68,7 @@ static void test_sha256_finup_2x(struct kunit *test)
rand_bytes(data1_buf, max_data_len);
rand_bytes(data2_buf, max_data_len);
rand_bytes(salt, sizeof(salt));
+ memset(ctx, 0, sizeof(*ctx));
for (size_t i = 0; i < 500; i++) {
size_t salt_len = rand_length(sizeof(salt));
diff --git a/lib/crypto/tests/sha3-testvecs.h b/lib/crypto/tests/sha3-testvecs.h
new file mode 100644
index 000000000000..8d614a5fa0c3
--- /dev/null
+++ b/lib/crypto/tests/sha3-testvecs.h
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: ./scripts/crypto/gen-hash-testvecs.py sha3 */
+
+/* SHA3-256 test vectors */
+
+static const struct {
+ size_t data_len;
+ u8 digest[SHA3_256_DIGEST_SIZE];
+} hash_testvecs[] = {
+ {
+ .data_len = 0,
+ .digest = {
+ 0xa7, 0xff, 0xc6, 0xf8, 0xbf, 0x1e, 0xd7, 0x66,
+ 0x51, 0xc1, 0x47, 0x56, 0xa0, 0x61, 0xd6, 0x62,
+ 0xf5, 0x80, 0xff, 0x4d, 0xe4, 0x3b, 0x49, 0xfa,
+ 0x82, 0xd8, 0x0a, 0x4b, 0x80, 0xf8, 0x43, 0x4a,
+ },
+ },
+ {
+ .data_len = 1,
+ .digest = {
+ 0x11, 0x03, 0xe7, 0x84, 0x51, 0x50, 0x86, 0x35,
+ 0x71, 0x8a, 0x70, 0xe3, 0xc4, 0x26, 0x7b, 0x21,
+ 0x02, 0x13, 0xa0, 0x81, 0xe8, 0xe6, 0x14, 0x25,
+ 0x07, 0x34, 0xe5, 0xc5, 0x40, 0x06, 0xf2, 0x8b,
+ },
+ },
+ {
+ .data_len = 2,
+ .digest = {
+ 0x2f, 0x6f, 0x6d, 0x47, 0x48, 0x52, 0x11, 0xb9,
+ 0xe4, 0x3d, 0xc8, 0x71, 0xcf, 0xb2, 0xee, 0xae,
+ 0x5b, 0xf4, 0x12, 0x84, 0x5b, 0x1c, 0xec, 0x6c,
+ 0xc1, 0x66, 0x88, 0xaa, 0xc3, 0x40, 0xbd, 0x7e,
+ },
+ },
+ {
+ .data_len = 3,
+ .digest = {
+ 0xec, 0x02, 0xe8, 0x81, 0x4f, 0x84, 0x41, 0x69,
+ 0x06, 0xd8, 0xdc, 0x1d, 0x01, 0x78, 0xd7, 0xcb,
+ 0x39, 0xdf, 0xd3, 0x12, 0x1c, 0x99, 0xfd, 0xf3,
+ 0x5c, 0x83, 0xc9, 0xc2, 0x7a, 0x7b, 0x6a, 0x05,
+ },
+ },
+ {
+ .data_len = 16,
+ .digest = {
+ 0xff, 0x6f, 0xc3, 0x41, 0xc3, 0x5f, 0x34, 0x6d,
+ 0xa7, 0xdf, 0x3e, 0xc2, 0x8b, 0x29, 0xb6, 0xf1,
+ 0xf8, 0x67, 0xfd, 0xcd, 0xb1, 0x9f, 0x38, 0x08,
+ 0x1d, 0x8d, 0xd9, 0xc2, 0x43, 0x66, 0x18, 0x6c,
+ },
+ },
+ {
+ .data_len = 32,
+ .digest = {
+ 0xe4, 0xb1, 0x06, 0x17, 0xf8, 0x8b, 0x91, 0x95,
+ 0xe7, 0x57, 0x66, 0xac, 0x08, 0xb2, 0x03, 0x3e,
+ 0xf7, 0x84, 0x1f, 0xe3, 0x25, 0xa3, 0x11, 0xd2,
+ 0x11, 0xa4, 0x78, 0x74, 0x2a, 0x43, 0x20, 0xa5,
+ },
+ },
+ {
+ .data_len = 48,
+ .digest = {
+ 0xeb, 0x57, 0x5f, 0x20, 0xa3, 0x6b, 0xc7, 0xb4,
+ 0x66, 0x2a, 0xa0, 0x30, 0x3b, 0x52, 0x00, 0xc9,
+ 0xce, 0x6a, 0xd8, 0x1e, 0xbe, 0xed, 0xa1, 0xd1,
+ 0xbe, 0x63, 0xc7, 0xe1, 0xe2, 0x66, 0x67, 0x0c,
+ },
+ },
+ {
+ .data_len = 49,
+ .digest = {
+ 0xf0, 0x67, 0xad, 0x66, 0xbe, 0xec, 0x5a, 0xfd,
+ 0x29, 0xd2, 0x4f, 0x1d, 0xb2, 0x24, 0xb8, 0x90,
+ 0x05, 0x28, 0x0e, 0x66, 0x67, 0x74, 0x2d, 0xee,
+ 0x66, 0x25, 0x11, 0xd1, 0x76, 0xa2, 0xfc, 0x3a,
+ },
+ },
+ {
+ .data_len = 63,
+ .digest = {
+ 0x57, 0x56, 0x21, 0xb3, 0x2d, 0x2d, 0xe1, 0x9d,
+ 0xbf, 0x2c, 0x82, 0xa8, 0xad, 0x7e, 0x6c, 0x46,
+ 0xfb, 0x30, 0xeb, 0xce, 0xcf, 0xed, 0x2d, 0x65,
+ 0xe7, 0xe4, 0x96, 0x69, 0xe0, 0x48, 0xd2, 0xb6,
+ },
+ },
+ {
+ .data_len = 64,
+ .digest = {
+ 0x7b, 0xba, 0x67, 0x15, 0xe5, 0x21, 0xc4, 0x69,
+ 0xd3, 0xef, 0x5c, 0x97, 0x9f, 0x5b, 0xba, 0x9c,
+ 0xfa, 0x55, 0x64, 0xec, 0xb5, 0x37, 0x53, 0x1b,
+ 0x3f, 0x4c, 0x0a, 0xed, 0x51, 0x98, 0x2b, 0x52,
+ },
+ },
+ {
+ .data_len = 65,
+ .digest = {
+ 0x44, 0xb6, 0x6b, 0x83, 0x09, 0x83, 0x55, 0x83,
+ 0xde, 0x1f, 0xcc, 0x33, 0xef, 0xdc, 0x05, 0xbb,
+ 0x3b, 0x63, 0x76, 0x45, 0xe4, 0x8e, 0x14, 0x7a,
+ 0x2d, 0xae, 0x90, 0xce, 0x68, 0xc3, 0xa4, 0xf2,
+ },
+ },
+ {
+ .data_len = 127,
+ .digest = {
+ 0x50, 0x3e, 0x99, 0x4e, 0x28, 0x2b, 0xc9, 0xf4,
+ 0xf5, 0xeb, 0x2b, 0x16, 0x04, 0x2d, 0xf5, 0xbe,
+ 0xc0, 0x91, 0x41, 0x2a, 0x8e, 0x69, 0x5e, 0x39,
+ 0x53, 0x2c, 0xc1, 0x18, 0xa5, 0xeb, 0xd8, 0xda,
+ },
+ },
+ {
+ .data_len = 128,
+ .digest = {
+ 0x90, 0x0b, 0xa6, 0x92, 0x84, 0x30, 0xaf, 0xee,
+ 0x38, 0x59, 0x83, 0x83, 0xe9, 0xfe, 0xab, 0x86,
+ 0x79, 0x1b, 0xcd, 0xe7, 0x0a, 0x0f, 0x58, 0x53,
+ 0x36, 0xab, 0x12, 0xe1, 0x5c, 0x97, 0xc1, 0xfb,
+ },
+ },
+ {
+ .data_len = 129,
+ .digest = {
+ 0x2b, 0x52, 0x1e, 0x54, 0xbe, 0x38, 0x4c, 0x3e,
+ 0x73, 0x37, 0x18, 0xf5, 0x25, 0x2c, 0xc8, 0xc7,
+ 0xda, 0x7e, 0xb6, 0x47, 0x9d, 0xf4, 0x46, 0xce,
+ 0xfa, 0x80, 0x20, 0x6b, 0xbd, 0xfd, 0x2a, 0xd8,
+ },
+ },
+ {
+ .data_len = 256,
+ .digest = {
+ 0x45, 0xf0, 0xf5, 0x9b, 0xd9, 0x91, 0x26, 0xd5,
+ 0x91, 0x3b, 0xf8, 0x87, 0x8b, 0x34, 0x02, 0x31,
+ 0x64, 0xab, 0xf4, 0x1c, 0x6e, 0x34, 0x72, 0xdf,
+ 0x32, 0x6d, 0xe5, 0xd2, 0x67, 0x5e, 0x86, 0x93,
+ },
+ },
+ {
+ .data_len = 511,
+ .digest = {
+ 0xb3, 0xaf, 0x71, 0x64, 0xfa, 0xd4, 0xf1, 0x07,
+ 0x38, 0xef, 0x04, 0x8e, 0x89, 0xf4, 0x02, 0xd2,
+ 0xa5, 0xaf, 0x3b, 0xf5, 0x67, 0x56, 0xcf, 0xa9,
+ 0x8e, 0x43, 0xf5, 0xb5, 0xe3, 0x91, 0x8e, 0xe7,
+ },
+ },
+ {
+ .data_len = 513,
+ .digest = {
+ 0x51, 0xac, 0x0a, 0x65, 0xb7, 0x96, 0x20, 0xcf,
+ 0x88, 0xf6, 0x97, 0x35, 0x89, 0x0d, 0x31, 0x0f,
+ 0xbe, 0x17, 0xbe, 0x62, 0x03, 0x67, 0xc0, 0xee,
+ 0x4f, 0xc1, 0xe3, 0x7f, 0x6f, 0xab, 0xac, 0xb4,
+ },
+ },
+ {
+ .data_len = 1000,
+ .digest = {
+ 0x7e, 0xea, 0xa8, 0xd7, 0xde, 0x20, 0x1b, 0x58,
+ 0x24, 0xd8, 0x26, 0x40, 0x36, 0x5f, 0x3f, 0xaa,
+ 0xe5, 0x5a, 0xea, 0x98, 0x58, 0xd4, 0xd6, 0xfc,
+ 0x20, 0x4c, 0x5c, 0x4f, 0xaf, 0x56, 0xc7, 0xc3,
+ },
+ },
+ {
+ .data_len = 3333,
+ .digest = {
+ 0x61, 0xb1, 0xb1, 0x3e, 0x0e, 0x7e, 0x90, 0x3d,
+ 0x31, 0x54, 0xbd, 0xc9, 0x0d, 0x53, 0x62, 0xf1,
+ 0xcd, 0x18, 0x80, 0xf9, 0x91, 0x75, 0x41, 0xb3,
+ 0x51, 0x39, 0x57, 0xa7, 0xa8, 0x1e, 0xfb, 0xc9,
+ },
+ },
+ {
+ .data_len = 4096,
+ .digest = {
+ 0xab, 0x29, 0xda, 0x10, 0xc4, 0x11, 0x2d, 0x5c,
+ 0xd1, 0xce, 0x1c, 0x95, 0xfa, 0xc6, 0xc7, 0xb0,
+ 0x1b, 0xd1, 0xdc, 0x6f, 0xa0, 0x9d, 0x1b, 0x23,
+ 0xfb, 0x6e, 0x90, 0x97, 0xd0, 0x75, 0x44, 0x7a,
+ },
+ },
+ {
+ .data_len = 4128,
+ .digest = {
+ 0x02, 0x45, 0x95, 0xf4, 0x19, 0xb5, 0x93, 0x29,
+ 0x90, 0xf2, 0x63, 0x3f, 0x89, 0xe8, 0xa5, 0x31,
+ 0x76, 0xf2, 0x89, 0x79, 0x66, 0xd3, 0x96, 0xdf,
+ 0x33, 0xd1, 0xa6, 0x17, 0x73, 0xb1, 0xd0, 0x45,
+ },
+ },
+ {
+ .data_len = 4160,
+ .digest = {
+ 0xd1, 0x8e, 0x22, 0xea, 0x44, 0x87, 0x6e, 0x9d,
+ 0xfb, 0x36, 0x02, 0x20, 0x63, 0xb7, 0x69, 0x45,
+ 0x25, 0x41, 0x69, 0xe0, 0x9b, 0x87, 0xcf, 0xa3,
+ 0x51, 0xbb, 0xfc, 0x8d, 0xf7, 0x29, 0xa7, 0xea,
+ },
+ },
+ {
+ .data_len = 4224,
+ .digest = {
+ 0x11, 0x86, 0x7d, 0x84, 0xf9, 0x8c, 0x6e, 0xc4,
+ 0x64, 0x36, 0xc6, 0xf3, 0x42, 0x92, 0x31, 0x2b,
+ 0x1e, 0x12, 0xe6, 0x4d, 0xbe, 0xfa, 0x77, 0x3f,
+ 0x89, 0x41, 0x33, 0x58, 0x1c, 0x98, 0x16, 0x0a,
+ },
+ },
+ {
+ .data_len = 16384,
+ .digest = {
+ 0xb2, 0xba, 0x0c, 0x8c, 0x9d, 0xbb, 0x1e, 0xb0,
+ 0x03, 0xb5, 0xdf, 0x4f, 0xf5, 0x35, 0xdb, 0xec,
+ 0x60, 0xf2, 0x5b, 0xb6, 0xd0, 0x49, 0xd3, 0xed,
+ 0x55, 0xc0, 0x7a, 0xd7, 0xaf, 0xa1, 0xea, 0x53,
+ },
+ },
+};
+
+static const u8 hash_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x3b, 0x33, 0x67, 0xf8, 0xea, 0x92, 0x78, 0x62,
+ 0xdd, 0xbe, 0x72, 0x15, 0xbd, 0x6f, 0xfa, 0xe5,
+ 0x5e, 0xab, 0x9f, 0xb1, 0xe4, 0x23, 0x7c, 0x2c,
+ 0x80, 0xcf, 0x09, 0x75, 0xf8, 0xe2, 0xfa, 0x30,
+};
+
+/* SHAKE test vectors */
+
+static const u8 shake128_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x89, 0x88, 0x3a, 0x44, 0xec, 0xfe, 0x3c, 0xeb,
+ 0x2f, 0x1c, 0x1d, 0xda, 0x9e, 0x36, 0x64, 0xf0,
+ 0x85, 0x4c, 0x49, 0x12, 0x76, 0x5a, 0x4d, 0xe7,
+ 0xa8, 0xfd, 0xcd, 0xbe, 0x45, 0xb4, 0x6f, 0xb0,
+};
+
+static const u8 shake256_testvec_consolidated[SHA3_256_DIGEST_SIZE] = {
+ 0x5a, 0xfd, 0x66, 0x62, 0x5c, 0x37, 0x2b, 0x41,
+ 0x77, 0x1c, 0x01, 0x5d, 0x64, 0x7c, 0x63, 0x7a,
+ 0x7c, 0x76, 0x9e, 0xa8, 0xd1, 0xb0, 0x8e, 0x02,
+ 0x16, 0x9b, 0xfe, 0x0e, 0xb5, 0xd8, 0x6a, 0xb5,
+};
diff --git a/lib/crypto/tests/sha3_kunit.c b/lib/crypto/tests/sha3_kunit.c
new file mode 100644
index 000000000000..ed5fbe80337f
--- /dev/null
+++ b/lib/crypto/tests/sha3_kunit.c
@@ -0,0 +1,422 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+#include <crypto/sha3.h>
+#include "sha3-testvecs.h"
+
+#define HASH sha3_256
+#define HASH_CTX sha3_ctx
+#define HASH_SIZE SHA3_256_DIGEST_SIZE
+#define HASH_INIT sha3_256_init
+#define HASH_UPDATE sha3_update
+#define HASH_FINAL sha3_final
+#include "hash-test-template.h"
+
+/*
+ * Sample message and the output generated for various algorithms by passing it
+ * into "openssl sha3-224" etc..
+ */
+static const u8 test_sha3_sample[] =
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n"
+ "The quick red fox jumped over the lazy brown dog!\n";
+
+static const u8 test_sha3_224[8 + SHA3_224_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xd6, 0xe8, 0xd8, 0x80, 0xfa, 0x42, 0x80, 0x70,
+ 0x7e, 0x7f, 0xd7, 0xd2, 0xd7, 0x7a, 0x35, 0x65,
+ 0xf0, 0x0b, 0x4f, 0x9f, 0x2a, 0x33, 0xca, 0x0a,
+ 0xef, 0xa6, 0x4c, 0xb8,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_256[8 + SHA3_256_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xdb, 0x3b, 0xb0, 0xb8, 0x8d, 0x15, 0x78, 0xe5,
+ 0x78, 0x76, 0x8e, 0x39, 0x7e, 0x89, 0x86, 0xb9,
+ 0x14, 0x3a, 0x1e, 0xe7, 0x96, 0x7c, 0xf3, 0x25,
+ 0x70, 0xbd, 0xc3, 0xa9, 0xae, 0x63, 0x71, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_384[8 + SHA3_384_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0x2d, 0x4b, 0x29, 0x85, 0x19, 0x94, 0xaa, 0x31,
+ 0x9b, 0x04, 0x9d, 0x6e, 0x79, 0x66, 0xc7, 0x56,
+ 0x8a, 0x2e, 0x99, 0x84, 0x06, 0xcf, 0x10, 0x2d,
+ 0xec, 0xf0, 0x03, 0x04, 0x1f, 0xd5, 0x99, 0x63,
+ 0x2f, 0xc3, 0x2b, 0x0d, 0xd9, 0x45, 0xf7, 0xbb,
+ 0x0a, 0xc3, 0x46, 0xab, 0xfe, 0x4d, 0x94, 0xc2,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_sha3_512[8 + SHA3_512_DIGEST_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xdd, 0x71, 0x3b, 0x44, 0xb6, 0x6c, 0xd7, 0x78,
+ 0xe7, 0x93, 0xa1, 0x4c, 0xd7, 0x24, 0x16, 0xf1,
+ 0xfd, 0xa2, 0x82, 0x4e, 0xed, 0x59, 0xe9, 0x83,
+ 0x15, 0x38, 0x89, 0x7d, 0x39, 0x17, 0x0c, 0xb2,
+ 0xcf, 0x12, 0x80, 0x78, 0xa1, 0x78, 0x41, 0xeb,
+ 0xed, 0x21, 0x4c, 0xa4, 0x4a, 0x5f, 0x30, 0x1a,
+ 0x70, 0x98, 0x4f, 0x14, 0xa2, 0xd1, 0x64, 0x1b,
+ 0xc2, 0x0a, 0xff, 0x3b, 0xe8, 0x26, 0x41, 0x8f,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake128[8 + SHAKE128_DEFAULT_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0x41, 0xd6, 0xb8, 0x9c, 0xf8, 0xe8, 0x54, 0xf2,
+ 0x5c, 0xde, 0x51, 0x12, 0xaf, 0x9e, 0x0d, 0x91,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static const u8 test_shake256[8 + SHAKE256_DEFAULT_SIZE + 8] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-before guard */
+ 0xab, 0x06, 0xd4, 0xf9, 0x8b, 0xfd, 0xb2, 0xc4,
+ 0xfe, 0xf1, 0xcc, 0xe2, 0x40, 0x45, 0xdd, 0x15,
+ 0xcb, 0xdd, 0x02, 0x8d, 0xb7, 0x9f, 0x1e, 0x67,
+ 0xd6, 0x7f, 0x98, 0x5e, 0x1b, 0x19, 0xf8, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* Write-after guard */
+};
+
+static void test_sha3_224_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_224_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_224));
+
+ memset(out, 0, sizeof(out));
+ sha3_224(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_224, sizeof(test_sha3_224),
+ "SHA3-224 gives wrong output");
+}
+
+static void test_sha3_256_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_256_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_256));
+
+ memset(out, 0, sizeof(out));
+ sha3_256(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_256, sizeof(test_sha3_256),
+ "SHA3-256 gives wrong output");
+}
+
+static void test_sha3_384_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_384_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_384));
+
+ memset(out, 0, sizeof(out));
+ sha3_384(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_384, sizeof(test_sha3_384),
+ "SHA3-384 gives wrong output");
+}
+
+static void test_sha3_512_basic(struct kunit *test)
+{
+ u8 out[8 + SHA3_512_DIGEST_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_sha3_512));
+
+ memset(out, 0, sizeof(out));
+ sha3_512(test_sha3_sample, sizeof(test_sha3_sample) - 1, out + 8);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_sha3_512, sizeof(test_sha3_512),
+ "SHA3-512 gives wrong output");
+}
+
+static void test_shake128_basic(struct kunit *test)
+{
+ u8 out[8 + SHAKE128_DEFAULT_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_shake128));
+
+ memset(out, 0, sizeof(out));
+ shake128(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+ out + 8, sizeof(out) - 16);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128, sizeof(test_shake128),
+ "SHAKE128 gives wrong output");
+}
+
+static void test_shake256_basic(struct kunit *test)
+{
+ u8 out[8 + SHAKE256_DEFAULT_SIZE + 8];
+
+ BUILD_BUG_ON(sizeof(out) != sizeof(test_shake256));
+
+ memset(out, 0, sizeof(out));
+ shake256(test_sha3_sample, sizeof(test_sha3_sample) - 1,
+ out + 8, sizeof(out) - 16);
+
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256, sizeof(test_shake256),
+ "SHAKE256 gives wrong output");
+}
+
+/*
+ * Usable NIST tests.
+ *
+ * From: https://csrc.nist.gov/projects/cryptographic-standards-and-guidelines/example-values
+ */
+static const u8 test_nist_1600_sample[] = {
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3,
+ 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3, 0xa3
+};
+
+static const u8 test_shake128_nist_0[] = {
+ 0x7f, 0x9c, 0x2b, 0xa4, 0xe8, 0x8f, 0x82, 0x7d,
+ 0x61, 0x60, 0x45, 0x50, 0x76, 0x05, 0x85, 0x3e
+};
+
+static const u8 test_shake128_nist_1600[] = {
+ 0x13, 0x1a, 0xb8, 0xd2, 0xb5, 0x94, 0x94, 0x6b,
+ 0x9c, 0x81, 0x33, 0x3f, 0x9b, 0xb6, 0xe0, 0xce,
+};
+
+static const u8 test_shake256_nist_0[] = {
+ 0x46, 0xb9, 0xdd, 0x2b, 0x0b, 0xa8, 0x8d, 0x13,
+ 0x23, 0x3b, 0x3f, 0xeb, 0x74, 0x3e, 0xeb, 0x24,
+ 0x3f, 0xcd, 0x52, 0xea, 0x62, 0xb8, 0x1b, 0x82,
+ 0xb5, 0x0c, 0x27, 0x64, 0x6e, 0xd5, 0x76, 0x2f
+};
+
+static const u8 test_shake256_nist_1600[] = {
+ 0xcd, 0x8a, 0x92, 0x0e, 0xd1, 0x41, 0xaa, 0x04,
+ 0x07, 0xa2, 0x2d, 0x59, 0x28, 0x86, 0x52, 0xe9,
+ 0xd9, 0xf1, 0xa7, 0xee, 0x0c, 0x1e, 0x7c, 0x1c,
+ 0xa6, 0x99, 0x42, 0x4d, 0xa8, 0x4a, 0x90, 0x4d,
+};
+
+static void test_shake128_nist(struct kunit *test)
+{
+ u8 out[SHAKE128_DEFAULT_SIZE];
+
+ shake128("", 0, out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_0, sizeof(out),
+ "SHAKE128 gives wrong output for NIST.0");
+
+ shake128(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+ out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake128_nist_1600, sizeof(out),
+ "SHAKE128 gives wrong output for NIST.1600");
+}
+
+static void test_shake256_nist(struct kunit *test)
+{
+ u8 out[SHAKE256_DEFAULT_SIZE];
+
+ shake256("", 0, out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_0, sizeof(out),
+ "SHAKE256 gives wrong output for NIST.0");
+
+ shake256(test_nist_1600_sample, sizeof(test_nist_1600_sample),
+ out, sizeof(out));
+ KUNIT_ASSERT_MEMEQ_MSG(test, out, test_shake256_nist_1600, sizeof(out),
+ "SHAKE256 gives wrong output for NIST.1600");
+}
+
+static void shake(int alg, const u8 *in, size_t in_len, u8 *out, size_t out_len)
+{
+ if (alg == 0)
+ shake128(in, in_len, out, out_len);
+ else
+ shake256(in, in_len, out, out_len);
+}
+
+static void shake_init(struct shake_ctx *ctx, int alg)
+{
+ if (alg == 0)
+ shake128_init(ctx);
+ else
+ shake256_init(ctx);
+}
+
+/*
+ * Test each of SHAKE128 and SHAKE256 with all input lengths 0 through 4096, for
+ * both input and output. The input and output lengths cycle through the values
+ * together, giving 4097 length pairs per algorithm. To verify all the SHAKE
+ * outputs, compute and verify the SHA3-256 digest of all of them concatenated
+ * together.
+ */
+static void test_shake_all_lens_up_to_4096(struct kunit *test)
+{
+ struct sha3_ctx main_ctx;
+ const size_t max_len = 4096;
+ u8 *const in = test_buf;
+ u8 *const out = &test_buf[TEST_BUF_LEN - max_len];
+ u8 main_hash[SHA3_256_DIGEST_SIZE];
+
+ KUNIT_ASSERT_LE(test, 2 * max_len, TEST_BUF_LEN);
+
+ rand_bytes_seeded_from_len(in, max_len);
+ for (int alg = 0; alg < 2; alg++) {
+ sha3_256_init(&main_ctx);
+ for (size_t in_len = 0; in_len <= max_len; in_len++) {
+ size_t out_len = (in_len * 293) % (max_len + 1);
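+			/*
+			 * 293 is coprime to 4097, so out_len also takes every
+			 * value in 0..4096 exactly once across the loop.
+			 */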
+
+ shake(alg, in, in_len, out, out_len);
+ sha3_update(&main_ctx, out, out_len);
+ }
+ sha3_final(&main_ctx, main_hash);
+ if (alg == 0)
+ KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+ shake128_testvec_consolidated,
+ sizeof(main_hash),
+ "shake128() gives wrong output");
+ else
+ KUNIT_ASSERT_MEMEQ_MSG(test, main_hash,
+ shake256_testvec_consolidated,
+ sizeof(main_hash),
+ "shake256() gives wrong output");
+ }
+}
+
+/*
+ * Test that a sequence of SHAKE squeezes gives the same output as a single
+ * squeeze of the same total length.
+ */
+static void test_shake_multiple_squeezes(struct kunit *test)
+{
+ const size_t max_len = 512;
+ u8 *ref_out;
+
+ KUNIT_ASSERT_GE(test, TEST_BUF_LEN, 2 * max_len);
+
+ ref_out = kunit_kzalloc(test, max_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, ref_out);
+
+ for (int i = 0; i < 2000; i++) {
+ const int alg = rand32() % 2;
+ const size_t in_len = rand_length(max_len);
+ const size_t out_len = rand_length(max_len);
+ const size_t in_offs = rand_offset(max_len - in_len);
+ const size_t out_offs = rand_offset(max_len - out_len);
+ u8 *const in = &test_buf[in_offs];
+ u8 *const out = &test_buf[out_offs];
+ struct shake_ctx ctx;
+ size_t remaining_len, j, num_parts;
+
+ rand_bytes(in, in_len);
+ rand_bytes(out, out_len);
+
+ /* Compute the output using the one-shot function. */
+ shake(alg, in, in_len, ref_out, out_len);
+
+ /* Compute the output using a random sequence of squeezes. */
+ shake_init(&ctx, alg);
+ shake_update(&ctx, in, in_len);
+ remaining_len = out_len;
+ j = 0;
+ num_parts = 0;
+ while (rand_bool()) {
+ size_t part_len = rand_length(remaining_len);
+
+ shake_squeeze(&ctx, &out[j], part_len);
+ num_parts++;
+ j += part_len;
+ remaining_len -= part_len;
+ }
+ if (remaining_len != 0 || rand_bool()) {
+ shake_squeeze(&ctx, &out[j], remaining_len);
+ num_parts++;
+ }
+
+ /* Verify that the outputs are the same. */
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, out, ref_out, out_len,
+ "Multi-squeeze test failed with in_len=%zu in_offs=%zu out_len=%zu out_offs=%zu num_parts=%zu alg=%d",
+ in_len, in_offs, out_len, out_offs, num_parts, alg);
+ }
+}
+
+/*
+ * Test that SHAKE operations on buffers immediately followed by an unmapped
+ * page work as expected. This catches out-of-bounds memory accesses even if
+ * they occur in assembly code.
+ */
+static void test_shake_with_guarded_bufs(struct kunit *test)
+{
+ const size_t max_len = 512;
+ u8 *reg_buf;
+
+ KUNIT_ASSERT_GE(test, TEST_BUF_LEN, max_len);
+
+ reg_buf = kunit_kzalloc(test, max_len, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, reg_buf);
+
+ for (int alg = 0; alg < 2; alg++) {
+ for (size_t len = 0; len <= max_len; len++) {
+ u8 *guarded_buf = &test_buf[TEST_BUF_LEN - len];
+
+ rand_bytes(reg_buf, len);
+ memcpy(guarded_buf, reg_buf, len);
+
+ shake(alg, reg_buf, len, reg_buf, len);
+ shake(alg, guarded_buf, len, guarded_buf, len);
+
+ KUNIT_ASSERT_MEMEQ_MSG(
+ test, reg_buf, guarded_buf, len,
+ "Guard page test failed with len=%zu alg=%d",
+ len, alg);
+ }
+ }
+}
+
+static struct kunit_case sha3_test_cases[] = {
+ HASH_KUNIT_CASES,
+ KUNIT_CASE(test_sha3_224_basic),
+ KUNIT_CASE(test_sha3_256_basic),
+ KUNIT_CASE(test_sha3_384_basic),
+ KUNIT_CASE(test_sha3_512_basic),
+ KUNIT_CASE(test_shake128_basic),
+ KUNIT_CASE(test_shake256_basic),
+ KUNIT_CASE(test_shake128_nist),
+ KUNIT_CASE(test_shake256_nist),
+ KUNIT_CASE(test_shake_all_lens_up_to_4096),
+ KUNIT_CASE(test_shake_multiple_squeezes),
+ KUNIT_CASE(test_shake_with_guarded_bufs),
+ KUNIT_CASE(benchmark_hash),
+ {},
+};
+
+static struct kunit_suite sha3_test_suite = {
+ .name = "sha3",
+ .test_cases = sha3_test_cases,
+ .suite_init = hash_suite_init,
+ .suite_exit = hash_suite_exit,
+};
+kunit_test_suite(sha3_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests and benchmark for SHA3");
+MODULE_LICENSE("GPL");
diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index ef8e9f427aab..7b1d98ca7482 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -6,19 +6,25 @@
#include <linux/linkage.h>
-.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
+.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
-IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+.Liv:
+ .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
-.section .rodata.cst16.ROT16, "aM", @progbits, 16
+
+.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
-ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
-.section .rodata.cst16.ROR328, "aM", @progbits, 16
+.Lror16:
+ .octa 0x0D0C0F0E09080B0A0504070601000302
+
+.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
-ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
-.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
+.Lror8:
+ .octa 0x0C0F0E0D080B0A090407060500030201
+
+.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
-SIGMA:
+.Lsigma:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
@@ -29,9 +35,10 @@ SIGMA:
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
-.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160
+
+.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
-SIGMA2:
+.Lsigma2:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
@@ -43,36 +50,52 @@ SIGMA2:
.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+#define CTX %rdi
+#define DATA %rsi
+#define NBLOCKS %rdx
+#define INC %ecx
+
.text
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
SYM_FUNC_START(blake2s_compress_ssse3)
- testq %rdx,%rdx
- je .Lendofloop
- movdqu (%rdi),%xmm0
- movdqu 0x10(%rdi),%xmm1
- movdqa ROT16(%rip),%xmm12
- movdqa ROR328(%rip),%xmm13
- movdqu 0x20(%rdi),%xmm14
- movq %rcx,%xmm15
- leaq SIGMA+0xa0(%rip),%r8
- jmp .Lbeginofloop
+ movdqu (CTX),%xmm0 // Load h[0..3]
+ movdqu 16(CTX),%xmm1 // Load h[4..7]
+ movdqa .Lror16(%rip),%xmm12
+ movdqa .Lror8(%rip),%xmm13
+ movdqu 32(CTX),%xmm14 // Load t and f
+ movd INC,%xmm15 // Load inc
+ leaq .Lsigma+160(%rip),%r8
+ jmp .Lssse3_mainloop
+
.align 32
-.Lbeginofloop:
- movdqa %xmm0,%xmm10
- movdqa %xmm1,%xmm11
- paddq %xmm15,%xmm14
- movdqa IV(%rip),%xmm2
+.Lssse3_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ paddq %xmm15,%xmm14 // t += inc (64-bit addition)
+ movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3]
movdqa %xmm14,%xmm3
- pxor IV+0x10(%rip),%xmm3
- leaq SIGMA(%rip),%rcx
-.Lroundloop:
+ pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ leaq .Lsigma(%rip),%rcx
+
+.Lssse3_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
movzbl (%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0x1(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x2(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x3(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
+ movd (DATA,%rax,4),%xmm4
+ movzbl 1(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 2(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 3(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
@@ -83,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
por %xmm8,%xmm1
- movzbl 0x4(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0x5(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x6(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0x7(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
+ movzbl 4(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 5(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 6(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 7(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
@@ -104,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
por %xmm8,%xmm1
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
- movzbl 0x8(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
- movzbl 0x9(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xa(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xb(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
+ movzbl 8(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
+ movzbl 9(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 10(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 11(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
@@ -128,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0xc,%xmm1
- pslld $0x14,%xmm8
+ psrld $12,%xmm1
+ pslld $20,%xmm8
por %xmm8,%xmm1
- movzbl 0xc(%rcx),%eax
- movd (%rsi,%rax,4),%xmm7
- movzbl 0xd(%rcx),%eax
- movd (%rsi,%rax,4),%xmm4
- movzbl 0xe(%rcx),%eax
- movd (%rsi,%rax,4),%xmm5
- movzbl 0xf(%rcx),%eax
- movd (%rsi,%rax,4),%xmm6
+ movzbl 12(%rcx),%eax
+ movd (DATA,%rax,4),%xmm7
+ movzbl 13(%rcx),%eax
+ movd (DATA,%rax,4),%xmm4
+ movzbl 14(%rcx),%eax
+ movd (DATA,%rax,4),%xmm5
+ movzbl 15(%rcx),%eax
+ movd (DATA,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
@@ -149,53 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
- psrld $0x7,%xmm1
- pslld $0x19,%xmm8
+ psrld $7,%xmm1
+ pslld $25,%xmm8
por %xmm8,%xmm1
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
- addq $0x10,%rcx
+ addq $16,%rcx
cmpq %r8,%rcx
- jnz .Lroundloop
+ jnz .Lssse3_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
- addq $0x40,%rsi
- decq %rdx
- jnz .Lbeginofloop
- movdqu %xmm0,(%rdi)
- movdqu %xmm1,0x10(%rdi)
- movdqu %xmm14,0x20(%rdi)
-.Lendofloop:
+ addq $64,DATA
+ decq NBLOCKS
+ jnz .Lssse3_mainloop
+
+ movdqu %xmm0,(CTX) // Store new h[0..3]
+ movdqu %xmm1,16(CTX) // Store new h[4..7]
+ movq %xmm14,32(CTX) // Store new t (f is unchanged)
RET
SYM_FUNC_END(blake2s_compress_ssse3)
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+// u32 h[8]; (inout)
+// u32 t[2]; (inout)
+// u32 f[2]; (in)
+//
SYM_FUNC_START(blake2s_compress_avx512)
- vmovdqu (%rdi),%xmm0
- vmovdqu 0x10(%rdi),%xmm1
- vmovdqu 0x20(%rdi),%xmm4
- vmovq %rcx,%xmm5
- vmovdqa IV(%rip),%xmm14
- vmovdqa IV+16(%rip),%xmm15
- jmp .Lblake2s_compress_avx512_mainloop
-.align 32
-.Lblake2s_compress_avx512_mainloop:
- vmovdqa %xmm0,%xmm10
- vmovdqa %xmm1,%xmm11
- vpaddq %xmm5,%xmm4,%xmm4
- vmovdqa %xmm14,%xmm2
- vpxor %xmm15,%xmm4,%xmm3
- vmovdqu (%rsi),%ymm6
- vmovdqu 0x20(%rsi),%ymm7
- addq $0x40,%rsi
- leaq SIGMA2(%rip),%rax
- movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
+ vmovdqu (CTX),%xmm0 // Load h[0..3]
+ vmovdqu 16(CTX),%xmm1 // Load h[4..7]
+ vmovdqu 32(CTX),%xmm4 // Load t and f
+ vmovd INC,%xmm5 // Load inc
+ vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3]
+ vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7]
+ jmp .Lavx512_mainloop
+
+ .align 32
+.Lavx512_mainloop:
+ // Main loop: each iteration processes one 64-byte block.
+ vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
+ vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
+ vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition)
+ vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3]
+ vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
+ vmovdqu (DATA),%ymm6 // Load first 8 data words
+ vmovdqu 32(DATA),%ymm7 // Load second 8 data words
+ addq $64,DATA
+ leaq .Lsigma2(%rip),%rax
+ movb $10,%cl // Set num rounds remaining
+
+.Lavx512_roundloop:
+ // Round loop: each iteration does 1 round (of 10 rounds total).
vpmovzxbd (%rax),%ymm8
- vpmovzxbd 0x8(%rax),%ymm9
- addq $0x10,%rax
+ vpmovzxbd 8(%rax),%ymm9
+ addq $16,%rax
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
@@ -203,50 +241,51 @@ SYM_FUNC_START(blake2s_compress_avx512)
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm8,%xmm8
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x10,%xmm3,%xmm3
+ vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0xc,%xmm1,%xmm1
- vextracti128 $0x1,%ymm9,%xmm9
+ vprord $12,%xmm1,%xmm1
+ vextracti128 $1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
- vprord $0x8,%xmm3,%xmm3
+ vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
- vprord $0x7,%xmm1,%xmm1
+ vprord $7,%xmm1,%xmm1
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
- jne .Lblake2s_compress_avx512_roundloop
- vpxor %xmm10,%xmm0,%xmm0
- vpxor %xmm11,%xmm1,%xmm1
- vpxor %xmm2,%xmm0,%xmm0
- vpxor %xmm3,%xmm1,%xmm1
- decq %rdx
- jne .Lblake2s_compress_avx512_mainloop
- vmovdqu %xmm0,(%rdi)
- vmovdqu %xmm1,0x10(%rdi)
- vmovdqu %xmm4,0x20(%rdi)
+ jne .Lavx512_roundloop
+
+ // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
+ vpternlogd $0x96,%xmm10,%xmm2,%xmm0
+ vpternlogd $0x96,%xmm11,%xmm3,%xmm1
+ decq NBLOCKS
+ jne .Lavx512_mainloop
+
+ vmovdqu %xmm0,(CTX) // Store new h[0..3]
+ vmovdqu %xmm1,16(CTX) // Store new h[4..7]
+ vmovq %xmm4,32(CTX) // Store new t (f is unchanged)
vzeroupper
RET
SYM_FUNC_END(blake2s_compress_avx512)
diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h
index b6d30d2fa045..f8eed6cb042e 100644
--- a/lib/crypto/x86/blake2s.h
+++ b/lib/crypto/x86/blake2s.h
@@ -11,24 +11,22 @@
#include <linux/kernel.h>
#include <linux/sizes.h>
-asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
-asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
- const u8 *block, const size_t nblocks,
- const u32 inc);
+asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
-static void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
+static void blake2s_compress(struct blake2s_ctx *ctx,
+ const u8 *data, size_t nblocks, u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
- blake2s_compress_generic(state, block, nblocks, inc);
+ blake2s_compress_generic(ctx, data, nblocks, inc);
return;
}
@@ -38,13 +36,13 @@ static void blake2s_compress(struct blake2s_state *state, const u8 *block,
kernel_fpu_begin();
if (static_branch_likely(&blake2s_use_avx512))
- blake2s_compress_avx512(state, block, blocks, inc);
+ blake2s_compress_avx512(ctx, data, blocks, inc);
else
- blake2s_compress_ssse3(state, block, blocks, inc);
+ blake2s_compress_ssse3(ctx, data, blocks, inc);
kernel_fpu_end();
+ data += blocks * BLAKE2S_BLOCK_SIZE;
nblocks -= blocks;
- block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
diff --git a/lib/crypto/x86/polyval-pclmul-avx.S b/lib/crypto/x86/polyval-pclmul-avx.S
new file mode 100644
index 000000000000..7f739465ad35
--- /dev/null
+++ b/lib/crypto/x86/polyval-pclmul-avx.S
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Google LLC
+ */
+/*
+ * This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
+ * instructions. It works on 8 blocks at a time, by precomputing the first 8
+ * key powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
+ * allows us to split finite field multiplication into two steps.
+ *
+ * In the first step, we consider h^i, m_i as normal polynomials of degree less
+ * than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
+ * is simply polynomial multiplication.
+ *
+ * In the second step, we compute the reduction of p(x) modulo the finite field
+ * modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
+ * multiplication is finite field multiplication. The advantage is that the
+ * two-step process only requires 1 finite field reduction for every 8
+ * polynomial multiplications. Further parallelism is gained by interleaving the
+ * multiplications and polynomial reductions.
+ */
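+
+/*
+ * As a sketch of why this is equivalent: with running accumulator SUM and the
+ * usual per-block Horner update SUM := (SUM + m_i) * h in the POLYVAL field,
+ * consuming m_0, ..., m_7 one block at a time yields
+ *
+ *   SUM' = (SUM + m_0)h^8 + m_1h^7 + ... + m_7h^1
+ *
+ * which is what an 8-block stride computes directly: SUM is folded into m_0,
+ * the eight products are accumulated as plain polynomial multiplications, and
+ * a single field reduction is done per stride (see full_stride below).
+ */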
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define STRIDE_BLOCKS 8
+
+#define GSTAR %xmm7
+#define PL %xmm8
+#define PH %xmm9
+#define TMP_XMM %xmm11
+#define LO %xmm12
+#define HI %xmm13
+#define MI %xmm14
+#define SUM %xmm15
+
+#define ACCUMULATOR %rdi
+#define KEY_POWERS %rsi
+#define MSG %rdx
+#define BLOCKS_LEFT %rcx
+#define TMP %rax
+
+.section .rodata.cst16.gstar, "aM", @progbits, 16
+.align 16
+
+.Lgstar:
+ .quad 0xc200000000000000, 0xc200000000000000
+
+.text
+
+/*
+ * Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
+ * count pointed to by MSG and KEY_POWERS.
+ */
+.macro schoolbook1 count
+ .set i, 0
+ .rept (\count)
+ schoolbook1_iteration i 0
+ .set i, (i +1)
+ .endr
+.endm
+
+/*
+ * Computes the product of two 128-bit polynomials at the memory locations
+ * specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
+ * the 256-bit product into LO, MI, HI.
+ *
+ * Given:
+ * X = [X_1 : X_0]
+ * Y = [Y_1 : Y_0]
+ *
+ * We compute:
+ * LO += X_0 * Y_0
+ * MI += X_0 * Y_1 + X_1 * Y_0
+ * HI += X_1 * Y_1
+ *
+ * Later, the 256-bit result can be extracted as:
+ * [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ * This step is done when computing the polynomial reduction for efficiency
+ * reasons.
+ *
+ * If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
+ * extra multiplication of SUM and h^8.
+ */
+.macro schoolbook1_iteration i xor_sum
+ movups (16*\i)(MSG), %xmm0
+ .if (\i == 0 && \xor_sum == 1)
+ pxor SUM, %xmm0
+ .endif
+ vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
+ vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
+ vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
+ vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
+ vpxor %xmm2, MI, MI
+ vpxor %xmm1, LO, LO
+ vpxor %xmm4, HI, HI
+ vpxor %xmm3, MI, MI
+.endm
+
+/*
+ * Performs the same computation as schoolbook1_iteration, except we expect the
+ * arguments to already be loaded into xmm0 and xmm1 and we set the result
+ * registers LO, MI, and HI directly rather than XOR'ing into them.
+ */
+.macro schoolbook1_noload
+ vpclmulqdq $0x01, %xmm0, %xmm1, MI
+ vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
+ vpclmulqdq $0x00, %xmm0, %xmm1, LO
+ vpclmulqdq $0x11, %xmm0, %xmm1, HI
+ vpxor %xmm2, MI, MI
+.endm
+
+/*
+ * Computes the 256-bit polynomial represented by LO, HI, MI. Stores
+ * the result in PL, PH.
+ * [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
+ */
+.macro schoolbook2
+ vpslldq $8, MI, PL
+ vpsrldq $8, MI, PH
+ pxor LO, PL
+ pxor HI, PH
+.endm
+
+/*
+ * Computes the 128-bit reduction of PH : PL. Stores the result in dest.
+ *
+ * This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
+ * x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
+ * product of two 128-bit polynomials in Montgomery form. We need to reduce it
+ * mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
+ * of x^128, this product has two extra factors of x^128. To get it back into
+ * Montgomery form, we need to remove one of these factors by dividing by x^128.
+ *
+ * To accomplish both of these goals, we add multiples of g(x) that cancel out
+ * the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
+ * bits are zero, the polynomial division by x^128 can be done by right shifting.
+ *
+ * Since the only nonzero term in the low 64 bits of g(x) is the constant term,
+ * the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
+ * only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
+ * x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
+ * the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
+ * = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
+ *
+ * Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
+ * 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
+ * + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
+ * x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
+ * P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
+ *
+ * So our final computation is:
+ * T = T_1 : T_0 = g*(x) * P_0
+ * V = V_1 : V_0 = g*(x) * (P_1 + T_0)
+ * p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
+ *
+ * The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
+ * + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
+ * T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
+ */
+.macro montgomery_reduction dest
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
+ pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
+ pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
+ pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
+ pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
+ vpxor TMP_XMM, PH, \dest
+.endm
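
A matching C model of this reduction (again a sketch only, reusing the
hypothetical clmul64() from the earlier sketch; p[0]..p[3] are P_0..P_3 as
64-bit quadwords with bit i representing x^i, as vpclmulqdq sees them):

	/* 256 -> 128-bit reduction mod g(x) = x^128 + x^127 + x^126 + x^121 + 1 */
	static void montgomery_reduce(const u64 p[4], u64 out[2])
	{
		const u64 gstar = 0xc200000000000000ULL; /* bits 64-127 of g(x) */
		u64 t0, t1, v0, v1;

		clmul64(gstar, p[0], &t0, &t1);		/* T = g*(x) * P_0 */
		clmul64(gstar, p[1] ^ t0, &v0, &v1);	/* V = g*(x) * (P_1 + T_0) */
		out[0] = p[2] ^ p[0] ^ t1 ^ v0;		/* P_2 + P_0 + T_1 + V_0 */
		out[1] = p[3] ^ p[1] ^ t0 ^ v1;		/* P_3 + P_1 + T_0 + V_1 */
	}
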
+
+/*
+ * Compute schoolbook multiplication for 8 blocks
+ * m_0h^8 + ... + m_7h^1
+ *
+ * If reduce is set, also computes the Montgomery reduction of the
+ * previous full_stride call and XORs it into the first message block:
+ * (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
+ * I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
+ */
+.macro full_stride reduce
+ pxor LO, LO
+ pxor HI, HI
+ pxor MI, MI
+
+ schoolbook1_iteration 7 0
+ .if \reduce
+ vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 6 0
+ .if \reduce
+ pshufd $0b01001110, TMP_XMM, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 5 0
+ .if \reduce
+ pxor PL, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 4 0
+ .if \reduce
+ pxor TMP_XMM, PH
+ .endif
+
+ schoolbook1_iteration 3 0
+ .if \reduce
+ pclmulqdq $0x11, GSTAR, TMP_XMM
+ .endif
+
+ schoolbook1_iteration 2 0
+ .if \reduce
+ vpxor TMP_XMM, PH, SUM
+ .endif
+
+ schoolbook1_iteration 1 0
+
+ schoolbook1_iteration 0 1
+
+ addq $(8*16), MSG
+ schoolbook2
+.endm
+
+/*
+ * Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
+ */
+.macro partial_stride
+ mov BLOCKS_LEFT, TMP
+ shlq $4, TMP
+ addq $(16*STRIDE_BLOCKS), KEY_POWERS
+ subq TMP, KEY_POWERS
+
+ movups (MSG), %xmm0
+ pxor SUM, %xmm0
+ movups (KEY_POWERS), %xmm1
+ schoolbook1_noload
+ dec BLOCKS_LEFT
+ addq $16, MSG
+ addq $16, KEY_POWERS
+
+ test $4, BLOCKS_LEFT
+ jz .Lpartial4BlocksDone
+ schoolbook1 4
+ addq $(4*16), MSG
+ addq $(4*16), KEY_POWERS
+.Lpartial4BlocksDone:
+ test $2, BLOCKS_LEFT
+ jz .Lpartial2BlocksDone
+ schoolbook1 2
+ addq $(2*16), MSG
+ addq $(2*16), KEY_POWERS
+.Lpartial2BlocksDone:
+ test $1, BLOCKS_LEFT
+ jz .LpartialDone
+ schoolbook1 1
+.LpartialDone:
+ schoolbook2
+ montgomery_reduction SUM
+.endm
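
For reference, the dispatch on the trailing block count can be modelled as
follows (a sketch only, reusing the placeholder u128/u256 types and helpers
from the earlier stride8() sketch):

	/* 0 < n < 8 trailing blocks; the first block pairs with h^n. */
	static struct u128 partial_stride_model(struct u128 sum,
						const struct u128 h_pow[8],
						const struct u128 *m, size_t n)
	{
		const struct u128 *kp = &h_pow[8 - n];
		struct u256 acc;

		/* the first block also absorbs the running accumulator */
		acc = clmul_256(xor_128(m[0], sum), kp[0]);
		m++; kp++; n--;

		if (n & 4) {
			for (int i = 0; i < 4; i++)
				acc = xor_256(acc, clmul_256(m[i], kp[i]));
			m += 4; kp += 4;
		}
		if (n & 2) {
			for (int i = 0; i < 2; i++)
				acc = xor_256(acc, clmul_256(m[i], kp[i]));
			m += 2; kp += 2;
		}
		if (n & 1)
			acc = xor_256(acc, clmul_256(m[0], kp[0]));

		return gf128_reduce(acc);
	}
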
+
+/*
+ * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
+ *
+ * void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ * const struct polyval_elem *b);
+ */
+SYM_FUNC_START(polyval_mul_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (%rdi), %xmm0
+ movups (%rsi), %xmm1
+ schoolbook1_noload
+ schoolbook2
+ montgomery_reduction SUM
+ movups SUM, (%rdi)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_mul_pclmul_avx)
+
+/*
+ * Perform polynomial evaluation as specified by POLYVAL. This computes:
+ * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
+ * where n=nblocks, h is the hash key, and m_i are the message blocks.
+ *
+ * rdi - pointer to the accumulator
+ * rsi - pointer to precomputed key powers h^8 ... h^1
+ * rdx - pointer to message blocks
+ * rcx - number of blocks to hash
+ *
+ * void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ * const struct polyval_key *key,
+ * const u8 *data, size_t nblocks);
+ */
+SYM_FUNC_START(polyval_blocks_pclmul_avx)
+ FRAME_BEGIN
+ vmovdqa .Lgstar(%rip), GSTAR
+ movups (ACCUMULATOR), SUM
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExit
+ full_stride 0
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ js .LstrideLoopExitReduce
+.LstrideLoop:
+ full_stride 1
+ subq $STRIDE_BLOCKS, BLOCKS_LEFT
+ jns .LstrideLoop
+.LstrideLoopExitReduce:
+ montgomery_reduction SUM
+.LstrideLoopExit:
+ add $STRIDE_BLOCKS, BLOCKS_LEFT
+ jz .LskipPartial
+ partial_stride
+.LskipPartial:
+ movups SUM, (ACCUMULATOR)
+ FRAME_END
+ RET
+SYM_FUNC_END(polyval_blocks_pclmul_avx)
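
Putting the macros together, the overall control flow of this function can be
modelled roughly as below (a sketch only, using the placeholder stride8() and
partial_stride_model() helpers from the earlier sketches; the model flattens
out the interleaving of each stride's reduction with the next stride's
multiplications):

	static void polyval_blocks_model(struct u128 *acc,
					 const struct u128 h_pow[8],
					 const struct u128 *m, size_t nblocks)
	{
		struct u128 sum = *acc;

		while (nblocks >= 8) {		/* full_stride */
			sum = stride8(sum, h_pow, m);
			m += 8;
			nblocks -= 8;
		}
		if (nblocks)			/* partial_stride */
			sum = partial_stride_model(sum, h_pow, m, nblocks);
		*acc = sum;
	}
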
diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h
new file mode 100644
index 000000000000..ef8797521420
--- /dev/null
+++ b/lib/crypto/x86/polyval.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * POLYVAL library functions, x86_64 optimized
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/fpu/api.h>
+#include <linux/cpufeature.h>
+
+#define NUM_H_POWERS 8
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
+
+asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
+ const struct polyval_elem *b);
+asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks);
+
+static void polyval_preparekey_arch(struct polyval_key *key,
+ const u8 raw_key[POLYVAL_BLOCK_SIZE])
+{
+ static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
+ memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_pclmul_avx(
+ &key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ kernel_fpu_end();
+ } else {
+ for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+ key->h_powers[i] = key->h_powers[i + 1];
+ polyval_mul_generic(&key->h_powers[i],
+ &key->h_powers[NUM_H_POWERS - 1]);
+ }
+ }
+}
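
For clarity, the key layout produced by the loop above (and consumed by the
assembly's KEY_POWERS argument) is, with h denoting the raw key:

	/*
	 * key->h_powers[0] = h^8  (paired with the first block of a stride)
	 * key->h_powers[1] = h^7
	 * ...
	 * key->h_powers[7] = h^1  (also the value used by polyval_mul_arch())
	 */
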
+
+static void polyval_mul_arch(struct polyval_elem *acc,
+ const struct polyval_key *key)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ kernel_fpu_begin();
+ polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ kernel_fpu_end();
+ } else {
+ polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
+ }
+}
+
+static void polyval_blocks_arch(struct polyval_elem *acc,
+ const struct polyval_key *key,
+ const u8 *data, size_t nblocks)
+{
+ if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
+ do {
+ /* Allow rescheduling every 4 KiB. */
+ size_t n = min_t(size_t, nblocks,
+ 4096 / POLYVAL_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ polyval_blocks_pclmul_avx(acc, key, data, n);
+ kernel_fpu_end();
+ data += n * POLYVAL_BLOCK_SIZE;
+ nblocks -= n;
+ } while (nblocks);
+ } else {
+ polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
+ data, nblocks);
+ }
+}
+
+#define polyval_mod_init_arch polyval_mod_init_arch
+static void polyval_mod_init_arch(void)
+{
+ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
+ boot_cpu_has(X86_FEATURE_AVX))
+ static_branch_enable(&have_pclmul_avx);
+}