Diffstat (limited to 'arch/arm')
-rw-r--r-- | arch/arm/Kconfig | 2
-rw-r--r-- | arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts | 14
-rw-r--r-- | arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts | 4
-rw-r--r-- | arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts | 4
-rw-r--r-- | arch/arm/boot/dts/nxp/imx/imx6ul.dtsi | 2
-rw-r--r-- | arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts | 2
-rw-r--r-- | arch/arm/crypto/Kconfig | 16
-rw-r--r-- | arch/arm/crypto/Makefile | 2
-rw-r--r-- | arch/arm/crypto/blake2b-neon-core.S | 347
-rw-r--r-- | arch/arm/crypto/blake2b-neon-glue.c | 104
-rw-r--r-- | arch/arm/include/asm/simd.h | 7
-rw-r--r-- | arch/arm/include/asm/uaccess.h | 26
-rw-r--r-- | arch/arm/tools/syscall.tbl | 1
13 files changed, 55 insertions, 476 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2e3f93b690f4..4fb985b76e97 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -44,6 +44,8 @@ config ARM
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_USE_MEMTEST
+	# https://github.com/llvm/llvm-project/commit/d130f402642fba3d065aacb506cb061c899558de
+	select ARCH_USES_CFI_GENERIC_LLVM_PASS if CLANG_VERSION < 220000
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_GENERAL_HUGETLB
 	select ARCH_WANT_IPC_PARSE_VERSION
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
index aa9576d8ab56..48ca25f57ef6 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
@@ -1254,3 +1254,17 @@
 	max-frequency = <25000000>;
 	bus-width = <4>;
 };
+
+/*
+ * FIXME: rgmii delay is introduced by MAC (configured in u-boot now)
+ * instead of PCB on fuji board, so the "phy-mode" should be updated to
+ * "rgmii-[tx|rx]id" when the aspeed-mac driver can handle the delay
+ * properly.
+ */
+&mac3 {
+	status = "okay";
+	phy-mode = "rgmii";
+	phy-handle = <&ethphy3>;
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rgmii4_default>;
+};
diff --git a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
index ac44c745bdf8..a39a021a3910 100644
--- a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
+++ b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
@@ -55,8 +55,8 @@
 	mdio {
 		/delete-node/ switch@1e;
 
-		bcm54210e: ethernet-phy@0 {
-			reg = <0>;
+		bcm54210e: ethernet-phy@25 {
+			reg = <25>;
 		};
 	};
 };
diff --git a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
index 06545a6052f7..43ff5eafb2bb 100644
--- a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
+++ b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
@@ -259,7 +259,7 @@
 	pinctrl-0 = <&pinctrl_audmux>;
 	status = "okay";
 
-	ssi2 {
+	mux-ssi2 {
 		fsl,audmux-port = <1>;
 		fsl,port-config = <
 			(IMX_AUDMUX_V2_PTCR_SYN |
@@ -271,7 +271,7 @@
 		>;
 	};
 
-	aud3 {
+	mux-aud3 {
 		fsl,audmux-port = <2>;
 		fsl,port-config = <
 			IMX_AUDMUX_V2_PTCR_SYN
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
index 6de224dd2bb9..6eb80f867f50 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
@@ -339,7 +339,7 @@
 	#sound-dai-cells = <0>;
 	compatible = "fsl,imx6ul-sai", "fsl,imx6sx-sai";
 	reg = <0x02030000 0x4000>;
-	interrupts = <GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>;
+	interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>;
 	clocks = <&clks IMX6UL_CLK_SAI3_IPG>,
 		 <&clks IMX6UL_CLK_SAI3>,
 		 <&clks IMX6UL_CLK_DUMMY>, <&clks IMX6UL_CLK_DUMMY>;
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
index 107b00b9a939..540642e99a41 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
+++ b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
@@ -136,7 +136,7 @@
 	interrupt-parent = <&gpio2>;
 	interrupts = <8 IRQ_TYPE_EDGE_FALLING>;
 	reset-gpios = <&gpio2 14 GPIO_ACTIVE_LOW>;
-	report-rate-hz = <6>;
+	report-rate-hz = <60>;
 	/* settings valid only for Hycon touchscreen */
 	touchscreen-size-x = <1280>;
 	touchscreen-size-y = <800>;
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index c436eec22d86..f30d743df264 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_BLAKE2B_NEON
-	tristate "Hash functions: BLAKE2b (NEON)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_BLAKE2B
-	help
-	  BLAKE2b cryptographic hash function (RFC 7693)
-
-	  Architecture: arm using
-	  - NEON (Advanced SIMD) extensions
-
-	  BLAKE2b digest algorithm optimized with ARM NEON instructions.
-	  On ARM processors that have NEON support but not the ARMv8
-	  Crypto Extensions, typically this BLAKE2b implementation is
-	  much faster than the SHA-2 family and slightly faster than
-	  SHA-1.
-
 config CRYPTO_AES_ARM
 	tristate "Ciphers: AES"
 	select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 6346a73effc0..86dd43313dbf 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -5,7 +5,6 @@
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
@@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 
 aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S
deleted file mode 100644
index 0406a186377f..000000000000
--- a/arch/arm/crypto/blake2b-neon-core.S
+++ /dev/null
@@ -1,347 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2b digest algorithm, NEON accelerated
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
-	.text
-	.fpu		neon
-
-	// The arguments to blake2b_compress_neon()
-	STATE		.req	r0
-	BLOCK		.req	r1
-	NBLOCKS		.req	r2
-	INC		.req	r3
-
-	// Pointers to the rotation tables
-	ROR24_TABLE	.req	r4
-	ROR16_TABLE	.req	r5
-
-	// The original stack pointer
-	ORIG_SP		.req	r6
-
-	// NEON registers which contain the message words of the current block.
-	// M_0-M_3 are occasionally used for other purposes too.
-	M_0		.req	d16
-	M_1		.req	d17
-	M_2		.req	d18
-	M_3		.req	d19
-	M_4		.req	d20
-	M_5		.req	d21
-	M_6		.req	d22
-	M_7		.req	d23
-	M_8		.req	d24
-	M_9		.req	d25
-	M_10		.req	d26
-	M_11		.req	d27
-	M_12		.req	d28
-	M_13		.req	d29
-	M_14		.req	d30
-	M_15		.req	d31
-
-	.align		4
-	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
-	// instruction.  This is the most efficient way to implement these
-	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
-	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
-.Lror24_table:
-	.byte		3, 4, 5, 6, 7, 0, 1, 2
-.Lror16_table:
-	.byte		2, 3, 4, 5, 6, 7, 0, 1
-	// The BLAKE2b initialization vector
-.Lblake2b_IV:
-	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
-	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
-	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
-	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-
-// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
-// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
-// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
-// (M_0-M_3), so that they can be reloaded if they are used as temporary
-// registers.  The macro arguments s0-s15 give the order in which the message
-// words are used in this round.  'final' is 1 if this is the final round.
-.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
-			s8, s9, s10, s11, s12, s13, s14, s15, final=0
-
-	// Mix the columns:
-	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
-	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
-
-	// a += b + m[blake2b_sigma[r][2*i + 0]];
-	vadd.u64	q0, q0, q2
-	vadd.u64	q1, q1, q3
-	vadd.u64	d0, d0, M_\s0
-	vadd.u64	d1, d1, M_\s2
-	vadd.u64	d2, d2, M_\s4
-	vadd.u64	d3, d3, M_\s6
-
-	// d = ror64(d ^ a, 32);
-	veor		q6, q6, q0
-	veor		q7, q7, q1
-	vrev64.32	q6, q6
-	vrev64.32	q7, q7
-
-	// c += d;
-	vadd.u64	q4, q4, q6
-	vadd.u64	q5, q5, q7
-
-	// b = ror64(b ^ c, 24);
-	vld1.8		{M_0}, [ROR24_TABLE, :64]
-	veor		q2, q2, q4
-	veor		q3, q3, q5
-	vtbl.8		d4, {d4}, M_0
-	vtbl.8		d5, {d5}, M_0
-	vtbl.8		d6, {d6}, M_0
-	vtbl.8		d7, {d7}, M_0
-
-	// a += b + m[blake2b_sigma[r][2*i + 1]];
-	//
-	// M_0 got clobbered above, so we have to reload it if any of the four
-	// message words this step needs happens to be M_0.  Otherwise we don't
-	// need to reload it here, as it will just get clobbered again below.
-.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
-	vld1.8		{M_0}, [sp, :64]
-.endif
-	vadd.u64	q0, q0, q2
-	vadd.u64	q1, q1, q3
-	vadd.u64	d0, d0, M_\s1
-	vadd.u64	d1, d1, M_\s3
-	vadd.u64	d2, d2, M_\s5
-	vadd.u64	d3, d3, M_\s7
-
-	// d = ror64(d ^ a, 16);
-	vld1.8		{M_0}, [ROR16_TABLE, :64]
-	veor		q6, q6, q0
-	veor		q7, q7, q1
-	vtbl.8		d12, {d12}, M_0
-	vtbl.8		d13, {d13}, M_0
-	vtbl.8		d14, {d14}, M_0
-	vtbl.8		d15, {d15}, M_0
-
-	// c += d;
-	vadd.u64	q4, q4, q6
-	vadd.u64	q5, q5, q7
-
-	// b = ror64(b ^ c, 63);
-	//
-	// This rotation amount isn't a multiple of 8, so it has to be
-	// implemented using a pair of shifts, which requires temporary
-	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
-	veor		q8, q2, q4
-	veor		q9, q3, q5
-	vshr.u64	q2, q8, #63
-	vshr.u64	q3, q9, #63
-	vsli.u64	q2, q8, #1
-	vsli.u64	q3, q9, #1
-	vld1.8		{q8-q9}, [sp, :256]
-
-	// Mix the diagonals:
-	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
-	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
-	//
-	// There are two possible ways to do this: use 'vext' instructions to
-	// shift the rows of the matrix so that the diagonals become columns,
-	// and undo it afterwards; or just use 64-bit operations on 'd'
-	// registers instead of 128-bit operations on 'q' registers.  We use the
-	// latter approach, as it performs much better on Cortex-A7.
-
-	// a += b + m[blake2b_sigma[r][2*i + 0]];
-	vadd.u64	d0, d0, d5
-	vadd.u64	d1, d1, d6
-	vadd.u64	d2, d2, d7
-	vadd.u64	d3, d3, d4
-	vadd.u64	d0, d0, M_\s8
-	vadd.u64	d1, d1, M_\s10
-	vadd.u64	d2, d2, M_\s12
-	vadd.u64	d3, d3, M_\s14
-
-	// d = ror64(d ^ a, 32);
-	veor		d15, d15, d0
-	veor		d12, d12, d1
-	veor		d13, d13, d2
-	veor		d14, d14, d3
-	vrev64.32	d15, d15
-	vrev64.32	d12, d12
-	vrev64.32	d13, d13
-	vrev64.32	d14, d14
-
-	// c += d;
-	vadd.u64	d10, d10, d15
-	vadd.u64	d11, d11, d12
-	vadd.u64	d8, d8, d13
-	vadd.u64	d9, d9, d14
-
-	// b = ror64(b ^ c, 24);
-	vld1.8		{M_0}, [ROR24_TABLE, :64]
-	veor		d5, d5, d10
-	veor		d6, d6, d11
-	veor		d7, d7, d8
-	veor		d4, d4, d9
-	vtbl.8		d5, {d5}, M_0
-	vtbl.8		d6, {d6}, M_0
-	vtbl.8		d7, {d7}, M_0
-	vtbl.8		d4, {d4}, M_0
-
-	// a += b + m[blake2b_sigma[r][2*i + 1]];
-.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
-	vld1.8		{M_0}, [sp, :64]
-.endif
-	vadd.u64	d0, d0, d5
-	vadd.u64	d1, d1, d6
-	vadd.u64	d2, d2, d7
-	vadd.u64	d3, d3, d4
-	vadd.u64	d0, d0, M_\s9
-	vadd.u64	d1, d1, M_\s11
-	vadd.u64	d2, d2, M_\s13
-	vadd.u64	d3, d3, M_\s15
-
-	// d = ror64(d ^ a, 16);
-	vld1.8		{M_0}, [ROR16_TABLE, :64]
-	veor		d15, d15, d0
-	veor		d12, d12, d1
-	veor		d13, d13, d2
-	veor		d14, d14, d3
-	vtbl.8		d12, {d12}, M_0
-	vtbl.8		d13, {d13}, M_0
-	vtbl.8		d14, {d14}, M_0
-	vtbl.8		d15, {d15}, M_0
-
-	// c += d;
-	vadd.u64	d10, d10, d15
-	vadd.u64	d11, d11, d12
-	vadd.u64	d8, d8, d13
-	vadd.u64	d9, d9, d14
-
-	// b = ror64(b ^ c, 63);
-	veor		d16, d4, d9
-	veor		d17, d5, d10
-	veor		d18, d6, d11
-	veor		d19, d7, d8
-	vshr.u64	q2, q8, #63
-	vshr.u64	q3, q9, #63
-	vsli.u64	q2, q8, #1
-	vsli.u64	q3, q9, #1
-	// Reloading q8-q9 can be skipped on the final round.
-.if ! \final
-	vld1.8		{q8-q9}, [sp, :256]
-.endif
-.endm
-
-//
-// void blake2b_compress_neon(struct blake2b_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2b_state are used:
-//	u64 h[8];	(inout)
-//	u64 t[2];	(inout)
-//	u64 f[2];	(in)
-//
-	.align		5
-ENTRY(blake2b_compress_neon)
-	push		{r4-r10}
-
-	// Allocate a 32-byte stack buffer that is 32-byte aligned.
-	mov		ORIG_SP, sp
-	sub		ip, sp, #32
-	bic		ip, ip, #31
-	mov		sp, ip
-
-	adr		ROR24_TABLE, .Lror24_table
-	adr		ROR16_TABLE, .Lror16_table
-
-	mov		ip, STATE
-	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
-	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
-.Lnext_block:
-	adr		r10, .Lblake2b_IV
-	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
-	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
-	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
-	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
-	adds		r7, r7, INC		// Increment counter
-	bcs		.Lslow_inc_ctr
-	vmov.i32	d28[0], r7
-	vst1.64		{d28}, [ip]		// Update t[0]
-.Linc_ctr_done:
-
-	// Load the next message block and finish initializing the state matrix
-	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
-	// entire state matrix in q0-q7 and the entire message block in q8-15.
-	//
-	// However, _blake2b_round also needs some extra registers for rotates,
-	// so we have to spill some registers.  It's better to spill the message
-	// registers than the state registers, as the message doesn't change.
-	// Therefore we store a copy of the first 32 bytes of the message block
-	// (q8-q9) in an aligned buffer on the stack so that they can be
-	// reloaded when needed.  (We could just reload directly from the
-	// message buffer, but it's faster to use aligned loads.)
-	vld1.8		{q8-q9}, [BLOCK]!
-	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
-	vld1.8		{q10-q11}, [BLOCK]!
-	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
-	vld1.8		{q12-q13}, [BLOCK]!
-	vst1.8		{q8-q9}, [sp, :256]
-	mov		ip, STATE
-	vld1.8		{q14-q15}, [BLOCK]!
-
-	// Execute the rounds.  Each round is provided the order in which it
-	// needs to use the message words.
-	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
-	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
-	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
-	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
-	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
-	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
-	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
-	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
-	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
-			final=1
-
-	// Fold the final state matrix into the hash chaining value:
-	//
-	//	for (i = 0; i < 8; i++)
-	//		h[i] ^= v[i] ^ v[i + 8];
-	//
-	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
-	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
-	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
-	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
-	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
-	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
-	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
-	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
-	mov		ip, STATE
-	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
-	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
-	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
-	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
-	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]
-
-	// Advance to the next block, if there is one.
-	bne		.Lnext_block	// nblocks != 0?
-
-	mov		sp, ORIG_SP
-	pop		{r4-r10}
-	mov		pc, lr
-
-.Lslow_inc_ctr:
-	// Handle the case where the counter overflowed its low 32 bits, by
-	// carrying the overflow bit into the full 128-bit counter.
-	vmov		r9, r10, d29
-	adcs		r8, r8, #0
-	adcs		r9, r9, #0
-	adc		r10, r10, #0
-	vmov		d28, r7, r8
-	vmov		d29, r9, r10
-	vst1.64		{q14}, [ip]	// Update t[0] and t[1]
-	b		.Linc_ctr_done
-ENDPROC(blake2b_compress_neon)
diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c
deleted file mode 100644
index 2ff443a91724..000000000000
--- a/arch/arm/crypto/blake2b-neon-glue.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * BLAKE2b digest algorithm, NEON accelerated
- *
- * Copyright 2020 Google LLC
- */
-
-#include <crypto/internal/blake2b.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
-				      const u8 *block, size_t nblocks, u32 inc);
-
-static void blake2b_compress_arch(struct blake2b_state *state,
-				  const u8 *block, size_t nblocks, u32 inc)
-{
-	do {
-		const size_t blocks = min_t(size_t, nblocks,
-					    SZ_4K / BLAKE2B_BLOCK_SIZE);
-
-		kernel_neon_begin();
-		blake2b_compress_neon(state, block, blocks, inc);
-		kernel_neon_end();
-
-		nblocks -= blocks;
-		block += blocks * BLAKE2B_BLOCK_SIZE;
-	} while (nblocks);
-}
-
-static int crypto_blake2b_update_neon(struct shash_desc *desc,
-				      const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch);
-}
-
-static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in,
-				     unsigned int inlen, u8 *out)
-{
-	return crypto_blake2b_finup(desc, in, inlen, out,
-				    blake2b_compress_arch);
-}
-
-#define BLAKE2B_ALG(name, driver_name, digest_size) \
-	{ \
-		.base.cra_name		= name, \
-		.base.cra_driver_name	= driver_name, \
-		.base.cra_priority	= 200, \
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY | \
-					  CRYPTO_AHASH_ALG_BLOCK_ONLY | \
-					  CRYPTO_AHASH_ALG_FINAL_NONZERO, \
-		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE, \
-		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE, \
-		.digestsize		= digest_size, \
-		.setkey			= crypto_blake2b_setkey, \
-		.init			= crypto_blake2b_init, \
-		.update			= crypto_blake2b_update_neon, \
-		.finup			= crypto_blake2b_finup_neon, \
-		.descsize		= sizeof(struct blake2b_state), \
-		.statesize		= BLAKE2B_STATE_SIZE, \
-	}
-
-static struct shash_alg blake2b_neon_algs[] = {
-	BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
-};
-
-static int __init blake2b_neon_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
-
-	return crypto_register_shashes(blake2b_neon_algs,
-				       ARRAY_SIZE(blake2b_neon_algs));
-}
-
-static void __exit blake2b_neon_mod_exit(void)
-{
-	crypto_unregister_shashes(blake2b_neon_algs,
-				  ARRAY_SIZE(blake2b_neon_algs));
-}
-
-module_init(blake2b_neon_mod_init);
-module_exit(blake2b_neon_mod_exit);
-
-MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("blake2b-160");
-MODULE_ALIAS_CRYPTO("blake2b-160-neon");
-MODULE_ALIAS_CRYPTO("blake2b-256");
-MODULE_ALIAS_CRYPTO("blake2b-256-neon");
-MODULE_ALIAS_CRYPTO("blake2b-384");
-MODULE_ALIAS_CRYPTO("blake2b-384-neon");
-MODULE_ALIAS_CRYPTO("blake2b-512");
-MODULE_ALIAS_CRYPTO("blake2b-512-neon");
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
index be08a8da046f..8549fa8b7253 100644
--- a/arch/arm/include/asm/simd.h
+++ b/arch/arm/include/asm/simd.h
@@ -2,14 +2,21 @@
 #ifndef _ASM_SIMD_H
 #define _ASM_SIMD_H
 
+#include <linux/cleanup.h>
 #include <linux/compiler_attributes.h>
 #include <linux/preempt.h>
 #include <linux/types.h>
 
+#include <asm/neon.h>
+
 static __must_check inline bool may_use_simd(void)
 {
 	return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	       !in_hardirq() && !irqs_disabled();
 }
 
+DEFINE_LOCK_GUARD_0(ksimd, kernel_neon_begin(), kernel_neon_end())
+
+#define scoped_ksimd()	scoped_guard(ksimd)
+
 #endif /* _ASM_SIMD_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index f90be312418e..d6ae80b5df36 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -283,10 +283,17 @@ extern int __put_user_8(void *, unsigned long long);
 		__gu_err;						\
 	})
 
+/*
+ * This is a type: either unsigned long, if the argument fits into
+ * that type, or otherwise unsigned long long.
+ */
+#define __long_type(x) \
+	__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
+
 #define __get_user_err(x, ptr, err, __t)				\
 do {									\
 	unsigned long __gu_addr = (unsigned long)(ptr);			\
-	unsigned long __gu_val;						\
+	__long_type(x) __gu_val;					\
 	unsigned int __ua_flags;					\
 	__chk_user_ptr(ptr);						\
 	might_fault();							\
@@ -295,6 +302,7 @@ do {									\
 	case 1: __get_user_asm_byte(__gu_val, __gu_addr, err, __t); break;	\
 	case 2: __get_user_asm_half(__gu_val, __gu_addr, err, __t); break;	\
 	case 4: __get_user_asm_word(__gu_val, __gu_addr, err, __t); break;	\
+	case 8: __get_user_asm_dword(__gu_val, __gu_addr, err, __t); break;	\
 	default: (__gu_val) = __get_user_bad();				\
 	}								\
 	uaccess_restore(__ua_flags);					\
@@ -353,6 +361,22 @@ do {									\
 #define __get_user_asm_word(x, addr, err, __t)				\
 	__get_user_asm(x, addr, err, "ldr" __t)
 
+#ifdef __ARMEB__
+#define __WORD0_OFFS	4
+#define __WORD1_OFFS	0
+#else
+#define __WORD0_OFFS	0
+#define __WORD1_OFFS	4
+#endif
+
+#define __get_user_asm_dword(x, addr, err, __t)				\
+	({								\
+	unsigned long __w0, __w1;					\
+	__get_user_asm(__w0, addr + __WORD0_OFFS, err, "ldr" __t);	\
+	__get_user_asm(__w1, addr + __WORD1_OFFS, err, "ldr" __t);	\
+	(x) = ((u64)__w1 << 32) | (u64) __w0;				\
+})
+
 #define __put_user_switch(x, ptr, __err, __fn)				\
 do {									\
 	const __typeof__(*(ptr)) __user *__pu_ptr = (ptr);		\
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b07e699aaa3c..fd09afae72a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -484,3 +484,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
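The asm/simd.h hunk above adds a scoped guard around kernel-mode NEON. The sketch below is purely illustrative and not part of this diff: the helper function, its name, and the scalar fallback are hypothetical, and it only shows how a caller might pair may_use_simd() with the new scoped_ksimd() so that kernel_neon_begin()/kernel_neon_end() are emitted automatically around the SIMD section.

// Hypothetical usage sketch of the new scoped_ksimd() guard (not in this diff).
#include <asm/simd.h>
#include <linux/types.h>

static void demo_xor_block(u8 *dst, const u8 *src, size_t len)
{
	size_t i;

	if (may_use_simd()) {
		/* The guard issues kernel_neon_begin()/kernel_neon_end(). */
		scoped_ksimd() {
			/* A real user would run NEON code here. */
			for (i = 0; i < len; i++)
				dst[i] ^= src[i];
		}
	} else {
		/* Scalar fallback when NEON cannot be used in this context. */
		for (i = 0; i < len; i++)
			dst[i] ^= src[i];
	}
}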
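The asm/uaccess.h hunk teaches __get_user() to fetch an 8-byte value on 32-bit ARM, built from two 4-byte loads whose order is picked by __WORD0_OFFS/__WORD1_OFFS according to endianness. A minimal, hedged usage sketch follows; the function and its caller are hypothetical and only illustrate that a u64 read from userspace now compiles through the new case 8 path.

// Hypothetical usage sketch of the 8-byte __get_user() case (not in this diff).
#include <linux/uaccess.h>
#include <linux/types.h>

static int demo_read_u64(const u64 __user *uptr, u64 *out)
{
	u64 val;

	/* __get_user() skips the access_ok() check, so do it explicitly. */
	if (!access_ok(uptr, sizeof(*uptr)))
		return -EFAULT;

	/* Expands to two "ldr"/"ldrt" accesses on 32-bit ARM. */
	if (__get_user(val, uptr))
		return -EFAULT;

	*out = val;
	return 0;
}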
