Diffstat (limited to 'arch/arm')
-rw-r--r-- | arch/arm/Kconfig | 2
-rw-r--r-- | arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts | 14
-rw-r--r-- | arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts | 4
-rw-r--r-- | arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts | 4
-rw-r--r-- | arch/arm/boot/dts/nxp/imx/imx6ul.dtsi | 2
-rw-r--r-- | arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts | 2
-rw-r--r-- | arch/arm/crypto/Kconfig | 16
-rw-r--r-- | arch/arm/crypto/Makefile | 2
-rw-r--r-- | arch/arm/crypto/blake2b-neon-core.S | 347
-rw-r--r-- | arch/arm/crypto/blake2b-neon-glue.c | 104
-rw-r--r-- | arch/arm/include/asm/simd.h | 7
-rw-r--r-- | arch/arm/include/asm/uaccess.h | 26
-rw-r--r-- | arch/arm/tools/syscall.tbl | 1
13 files changed, 55 insertions, 476 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2e3f93b690f4..4fb985b76e97 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -44,6 +44,8 @@ config ARM
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_USE_MEMTEST
+	# https://github.com/llvm/llvm-project/commit/d130f402642fba3d065aacb506cb061c899558de
+	select ARCH_USES_CFI_GENERIC_LLVM_PASS if CLANG_VERSION < 220000
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_GENERAL_HUGETLB
 	select ARCH_WANT_IPC_PARSE_VERSION
diff --git a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
index aa9576d8ab56..48ca25f57ef6 100644
--- a/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
+++ b/arch/arm/boot/dts/aspeed/aspeed-bmc-facebook-fuji-data64.dts
@@ -1254,3 +1254,17 @@
 	max-frequency = <25000000>;
 	bus-width = <4>;
 };
+
+/*
+ * FIXME: rgmii delay is introduced by MAC (configured in u-boot now)
+ * instead of PCB on fuji board, so the "phy-mode" should be updated to
+ * "rgmii-[tx|rx]id" when the aspeed-mac driver can handle the delay
+ * properly.
+ */
+&mac3 {
+	status = "okay";
+	phy-mode = "rgmii";
+	phy-handle = <&ethphy3>;
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_rgmii4_default>;
+};
diff --git a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
index ac44c745bdf8..a39a021a3910 100644
--- a/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
+++ b/arch/arm/boot/dts/broadcom/bcm47189-luxul-xap-1440.dts
@@ -55,8 +55,8 @@
 	mdio {
 		/delete-node/ switch@1e;
 
-		bcm54210e: ethernet-phy@0 {
-			reg = <0>;
+		bcm54210e: ethernet-phy@25 {
+			reg = <25>;
 		};
 	};
 };
diff --git a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
index 06545a6052f7..43ff5eafb2bb 100644
--- a/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
+++ b/arch/arm/boot/dts/nxp/imx/imx51-zii-rdu1.dts
@@ -259,7 +259,7 @@
 	pinctrl-0 = <&pinctrl_audmux>;
 	status = "okay";
 
-	ssi2 {
+	mux-ssi2 {
 		fsl,audmux-port = <1>;
 		fsl,port-config = <
 			(IMX_AUDMUX_V2_PTCR_SYN |
@@ -271,7 +271,7 @@
 		>;
 	};
 
-	aud3 {
+	mux-aud3 {
 		fsl,audmux-port = <2>;
 		fsl,port-config = <
 			IMX_AUDMUX_V2_PTCR_SYN
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
index 6de224dd2bb9..6eb80f867f50 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
+++ b/arch/arm/boot/dts/nxp/imx/imx6ul.dtsi
@@ -339,7 +339,7 @@
 	#sound-dai-cells = <0>;
 	compatible = "fsl,imx6ul-sai", "fsl,imx6sx-sai";
 	reg = <0x02030000 0x4000>;
-	interrupts = <GIC_SPI 24 IRQ_TYPE_LEVEL_HIGH>;
+	interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>;
 	clocks = <&clks IMX6UL_CLK_SAI3_IPG>,
 		 <&clks IMX6UL_CLK_SAI3>,
 		 <&clks IMX6UL_CLK_DUMMY>, <&clks IMX6UL_CLK_DUMMY>;
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
index 107b00b9a939..540642e99a41 100644
--- a/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
+++ b/arch/arm/boot/dts/nxp/imx/imx6ull-engicam-microgea-rmm.dts
@@ -136,7 +136,7 @@
 	interrupt-parent = <&gpio2>;
 	interrupts = <8 IRQ_TYPE_EDGE_FALLING>;
 	reset-gpios = <&gpio2 14 GPIO_ACTIVE_LOW>;
-	report-rate-hz = <6>;
+	report-rate-hz = <60>;
 	/* settings valid only for Hycon touchscreen */
 	touchscreen-size-x = <1280>;
 	touchscreen-size-y = <800>;
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index c436eec22d86..f30d743df264 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_BLAKE2B_NEON
-	tristate "Hash functions: BLAKE2b (NEON)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_BLAKE2B
-	help
-	  BLAKE2b cryptographic hash function (RFC 7693)
-
-	  Architecture: arm using
-	  - NEON (Advanced SIMD) extensions
-
-	  BLAKE2b digest algorithm optimized with ARM NEON instructions.
-	  On ARM processors that have NEON support but not the ARMv8
-	  Crypto Extensions, typically this BLAKE2b implementation is
-	  much faster than the SHA-2 family and slightly faster than
-	  SHA-1.
-
 config CRYPTO_AES_ARM
 	tristate "Ciphers: AES"
 	select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 6346a73effc0..86dd43313dbf 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -5,7 +5,6 @@
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
@@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 
 aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
-blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
 aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
diff --git a/arch/arm/crypto/blake2b-neon-core.S b/arch/arm/crypto/blake2b-neon-core.S
deleted file mode 100644
index 0406a186377f..000000000000
--- a/arch/arm/crypto/blake2b-neon-core.S
+++ /dev/null
@@ -1,347 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2b digest algorithm, NEON accelerated
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
-	.text
-	.fpu		neon
-
-	// The arguments to blake2b_compress_neon()
-	STATE		.req	r0
-	BLOCK		.req	r1
-	NBLOCKS		.req	r2
-	INC		.req	r3
-
-	// Pointers to the rotation tables
-	ROR24_TABLE	.req	r4
-	ROR16_TABLE	.req	r5
-
-	// The original stack pointer
-	ORIG_SP		.req	r6
-
-	// NEON registers which contain the message words of the current block.
-	// M_0-M_3 are occasionally used for other purposes too.
-	M_0		.req	d16
-	M_1		.req	d17
-	M_2		.req	d18
-	M_3		.req	d19
-	M_4		.req	d20
-	M_5		.req	d21
-	M_6		.req	d22
-	M_7		.req	d23
-	M_8		.req	d24
-	M_9		.req	d25
-	M_10		.req	d26
-	M_11		.req	d27
-	M_12		.req	d28
-	M_13		.req	d29
-	M_14		.req	d30
-	M_15		.req	d31
-
-	.align		4
-	// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
-	// instruction.  This is the most efficient way to implement these
-	// rotation amounts with NEON.  (On Cortex-A53 it's the same speed as
-	// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
-.Lror24_table:
-	.byte		3, 4, 5, 6, 7, 0, 1, 2
-.Lror16_table:
-	.byte		2, 3, 4, 5, 6, 7, 0, 1
-	// The BLAKE2b initialization vector
-.Lblake2b_IV:
-	.quad		0x6a09e667f3bcc908, 0xbb67ae8584caa73b
-	.quad		0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
-	.quad		0x510e527fade682d1, 0x9b05688c2b3e6c1f
-	.quad		0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
-
-// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
-// NEON registers q0-q7.  The message block is in q8..q15 (M_0-M_15).  The stack
-// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
-// (M_0-M_3), so that they can be reloaded if they are used as temporary
-// registers.  The macro arguments s0-s15 give the order in which the message
-// words are used in this round.  'final' is 1 if this is the final round.
-.macro	_blake2b_round	s0, s1, s2, s3, s4, s5, s6, s7, \
-			s8, s9, s10, s11, s12, s13, s14, s15, final=0
-
-	// Mix the columns:
-	// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
-	// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
-
-	// a += b + m[blake2b_sigma[r][2*i + 0]];
-	vadd.u64	q0, q0, q2
-	vadd.u64	q1, q1, q3
-	vadd.u64	d0, d0, M_\s0
-	vadd.u64	d1, d1, M_\s2
-	vadd.u64	d2, d2, M_\s4
-	vadd.u64	d3, d3, M_\s6
-
-	// d = ror64(d ^ a, 32);
-	veor		q6, q6, q0
-	veor		q7, q7, q1
-	vrev64.32	q6, q6
-	vrev64.32	q7, q7
-
-	// c += d;
-	vadd.u64	q4, q4, q6
-	vadd.u64	q5, q5, q7
-
-	// b = ror64(b ^ c, 24);
-	vld1.8		{M_0}, [ROR24_TABLE, :64]
-	veor		q2, q2, q4
-	veor		q3, q3, q5
-	vtbl.8		d4, {d4}, M_0
-	vtbl.8		d5, {d5}, M_0
-	vtbl.8		d6, {d6}, M_0
-	vtbl.8		d7, {d7}, M_0
-
-	// a += b + m[blake2b_sigma[r][2*i + 1]];
-	//
-	// M_0 got clobbered above, so we have to reload it if any of the four
-	// message words this step needs happens to be M_0.  Otherwise we don't
-	// need to reload it here, as it will just get clobbered again below.
-.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
-	vld1.8		{M_0}, [sp, :64]
-.endif
-	vadd.u64	q0, q0, q2
-	vadd.u64	q1, q1, q3
-	vadd.u64	d0, d0, M_\s1
-	vadd.u64	d1, d1, M_\s3
-	vadd.u64	d2, d2, M_\s5
-	vadd.u64	d3, d3, M_\s7
-
-	// d = ror64(d ^ a, 16);
-	vld1.8		{M_0}, [ROR16_TABLE, :64]
-	veor		q6, q6, q0
-	veor		q7, q7, q1
-	vtbl.8		d12, {d12}, M_0
-	vtbl.8		d13, {d13}, M_0
-	vtbl.8		d14, {d14}, M_0
-	vtbl.8		d15, {d15}, M_0
-
-	// c += d;
-	vadd.u64	q4, q4, q6
-	vadd.u64	q5, q5, q7
-
-	// b = ror64(b ^ c, 63);
-	//
-	// This rotation amount isn't a multiple of 8, so it has to be
-	// implemented using a pair of shifts, which requires temporary
-	// registers.  Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
-	veor		q8, q2, q4
-	veor		q9, q3, q5
-	vshr.u64	q2, q8, #63
-	vshr.u64	q3, q9, #63
-	vsli.u64	q2, q8, #1
-	vsli.u64	q3, q9, #1
-	vld1.8		{q8-q9}, [sp, :256]
-
-	// Mix the diagonals:
-	// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
-	// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
-	//
-	// There are two possible ways to do this: use 'vext' instructions to
-	// shift the rows of the matrix so that the diagonals become columns,
-	// and undo it afterwards; or just use 64-bit operations on 'd'
-	// registers instead of 128-bit operations on 'q' registers.  We use the
-	// latter approach, as it performs much better on Cortex-A7.
-
-	// a += b + m[blake2b_sigma[r][2*i + 0]];
-	vadd.u64	d0, d0, d5
-	vadd.u64	d1, d1, d6
-	vadd.u64	d2, d2, d7
-	vadd.u64	d3, d3, d4
-	vadd.u64	d0, d0, M_\s8
-	vadd.u64	d1, d1, M_\s10
-	vadd.u64	d2, d2, M_\s12
-	vadd.u64	d3, d3, M_\s14
-
-	// d = ror64(d ^ a, 32);
-	veor		d15, d15, d0
-	veor		d12, d12, d1
-	veor		d13, d13, d2
-	veor		d14, d14, d3
-	vrev64.32	d15, d15
-	vrev64.32	d12, d12
-	vrev64.32	d13, d13
-	vrev64.32	d14, d14
-
-	// c += d;
-	vadd.u64	d10, d10, d15
-	vadd.u64	d11, d11, d12
-	vadd.u64	d8, d8, d13
-	vadd.u64	d9, d9, d14
-
-	// b = ror64(b ^ c, 24);
-	vld1.8		{M_0}, [ROR24_TABLE, :64]
-	veor		d5, d5, d10
-	veor		d6, d6, d11
-	veor		d7, d7, d8
-	veor		d4, d4, d9
-	vtbl.8		d5, {d5}, M_0
-	vtbl.8		d6, {d6}, M_0
-	vtbl.8		d7, {d7}, M_0
-	vtbl.8		d4, {d4}, M_0
-
-	// a += b + m[blake2b_sigma[r][2*i + 1]];
-.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
-	vld1.8		{M_0}, [sp, :64]
-.endif
-	vadd.u64	d0, d0, d5
-	vadd.u64	d1, d1, d6
-	vadd.u64	d2, d2, d7
-	vadd.u64	d3, d3, d4
-	vadd.u64	d0, d0, M_\s9
-	vadd.u64	d1, d1, M_\s11
-	vadd.u64	d2, d2, M_\s13
-	vadd.u64	d3, d3, M_\s15
-
-	// d = ror64(d ^ a, 16);
-	vld1.8		{M_0}, [ROR16_TABLE, :64]
-	veor		d15, d15, d0
-	veor		d12, d12, d1
-	veor		d13, d13, d2
-	veor		d14, d14, d3
-	vtbl.8		d12, {d12}, M_0
-	vtbl.8		d13, {d13}, M_0
-	vtbl.8		d14, {d14}, M_0
-	vtbl.8		d15, {d15}, M_0
-
-	// c += d;
-	vadd.u64	d10, d10, d15
-	vadd.u64	d11, d11, d12
-	vadd.u64	d8, d8, d13
-	vadd.u64	d9, d9, d14
-
-	// b = ror64(b ^ c, 63);
-	veor		d16, d4, d9
-	veor		d17, d5, d10
-	veor		d18, d6, d11
-	veor		d19, d7, d8
-	vshr.u64	q2, q8, #63
-	vshr.u64	q3, q9, #63
-	vsli.u64	q2, q8, #1
-	vsli.u64	q3, q9, #1
-	// Reloading q8-q9 can be skipped on the final round.
-.if ! \final
-	vld1.8		{q8-q9}, [sp, :256]
-.endif
-.endm
-
-//
-// void blake2b_compress_neon(struct blake2b_state *state,
-//			      const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2b_state are used:
-//	u64 h[8];	(inout)
-//	u64 t[2];	(inout)
-//	u64 f[2];	(in)
-//
-	.align		5
-ENTRY(blake2b_compress_neon)
-	push		{r4-r10}
-
-	// Allocate a 32-byte stack buffer that is 32-byte aligned.
-	mov		ORIG_SP, sp
-	sub		ip, sp, #32
-	bic		ip, ip, #31
-	mov		sp, ip
-
-	adr		ROR24_TABLE, .Lror24_table
-	adr		ROR16_TABLE, .Lror16_table
-
-	mov		ip, STATE
-	vld1.64		{q0-q1}, [ip]!		// Load h[0..3]
-	vld1.64		{q2-q3}, [ip]!		// Load h[4..7]
-.Lnext_block:
-	adr		r10, .Lblake2b_IV
-	vld1.64		{q14-q15}, [ip]		// Load t[0..1] and f[0..1]
-	vld1.64		{q4-q5}, [r10]!		// Load IV[0..3]
-	vmov		r7, r8, d28		// Copy t[0] to (r7, r8)
-	vld1.64		{q6-q7}, [r10]		// Load IV[4..7]
-	adds		r7, r7, INC		// Increment counter
-	bcs		.Lslow_inc_ctr
-	vmov.i32	d28[0], r7
-	vst1.64		{d28}, [ip]		// Update t[0]
-.Linc_ctr_done:
-
-	// Load the next message block and finish initializing the state matrix
-	// 'v'.  Fortunately, there are exactly enough NEON registers to fit the
-	// entire state matrix in q0-q7 and the entire message block in q8-15.
-	//
-	// However, _blake2b_round also needs some extra registers for rotates,
-	// so we have to spill some registers.  It's better to spill the message
-	// registers than the state registers, as the message doesn't change.
-	// Therefore we store a copy of the first 32 bytes of the message block
-	// (q8-q9) in an aligned buffer on the stack so that they can be
-	// reloaded when needed.  (We could just reload directly from the
-	// message buffer, but it's faster to use aligned loads.)
-	vld1.8		{q8-q9}, [BLOCK]!
-	veor		q6, q6, q14	// v[12..13] = IV[4..5] ^ t[0..1]
-	vld1.8		{q10-q11}, [BLOCK]!
-	veor		q7, q7, q15	// v[14..15] = IV[6..7] ^ f[0..1]
-	vld1.8		{q12-q13}, [BLOCK]!
-	vst1.8		{q8-q9}, [sp, :256]
-	mov		ip, STATE
-	vld1.8		{q14-q15}, [BLOCK]!
-
-	// Execute the rounds.  Each round is provided the order in which it
-	// needs to use the message words.
-	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
-	_blake2b_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
-	_blake2b_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
-	_blake2b_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
-	_blake2b_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
-	_blake2b_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
-	_blake2b_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
-	_blake2b_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
-	_blake2b_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-	_blake2b_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-	_blake2b_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
-			final=1
-
-	// Fold the final state matrix into the hash chaining value:
-	//
-	//	for (i = 0; i < 8; i++)
-	//		h[i] ^= v[i] ^ v[i + 8];
-	//
-	vld1.64		{q8-q9}, [ip]!		// Load old h[0..3]
-	veor		q0, q0, q4		// v[0..1] ^= v[8..9]
-	veor		q1, q1, q5		// v[2..3] ^= v[10..11]
-	vld1.64		{q10-q11}, [ip]		// Load old h[4..7]
-	veor		q2, q2, q6		// v[4..5] ^= v[12..13]
-	veor		q3, q3, q7		// v[6..7] ^= v[14..15]
-	veor		q0, q0, q8		// v[0..1] ^= h[0..1]
-	veor		q1, q1, q9		// v[2..3] ^= h[2..3]
-	mov		ip, STATE
-	subs		NBLOCKS, NBLOCKS, #1	// nblocks--
-	vst1.64		{q0-q1}, [ip]!		// Store new h[0..3]
-	veor		q2, q2, q10		// v[4..5] ^= h[4..5]
-	veor		q3, q3, q11		// v[6..7] ^= h[6..7]
-	vst1.64		{q2-q3}, [ip]!		// Store new h[4..7]
-
-	// Advance to the next block, if there is one.
-	bne		.Lnext_block	// nblocks != 0?
-
-	mov		sp, ORIG_SP
-	pop		{r4-r10}
-	mov		pc, lr
-
-.Lslow_inc_ctr:
-	// Handle the case where the counter overflowed its low 32 bits, by
-	// carrying the overflow bit into the full 128-bit counter.
-	vmov		r9, r10, d29
-	adcs		r8, r8, #0
-	adcs		r9, r9, #0
-	adc		r10, r10, #0
-	vmov		d28, r7, r8
-	vmov		d29, r9, r10
-	vst1.64		{q14}, [ip]	// Update t[0] and t[1]
-	b		.Linc_ctr_done
-ENDPROC(blake2b_compress_neon)
diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c
deleted file mode 100644
index 2ff443a91724..000000000000
--- a/arch/arm/crypto/blake2b-neon-glue.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * BLAKE2b digest algorithm, NEON accelerated
- *
- * Copyright 2020 Google LLC
- */
-
-#include <crypto/internal/blake2b.h>
-#include <crypto/internal/hash.h>
-
-#include <linux/module.h>
-#include <linux/sizes.h>
-
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void blake2b_compress_neon(struct blake2b_state *state,
-				      const u8 *block, size_t nblocks, u32 inc);
-
-static void blake2b_compress_arch(struct blake2b_state *state,
-				  const u8 *block, size_t nblocks, u32 inc)
-{
-	do {
-		const size_t blocks = min_t(size_t, nblocks,
-					    SZ_4K / BLAKE2B_BLOCK_SIZE);
-
-		kernel_neon_begin();
-		blake2b_compress_neon(state, block, blocks, inc);
-		kernel_neon_end();
-
-		nblocks -= blocks;
-		block += blocks * BLAKE2B_BLOCK_SIZE;
-	} while (nblocks);
-}
-
-static int crypto_blake2b_update_neon(struct shash_desc *desc,
-				      const u8 *in, unsigned int inlen)
-{
-	return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch);
-}
-
-static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in,
-				     unsigned int inlen, u8 *out)
-{
-	return crypto_blake2b_finup(desc, in, inlen, out,
-				    blake2b_compress_arch);
-}
-
-#define BLAKE2B_ALG(name, driver_name, digest_size) \
-	{ \
-		.base.cra_name		= name, \
-		.base.cra_driver_name	= driver_name, \
-		.base.cra_priority	= 200, \
-		.base.cra_flags		= CRYPTO_ALG_OPTIONAL_KEY | \
-					  CRYPTO_AHASH_ALG_BLOCK_ONLY | \
-					  CRYPTO_AHASH_ALG_FINAL_NONZERO, \
-		.base.cra_blocksize	= BLAKE2B_BLOCK_SIZE, \
-		.base.cra_ctxsize	= sizeof(struct blake2b_tfm_ctx), \
-		.base.cra_module	= THIS_MODULE, \
-		.digestsize		= digest_size, \
-		.setkey			= crypto_blake2b_setkey, \
-		.init			= crypto_blake2b_init, \
-		.update			= crypto_blake2b_update_neon, \
-		.finup			= crypto_blake2b_finup_neon, \
-		.descsize		= sizeof(struct blake2b_state), \
-		.statesize		= BLAKE2B_STATE_SIZE, \
-	}
-
-static struct shash_alg blake2b_neon_algs[] = {
-	BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE),
-	BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE),
-};
-
-static int __init blake2b_neon_mod_init(void)
-{
-	if (!(elf_hwcap & HWCAP_NEON))
-		return -ENODEV;
-
-	return crypto_register_shashes(blake2b_neon_algs,
-				       ARRAY_SIZE(blake2b_neon_algs));
-}
-
-static void __exit blake2b_neon_mod_exit(void)
-{
-	crypto_unregister_shashes(blake2b_neon_algs,
-				  ARRAY_SIZE(blake2b_neon_algs));
-}
-
-module_init(blake2b_neon_mod_init);
-module_exit(blake2b_neon_mod_exit);
-
-MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("blake2b-160");
-MODULE_ALIAS_CRYPTO("blake2b-160-neon");
-MODULE_ALIAS_CRYPTO("blake2b-256");
-MODULE_ALIAS_CRYPTO("blake2b-256-neon");
-MODULE_ALIAS_CRYPTO("blake2b-384");
-MODULE_ALIAS_CRYPTO("blake2b-384-neon");
-MODULE_ALIAS_CRYPTO("blake2b-512");
-MODULE_ALIAS_CRYPTO("blake2b-512-neon");
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
index be08a8da046f..8549fa8b7253 100644
--- a/arch/arm/include/asm/simd.h
+++ b/arch/arm/include/asm/simd.h
@@ -2,14 +2,21 @@
 #ifndef _ASM_SIMD_H
 #define _ASM_SIMD_H
 
+#include <linux/cleanup.h>
 #include <linux/compiler_attributes.h>
 #include <linux/preempt.h>
 #include <linux/types.h>
 
+#include <asm/neon.h>
+
 static __must_check inline bool may_use_simd(void)
 {
 	return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	       !in_hardirq() && !irqs_disabled();
 }
 
+DEFINE_LOCK_GUARD_0(ksimd, kernel_neon_begin(), kernel_neon_end())
+
+#define scoped_ksimd()	scoped_guard(ksimd)
+
 #endif /* _ASM_SIMD_H */
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index f90be312418e..d6ae80b5df36 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -283,10 +283,17 @@ extern int __put_user_8(void *, unsigned long long);
 		__gu_err;						\
 	})
 
+/*
+ * This is a type: either unsigned long, if the argument fits into
+ * that type, or otherwise unsigned long long.
+ */
+#define __long_type(x) \
+	__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
+
 #define __get_user_err(x, ptr, err, __t)				\
 do {									\
 	unsigned long __gu_addr = (unsigned long)(ptr);			\
-	unsigned long __gu_val;						\
+	__long_type(x) __gu_val;					\
 	unsigned int __ua_flags;					\
 	__chk_user_ptr(ptr);						\
 	might_fault();							\
@@ -295,6 +302,7 @@ do {									\
 	case 1: __get_user_asm_byte(__gu_val, __gu_addr, err, __t); break;	\
 	case 2: __get_user_asm_half(__gu_val, __gu_addr, err, __t); break;	\
 	case 4: __get_user_asm_word(__gu_val, __gu_addr, err, __t); break;	\
+	case 8: __get_user_asm_dword(__gu_val, __gu_addr, err, __t); break;	\
 	default: (__gu_val) = __get_user_bad();				\
 	}								\
 	uaccess_restore(__ua_flags);					\
@@ -353,6 +361,22 @@ do {									\
 #define __get_user_asm_word(x, addr, err, __t)				\
 	__get_user_asm(x, addr, err, "ldr" __t)
 
+#ifdef __ARMEB__
+#define __WORD0_OFFS	4
+#define __WORD1_OFFS	0
+#else
+#define __WORD0_OFFS	0
+#define __WORD1_OFFS	4
+#endif
+
+#define __get_user_asm_dword(x, addr, err, __t)				\
+	({								\
+	unsigned long __w0, __w1;					\
+	__get_user_asm(__w0, addr + __WORD0_OFFS, err, "ldr" __t);	\
+	__get_user_asm(__w1, addr + __WORD1_OFFS, err, "ldr" __t);	\
+	(x) = ((u64)__w1 << 32) | (u64) __w0;				\
+})
+
 #define __put_user_switch(x, ptr, __err, __fn)				\
 do {									\
 	const __typeof__(*(ptr)) __user *__pu_ptr = (ptr);		\
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b07e699aaa3c..fd09afae72a2 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -484,3 +484,4 @@
 467	common	open_tree_attr			sys_open_tree_attr
 468	common	file_getattr			sys_file_getattr
 469	common	file_setattr			sys_file_setattr
+470	common	listns				sys_listns
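The asm/simd.h hunk above adds a scoped guard around kernel-mode NEON. The sketch below is purely illustrative and not part of this diff: the helper function, its name, and the scalar fallback are hypothetical, and it only shows how a caller might pair may_use_simd() with the new scoped_ksimd() so that kernel_neon_begin()/kernel_neon_end() are emitted automatically around the SIMD section.

// Hypothetical usage sketch of the new scoped_ksimd() guard (not in this diff).
#include <asm/simd.h>
#include <linux/types.h>

static void demo_xor_block(u8 *dst, const u8 *src, size_t len)
{
	size_t i;

	if (may_use_simd()) {
		/* The guard issues kernel_neon_begin()/kernel_neon_end(). */
		scoped_ksimd() {
			/* A real user would run NEON code here. */
			for (i = 0; i < len; i++)
				dst[i] ^= src[i];
		}
	} else {
		/* Scalar fallback when NEON cannot be used in this context. */
		for (i = 0; i < len; i++)
			dst[i] ^= src[i];
	}
}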
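The asm/uaccess.h hunk teaches __get_user() to fetch an 8-byte value on 32-bit ARM, built from two 4-byte loads whose order is picked by __WORD0_OFFS/__WORD1_OFFS according to endianness. A minimal, hedged usage sketch follows; the function and its caller are hypothetical and only illustrate that a u64 read from userspace now compiles through the new case 8 path.

// Hypothetical usage sketch of the 8-byte __get_user() case (not in this diff).
#include <linux/uaccess.h>
#include <linux/types.h>

static int demo_read_u64(const u64 __user *uptr, u64 *out)
{
	u64 val;

	/* __get_user() skips the access_ok() check, so do it explicitly. */
	if (!access_ok(uptr, sizeof(*uptr)))
		return -EFAULT;

	/* Expands to two "ldr"/"ldrt" accesses on 32-bit ARM. */
	if (__get_user(val, uptr))
		return -EFAULT;

	*out = val;
	return 0;
}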
