Diffstat (limited to 'arch/powerpc/lib')
 arch/powerpc/lib/Makefile                                                                  |    6
 arch/powerpc/lib/crc-t10dif.c (renamed from arch/powerpc/lib/crc-t10dif-glue.c)            |   18
 arch/powerpc/lib/crc-vpmsum-template.S (renamed from arch/powerpc/lib/crc32-vpmsum_core.S) |    0
 arch/powerpc/lib/crc32.c (renamed from arch/powerpc/lib/crc32-glue.c)                      |   17
 arch/powerpc/lib/crc32c-vpmsum_asm.S                                                       |    2
 arch/powerpc/lib/crct10dif-vpmsum_asm.S                                                    |    2
 arch/powerpc/lib/crypto/Kconfig                                                            |   22
 arch/powerpc/lib/crypto/Makefile                                                           |   10
 arch/powerpc/lib/crypto/chacha-p10-glue.c                                                  |  100
 arch/powerpc/lib/crypto/chacha-p10le-8x.S                                                  |  840
 arch/powerpc/lib/crypto/poly1305-p10-glue.c                                                |   96
 arch/powerpc/lib/crypto/poly1305-p10le_64.S                                                | 1075
 arch/powerpc/lib/crypto/sha256-spe-asm.S                                                   |  318
 arch/powerpc/lib/crypto/sha256.c                                                           |   70
 arch/powerpc/lib/vmx-helper.c                                                              |    2

 15 files changed, 2556 insertions, 22 deletions
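
The files below wire Power10 (P10) and SPE implementations into the kernel's generic crypto library interfaces, so existing callers of the lib API pick up the acceleration transparently. As a rough orientation, a hypothetical in-kernel consumer of the ChaCha library API could look like the sketch below; it is not part of this diff, the chacha_init()/chacha_crypt() names follow <crypto/chacha.h> and their exact signatures vary by kernel version, and the key/IV values are placeholders.

#include <crypto/chacha.h>
#include <linux/string.h>

/*
 * Hypothetical consumer sketch, not part of this diff. With
 * CONFIG_CRYPTO_CHACHA20_P10=y, chacha_crypt() dispatches to the
 * chacha_crypt_arch() exported below on Power10 hardware and falls
 * back to the generic C implementation elsewhere.
 */
static void example_chacha20(u8 *buf, unsigned int len,
			     const u32 key[8], const u8 iv[16])
{
	struct chacha_state state;

	chacha_init(&state, key, iv);		/* 256-bit key, 128-bit counter+nonce */
	chacha_crypt(&state, buf, buf, len, 20);	/* ChaCha20 = 20 rounds */
	memzero_explicit(&state, sizeof(state));	/* wipe key stream state */
}
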
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index dd8a4b52a0cc..481f968e42c7 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -3,6 +3,8 @@ # Makefile for ppc-specific library files.. # +obj-y += crypto/ + CFLAGS_code-patching.o += -fno-stack-protector CFLAGS_feature-fixups.o += -fno-stack-protector @@ -79,9 +81,9 @@ CFLAGS_xor_vmx.o += -mhard-float -maltivec $(call cc-option,-mabi=altivec) CFLAGS_xor_vmx.o += -isystem $(shell $(CC) -print-file-name=include) obj-$(CONFIG_CRC32_ARCH) += crc32-powerpc.o -crc32-powerpc-y := crc32-glue.o crc32c-vpmsum_asm.o +crc32-powerpc-y := crc32.o crc32c-vpmsum_asm.o obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-powerpc.o -crc-t10dif-powerpc-y := crc-t10dif-glue.o crct10dif-vpmsum_asm.o +crc-t10dif-powerpc-y := crc-t10dif.o crct10dif-vpmsum_asm.o obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/crc-t10dif-glue.c b/arch/powerpc/lib/crc-t10dif.c index f411b0120cc5..be23ded3a9df 100644 --- a/arch/powerpc/lib/crc-t10dif-glue.c +++ b/arch/powerpc/lib/crc-t10dif.c @@ -6,22 +6,22 @@ * [based on crc32c-vpmsum_glue.c] */ -#include <linux/crc-t10dif.h> +#include <asm/switch_to.h> #include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/kernel.h> #include <linux/cpufeature.h> -#include <asm/simd.h> -#include <asm/switch_to.h> +#include <linux/crc-t10dif.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/uaccess.h> #define VMX_ALIGN 16 #define VMX_ALIGN_MASK (VMX_ALIGN-1) #define VECTOR_BREAKPOINT 64 -static DEFINE_STATIC_KEY_FALSE(have_vec_crypto); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); u32 __crct10dif_vpmsum(u32 crc, unsigned char const *p, size_t len); @@ -71,7 +71,7 @@ static int __init crc_t10dif_powerpc_init(void) static_branch_enable(&have_vec_crypto); return 0; } -arch_initcall(crc_t10dif_powerpc_init); +subsys_initcall(crc_t10dif_powerpc_init); static void __exit crc_t10dif_powerpc_exit(void) { diff --git a/arch/powerpc/lib/crc32-vpmsum_core.S b/arch/powerpc/lib/crc-vpmsum-template.S index b0f87f595b26..b0f87f595b26 100644 --- a/arch/powerpc/lib/crc32-vpmsum_core.S +++ b/arch/powerpc/lib/crc-vpmsum-template.S diff --git a/arch/powerpc/lib/crc32-glue.c b/arch/powerpc/lib/crc32.c index dbd10f339183..0d9befb6e7b8 100644 --- a/arch/powerpc/lib/crc32-glue.c +++ b/arch/powerpc/lib/crc32.c @@ -1,19 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/crc32.h> +#include <asm/switch_to.h> #include <crypto/internal/simd.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kernel.h> #include <linux/cpufeature.h> -#include <asm/simd.h> -#include <asm/switch_to.h> +#include <linux/crc32.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/uaccess.h> #define VMX_ALIGN 16 #define VMX_ALIGN_MASK (VMX_ALIGN-1) #define VECTOR_BREAKPOINT 512 -static DEFINE_STATIC_KEY_FALSE(have_vec_crypto); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vec_crypto); u32 __crc32c_vpmsum(u32 crc, const u8 *p, size_t len); @@ -72,7 +73,7 @@ static int __init crc32_powerpc_init(void) static_branch_enable(&have_vec_crypto); return 0; } -arch_initcall(crc32_powerpc_init); +subsys_initcall(crc32_powerpc_init); static void __exit crc32_powerpc_exit(void) { diff --git a/arch/powerpc/lib/crc32c-vpmsum_asm.S b/arch/powerpc/lib/crc32c-vpmsum_asm.S 
index bf442004ea1f..1b35c55cce0a 100644 --- a/arch/powerpc/lib/crc32c-vpmsum_asm.S +++ b/arch/powerpc/lib/crc32c-vpmsum_asm.S @@ -839,4 +839,4 @@ #define CRC_FUNCTION_NAME __crc32c_vpmsum #define REFLECT -#include "crc32-vpmsum_core.S" +#include "crc-vpmsum-template.S" diff --git a/arch/powerpc/lib/crct10dif-vpmsum_asm.S b/arch/powerpc/lib/crct10dif-vpmsum_asm.S index f0b93a0fe168..47a6266d89a8 100644 --- a/arch/powerpc/lib/crct10dif-vpmsum_asm.S +++ b/arch/powerpc/lib/crct10dif-vpmsum_asm.S @@ -842,4 +842,4 @@ .octa 0x0000000000000000000000018bb70000 #define CRC_FUNCTION_NAME __crct10dif_vpmsum -#include "crc32-vpmsum_core.S" +#include "crc-vpmsum-template.S" diff --git a/arch/powerpc/lib/crypto/Kconfig b/arch/powerpc/lib/crypto/Kconfig new file mode 100644 index 000000000000..3f9e1bbd9905 --- /dev/null +++ b/arch/powerpc/lib/crypto/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA20_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + default CRYPTO_LIB_CHACHA + select CRYPTO_LIB_CHACHA_GENERIC + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_P10 + tristate + depends on PPC64 && CPU_LITTLE_ENDIAN && VSX + depends on BROKEN # Needs to be fixed to work in softirq context + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 + select CRYPTO_LIB_POLY1305_GENERIC + +config CRYPTO_SHA256_PPC_SPE + tristate + depends on SPE + default CRYPTO_LIB_SHA256 + select CRYPTO_ARCH_HAVE_LIB_SHA256 diff --git a/arch/powerpc/lib/crypto/Makefile b/arch/powerpc/lib/crypto/Makefile new file mode 100644 index 000000000000..27f231f8e334 --- /dev/null +++ b/arch/powerpc/lib/crypto/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA20_P10) += chacha-p10-crypto.o +chacha-p10-crypto-y := chacha-p10-glue.o chacha-p10le-8x.o + +obj-$(CONFIG_CRYPTO_POLY1305_P10) += poly1305-p10-crypto.o +poly1305-p10-crypto-y := poly1305-p10-glue.o poly1305-p10le_64.o + +obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o +sha256-ppc-spe-y := sha256.o sha256-spe-asm.o diff --git a/arch/powerpc/lib/crypto/chacha-p10-glue.c b/arch/powerpc/lib/crypto/chacha-p10-glue.c new file mode 100644 index 000000000000..fcd23c6f1590 --- /dev/null +++ b/arch/powerpc/lib/crypto/chacha-p10-glue.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * ChaCha stream cipher (P10 accelerated) + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ + +#include <crypto/chacha.h> +#include <crypto/internal/simd.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/cpufeature.h> +#include <linux/sizes.h> +#include <asm/simd.h> +#include <asm/switch_to.h> + +asmlinkage void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, + const u8 *src, unsigned int len, int nrounds); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +static void chacha_p10_do_8x(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + unsigned int l = bytes & ~0x0FF; + + if (l > 0) { + chacha_p10le_8x(state, dst, src, l, nrounds); + bytes -= l; + src += l; + dst += l; + state->x[12] += l / CHACHA_BLOCK_SIZE; + } + + if (bytes > 0) + chacha_crypt_generic(state, dst, src, bytes, nrounds); +} + +void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds) +{ + hchacha_block_generic(state, out, nrounds); +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + if (!static_branch_likely(&have_p10) || bytes <= CHACHA_BLOCK_SIZE || + !crypto_simd_usable()) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + do { + unsigned int todo = min_t(unsigned int, bytes, SZ_4K); + + vsx_begin(); + chacha_p10_do_8x(state, dst, src, todo, nrounds); + vsx_end(); + + bytes -= todo; + src += todo; + dst += todo; + } while (bytes); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +bool chacha_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +static int __init chacha_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(chacha_p10_init); + +static void __exit chacha_p10_exit(void) +{ +} +module_exit(chacha_p10_exit); + +MODULE_DESCRIPTION("ChaCha stream cipher (P10 accelerated)"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/powerpc/lib/crypto/chacha-p10le-8x.S b/arch/powerpc/lib/crypto/chacha-p10le-8x.S new file mode 100644 index 000000000000..b29562bd5d40 --- /dev/null +++ b/arch/powerpc/lib/crypto/chacha-p10le-8x.S @@ -0,0 +1,840 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated chacha20 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# do rounds, 8 quarter rounds +# 1. a += b; d ^= a; d <<<= 16; +# 2. c += d; b ^= c; b <<<= 12; +# 3. a += b; d ^= a; d <<<= 8; +# 4. c += d; b ^= c; b <<<= 7 +# +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 16 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 12 +# row1 = (row1 + row2), row4 = row1 xor row4, row4 rotate each word by 8 +# row3 = (row3 + row4), row2 = row3 xor row2, row2 rotate each word by 7 +# +# 4 blocks (a b c d) +# +# a0 b0 c0 d0 +# a1 b1 c1 d1 +# ... +# a4 b4 c4 d4 +# ... +# a8 b8 c8 d8 +# ... +# a12 b12 c12 d12 +# a13 ... +# a14 ... 
+# a15 b15 c15 d15 +# +# Column round (v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) +# Diagnal round (v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) +# + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +.macro QT_loop_8x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 
+ vadduwm 19, 19, 23 + + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 4, 4, 25 # + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 20, 20, 25 # + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + xxlor 32+25, 0, 0 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vadduwm 16, 16, 20 + vadduwm 17, 17, 21 + vadduwm 18, 18, 22 + vadduwm 19, 19, 23 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 12, 12, 0, 25 + vpermxor 13, 13, 1, 25 + vpermxor 14, 14, 2, 25 + vpermxor 15, 15, 3, 25 + vpermxor 28, 28, 16, 25 + vpermxor 29, 29, 17, 25 + vpermxor 30, 30, 18, 25 + vpermxor 31, 31, 19, 25 + xxlor 32+25, 0, 0 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vadduwm 24, 24, 28 + vadduwm 25, 25, 29 + vadduwm 26, 26, 30 + vadduwm 27, 27, 31 + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vxor 20, 20, 24 + vxor 21, 21, 25 + vxor 22, 22, 26 + vxor 23, 23, 27 + vrlw 4, 4, 28 # + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 20, 20, 28 # + vrlw 21, 21, 28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + xxlor 32+28, 0, 0 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + xxlor 0, 32+25, 32+25 + xxlor 32+25, 20, 20 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + + xxlor 32+25, 0, 0 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 21, 21 + vrlw 5, 5, 25 + vrlw 6, 6, 25 + vrlw 7, 7, 25 + vrlw 4, 4, 25 + vrlw 21, 21, 25 + vrlw 22, 22, 25 + vrlw 23, 23, 25 + vrlw 20, 20, 25 + xxlor 32+25, 0, 0 + + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vadduwm 16, 16, 21 + vadduwm 17, 17, 22 + vadduwm 18, 18, 23 + vadduwm 19, 19, 20 + + xxlor 0, 32+25, 32+25 + xxlor 32+25, 22, 22 + vpermxor 15, 15, 0, 25 + vpermxor 12, 12, 1, 25 + vpermxor 13, 13, 2, 25 + vpermxor 14, 14, 3, 25 + vpermxor 31, 31, 16, 25 + vpermxor 28, 28, 17, 25 + vpermxor 29, 29, 18, 25 + vpermxor 30, 30, 19, 25 + xxlor 32+25, 0, 0 + + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vadduwm 26, 26, 31 + vadduwm 27, 27, 28 + vadduwm 24, 24, 29 + vadduwm 25, 25, 30 + + xxlor 0, 32+28, 32+28 + xxlor 32+28, 23, 23 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vxor 21, 21, 26 + vxor 22, 22, 27 + vxor 23, 23, 24 + vxor 20, 20, 25 + vrlw 5, 5, 28 + vrlw 6, 6, 28 + vrlw 7, 7, 28 + vrlw 4, 4, 28 + vrlw 21, 21, 
28 + vrlw 22, 22, 28 + vrlw 23, 23, 28 + vrlw 20, 20, 28 + xxlor 32+28, 0, 0 +.endm + +.macro QT_loop_4x + # QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 20 + vpermxor 13, 13, 1, 20 + vpermxor 14, 14, 2, 20 + vpermxor 15, 15, 3, 20 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 21 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vadduwm 0, 0, 4 + vadduwm 1, 1, 5 + vadduwm 2, 2, 6 + vadduwm 3, 3, 7 + vpermxor 12, 12, 0, 22 + vpermxor 13, 13, 1, 22 + vpermxor 14, 14, 2, 22 + vpermxor 15, 15, 3, 22 + vadduwm 8, 8, 12 + vadduwm 9, 9, 13 + vadduwm 10, 10, 14 + vadduwm 11, 11, 15 + vxor 4, 4, 8 + vxor 5, 5, 9 + vxor 6, 6, 10 + vxor 7, 7, 11 + vrlw 4, 4, 23 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + + # QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 20 + vpermxor 12, 12, 1, 20 + vpermxor 13, 13, 2, 20 + vpermxor 14, 14, 3, 20 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 21 + vrlw 6, 6, 21 + vrlw 7, 7, 21 + vrlw 4, 4, 21 + vadduwm 0, 0, 5 + vadduwm 1, 1, 6 + vadduwm 2, 2, 7 + vadduwm 3, 3, 4 + vpermxor 15, 15, 0, 22 + vpermxor 12, 12, 1, 22 + vpermxor 13, 13, 2, 22 + vpermxor 14, 14, 3, 22 + vadduwm 10, 10, 15 + vadduwm 11, 11, 12 + vadduwm 8, 8, 13 + vadduwm 9, 9, 14 + vxor 5, 5, 10 + vxor 6, 6, 11 + vxor 7, 7, 8 + vxor 4, 4, 9 + vrlw 5, 5, 23 + vrlw 6, 6, 23 + vrlw 7, 7, 23 + vrlw 4, 4, 23 +.endm + +# Transpose +.macro TP_4x a0 a1 a2 a3 + xxmrghw 10, 32+\a0, 32+\a1 # a0, a1, b0, b1 + xxmrghw 11, 32+\a2, 32+\a3 # a2, a3, b2, b3 + xxmrglw 12, 32+\a0, 32+\a1 # c0, c1, d0, d1 + xxmrglw 13, 32+\a2, 32+\a3 # c2, c3, d2, d3 + xxpermdi 32+\a0, 10, 11, 0 # a0, a1, a2, a3 + xxpermdi 32+\a1, 10, 11, 3 # b0, b1, b2, b3 + xxpermdi 32+\a2, 12, 13, 0 # c0, c1, c2, c3 + xxpermdi 32+\a3, 12, 13, 3 # d0, d1, d2, d3 +.endm + +# key stream = working state + state +.macro Add_state S + vadduwm \S+0, \S+0, 16-\S + vadduwm \S+4, \S+4, 17-\S + vadduwm \S+8, \S+8, 18-\S + vadduwm \S+12, \S+12, 19-\S + + vadduwm \S+1, \S+1, 16-\S + vadduwm \S+5, \S+5, 17-\S + vadduwm \S+9, \S+9, 18-\S + vadduwm \S+13, \S+13, 19-\S + + vadduwm \S+2, \S+2, 16-\S + vadduwm \S+6, \S+6, 17-\S + vadduwm \S+10, \S+10, 18-\S + vadduwm \S+14, \S+14, 19-\S + + vadduwm \S+3, \S+3, 16-\S + vadduwm \S+7, \S+7, 17-\S + vadduwm \S+11, \S+11, 18-\S + vadduwm \S+15, \S+15, 19-\S +.endm + +# +# write 256 bytes +# +.macro Write_256 S + add 9, 14, 5 + add 16, 14, 4 + lxvw4x 0, 0, 9 + lxvw4x 1, 17, 9 + lxvw4x 2, 18, 9 + lxvw4x 3, 19, 9 + lxvw4x 4, 20, 9 + lxvw4x 5, 21, 9 + lxvw4x 6, 22, 9 + lxvw4x 7, 23, 9 + lxvw4x 8, 24, 9 + lxvw4x 9, 25, 9 + lxvw4x 10, 26, 9 + lxvw4x 11, 27, 9 + lxvw4x 12, 28, 9 + lxvw4x 13, 29, 9 + lxvw4x 14, 30, 9 + lxvw4x 15, 31, 9 + + xxlxor \S+32, \S+32, 0 + xxlxor \S+36, \S+36, 1 + xxlxor \S+40, \S+40, 2 + xxlxor \S+44, \S+44, 3 + xxlxor \S+33, \S+33, 4 + xxlxor \S+37, \S+37, 5 + xxlxor \S+41, \S+41, 6 + xxlxor \S+45, \S+45, 7 + xxlxor \S+34, \S+34, 8 + xxlxor \S+38, \S+38, 9 + xxlxor \S+42, \S+42, 10 + xxlxor \S+46, \S+46, 11 + xxlxor \S+35, \S+35, 12 + xxlxor \S+39, \S+39, 13 + xxlxor \S+43, \S+43, 14 + xxlxor \S+47, \S+47, 15 + + stxvw4x \S+32, 0, 16 + stxvw4x \S+36, 17, 
16 + stxvw4x \S+40, 18, 16 + stxvw4x \S+44, 19, 16 + + stxvw4x \S+33, 20, 16 + stxvw4x \S+37, 21, 16 + stxvw4x \S+41, 22, 16 + stxvw4x \S+45, 23, 16 + + stxvw4x \S+34, 24, 16 + stxvw4x \S+38, 25, 16 + stxvw4x \S+42, 26, 16 + stxvw4x \S+46, 27, 16 + + stxvw4x \S+35, 28, 16 + stxvw4x \S+39, 29, 16 + stxvw4x \S+43, 30, 16 + stxvw4x \S+47, 31, 16 + +.endm + +# +# void chacha_p10le_8x(const struct chacha_state *state, u8 *dst, const u8 *src, +# unsigned int len, int nrounds); +# +SYM_FUNC_START(chacha_p10le_8x) +.align 5 + cmpdi 6, 0 + ble Out_no_chacha + + SAVE_REGS + + # r17 - r31 mainly for Write_256 macro. + li 17, 16 + li 18, 32 + li 19, 48 + li 20, 64 + li 21, 80 + li 22, 96 + li 23, 112 + li 24, 128 + li 25, 144 + li 26, 160 + li 27, 176 + li 28, 192 + li 29, 208 + li 30, 224 + li 31, 240 + + mr 15, 6 # len + li 14, 0 # offset to inp and outp + + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + # create (0, 1, 2, 3) counters + vspltisw 0, 0 + vspltisw 1, 1 + vspltisw 2, 2 + vspltisw 3, 3 + vmrghw 4, 0, 1 + vmrglw 5, 2, 3 + vsldoi 30, 4, 5, 8 # vr30 counter, 4 (0, 1, 2, 3) + + vspltisw 21, 12 + vspltisw 23, 7 + + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + + mtctr 8 + + # save constants to vsx + xxlor 16, 48, 48 + xxlor 17, 49, 49 + xxlor 18, 50, 50 + xxlor 19, 51, 51 + + vspltisw 25, 4 + vspltisw 26, 8 + + xxlor 25, 32+26, 32+26 + xxlor 24, 32+25, 32+25 + + vadduwm 31, 30, 25 # counter = (0, 1, 2, 3) + (4, 4, 4, 4) + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + xxlor 20, 32+20, 32+20 + xxlor 21, 32+21, 32+21 + xxlor 22, 32+22, 32+22 + xxlor 23, 32+23, 32+23 + + cmpdi 6, 512 + blt Loop_last + +Loop_8x: + xxspltw 32+0, 16, 0 + xxspltw 32+1, 16, 1 + xxspltw 32+2, 16, 2 + xxspltw 32+3, 16, 3 + + xxspltw 32+4, 17, 0 + xxspltw 32+5, 17, 1 + xxspltw 32+6, 17, 2 + xxspltw 32+7, 17, 3 + xxspltw 32+8, 18, 0 + xxspltw 32+9, 18, 1 + xxspltw 32+10, 18, 2 + xxspltw 32+11, 18, 3 + xxspltw 32+12, 19, 0 + xxspltw 32+13, 19, 1 + xxspltw 32+14, 19, 2 + xxspltw 32+15, 19, 3 + vadduwm 12, 12, 30 # increase counter + + xxspltw 32+16, 16, 0 + xxspltw 32+17, 16, 1 + xxspltw 32+18, 16, 2 + xxspltw 32+19, 16, 3 + + xxspltw 32+20, 17, 0 + xxspltw 32+21, 17, 1 + xxspltw 32+22, 17, 2 + xxspltw 32+23, 17, 3 + xxspltw 32+24, 18, 0 + xxspltw 32+25, 18, 1 + xxspltw 32+26, 18, 2 + xxspltw 32+27, 18, 3 + xxspltw 32+28, 19, 0 + xxspltw 32+29, 19, 1 + vadduwm 28, 28, 31 # increase counter + xxspltw 32+30, 19, 2 + xxspltw 32+31, 19, 3 + +.align 5 +quarter_loop_8x: + QT_loop_8x + + bdnz quarter_loop_8x + + xxlor 0, 32+30, 32+30 + xxlor 32+30, 30, 30 + vadduwm 12, 12, 30 + xxlor 32+30, 0, 0 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + xxlor 0, 48, 48 + xxlor 1, 49, 49 + xxlor 2, 50, 50 + xxlor 3, 51, 51 + xxlor 48, 16, 16 + xxlor 49, 17, 17 + xxlor 50, 18, 18 + xxlor 51, 19, 19 + Add_state 0 + xxlor 48, 0, 0 + xxlor 49, 1, 1 + xxlor 50, 2, 2 + xxlor 51, 3, 3 + Write_256 0 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # len -=256 + + xxlor 5, 32+31, 32+31 + xxlor 32+31, 31, 31 + vadduwm 28, 28, 31 + xxlor 32+31, 5, 5 + TP_4x 16+0, 16+1, 16+2, 16+3 + TP_4x 16+4, 16+5, 16+6, 16+7 + TP_4x 16+8, 16+9, 16+10, 16+11 + TP_4x 16+12, 16+13, 16+14, 16+15 + + xxlor 32, 16, 16 + xxlor 33, 17, 17 + xxlor 34, 18, 18 + xxlor 35, 19, 19 + Add_state 16 + Write_256 16 + addi 14, 14, 256 # offset +=256 + addi 15, 15, -256 # 
len +=256 + + xxlor 32+24, 24, 24 + xxlor 32+25, 25, 25 + xxlor 32+30, 30, 30 + vadduwm 30, 30, 25 + vadduwm 31, 30, 24 + xxlor 30, 32+30, 32+30 + xxlor 31, 32+31, 32+31 + + cmpdi 15, 0 + beq Out_loop + + cmpdi 15, 512 + blt Loop_last + + mtctr 8 + b Loop_8x + +Loop_last: + lxvw4x 48, 0, 3 # vr16, constants + lxvw4x 49, 17, 3 # vr17, key 1 + lxvw4x 50, 18, 3 # vr18, key 2 + lxvw4x 51, 19, 3 # vr19, counter, nonce + + vspltisw 21, 12 + vspltisw 23, 7 + addis 11, 2, permx@toc@ha + addi 11, 11, permx@toc@l + lxvw4x 32+20, 0, 11 + lxvw4x 32+22, 17, 11 + + sradi 8, 7, 1 + mtctr 8 + +Loop_4x: + vspltw 0, 16, 0 + vspltw 1, 16, 1 + vspltw 2, 16, 2 + vspltw 3, 16, 3 + + vspltw 4, 17, 0 + vspltw 5, 17, 1 + vspltw 6, 17, 2 + vspltw 7, 17, 3 + vspltw 8, 18, 0 + vspltw 9, 18, 1 + vspltw 10, 18, 2 + vspltw 11, 18, 3 + vspltw 12, 19, 0 + vadduwm 12, 12, 30 # increase counter + vspltw 13, 19, 1 + vspltw 14, 19, 2 + vspltw 15, 19, 3 + +.align 5 +quarter_loop: + QT_loop_4x + + bdnz quarter_loop + + vadduwm 12, 12, 30 + TP_4x 0, 1, 2, 3 + TP_4x 4, 5, 6, 7 + TP_4x 8, 9, 10, 11 + TP_4x 12, 13, 14, 15 + + Add_state 0 + Write_256 0 + addi 14, 14, 256 # offset += 256 + addi 15, 15, -256 # len += 256 + + # Update state counter + vspltisw 25, 4 + vadduwm 30, 30, 25 + + cmpdi 15, 0 + beq Out_loop + cmpdi 15, 256 + blt Out_loop + + mtctr 8 + b Loop_4x + +Out_loop: + RESTORE_REGS + blr + +Out_no_chacha: + li 3, 0 + blr +SYM_FUNC_END(chacha_p10le_8x) + +SYM_DATA_START_LOCAL(PERMX) +.align 5 +permx: +.long 0x22330011, 0x66774455, 0xaabb8899, 0xeeffccdd +.long 0x11223300, 0x55667744, 0x99aabb88, 0xddeeffcc +SYM_DATA_END(PERMX) diff --git a/arch/powerpc/lib/crypto/poly1305-p10-glue.c b/arch/powerpc/lib/crypto/poly1305-p10-glue.c new file mode 100644 index 000000000000..3f1664a724b6 --- /dev/null +++ b/arch/powerpc/lib/crypto/poly1305-p10-glue.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Poly1305 authenticator algorithm, RFC7539. + * + * Copyright 2023- IBM Corp. All rights reserved. 
+ */ +#include <asm/switch_to.h> +#include <crypto/internal/poly1305.h> +#include <linux/cpufeature.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/unaligned.h> + +asmlinkage void poly1305_p10le_4blocks(struct poly1305_block_state *state, const u8 *m, u32 mlen); +asmlinkage void poly1305_64s(struct poly1305_block_state *state, const u8 *m, u32 mlen, int highbit); +asmlinkage void poly1305_emit_64(const struct poly1305_state *state, const u32 nonce[4], u8 digest[POLY1305_DIGEST_SIZE]); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_p10); + +static void vsx_begin(void) +{ + preempt_disable(); + enable_kernel_vsx(); +} + +static void vsx_end(void) +{ + disable_kernel_vsx(); + preempt_enable(); +} + +void poly1305_block_init_arch(struct poly1305_block_state *dctx, + const u8 raw_key[POLY1305_BLOCK_SIZE]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_block_init_generic(dctx, raw_key); + + dctx->h = (struct poly1305_state){}; + dctx->core_r.key.r64[0] = get_unaligned_le64(raw_key + 0); + dctx->core_r.key.r64[1] = get_unaligned_le64(raw_key + 8); +} +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); + +void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, + unsigned int len, u32 padbit) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_blocks_generic(state, src, len, padbit); + vsx_begin(); + if (len >= POLY1305_BLOCK_SIZE * 4) { + poly1305_p10le_4blocks(state, src, len); + src += len - (len % (POLY1305_BLOCK_SIZE * 4)); + len %= POLY1305_BLOCK_SIZE * 4; + } + while (len >= POLY1305_BLOCK_SIZE) { + poly1305_64s(state, src, POLY1305_BLOCK_SIZE, padbit); + len -= POLY1305_BLOCK_SIZE; + src += POLY1305_BLOCK_SIZE; + } + vsx_end(); +} +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); + +void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]) +{ + if (!static_key_enabled(&have_p10)) + return poly1305_emit_generic(state, digest, nonce); + poly1305_emit_64(state, nonce, digest); +} +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return static_key_enabled(&have_p10); +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +static int __init poly1305_p10_init(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_31)) + static_branch_enable(&have_p10); + return 0; +} +subsys_initcall(poly1305_p10_init); + +static void __exit poly1305_p10_exit(void) +{ +} +module_exit(poly1305_p10_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>"); +MODULE_DESCRIPTION("Optimized Poly1305 for P10"); diff --git a/arch/powerpc/lib/crypto/poly1305-p10le_64.S b/arch/powerpc/lib/crypto/poly1305-p10le_64.S new file mode 100644 index 000000000000..a3c1987f1ecd --- /dev/null +++ b/arch/powerpc/lib/crypto/poly1305-p10le_64.S @@ -0,0 +1,1075 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +# +# Accelerated poly1305 implementation for ppc64le. +# +# Copyright 2023- IBM Corp. All rights reserved +# +#=================================================================================== +# Written by Danny Tsen <dtsen@us.ibm.com> +# +# Poly1305 - this version mainly using vector/VSX/Scalar +# - 26 bits limbs +# - Handle multiple 64 byte blcok. 
+# +# Block size 16 bytes +# key = (r, s) +# clamp r &= 0x0FFFFFFC0FFFFFFC 0x0FFFFFFC0FFFFFFF +# p = 2^130 - 5 +# a += m +# a = (r + a) % p +# a += s +# +# Improve performance by breaking down polynominal to the sum of products with +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# +# 07/22/21 - this revison based on the above sum of products. Setup r^4, r^3, r^2, r and s3, s2, s1, s0 +# to 9 vectors for multiplications. +# +# setup r^4, r^3, r^2, r vectors +# vs [r^1, r^3, r^2, r^4] +# vs0 = [r0,.....] +# vs1 = [r1,.....] +# vs2 = [r2,.....] +# vs3 = [r3,.....] +# vs4 = [r4,.....] +# vs5 = [r1*5,...] +# vs6 = [r2*5,...] +# vs7 = [r2*5,...] +# vs8 = [r4*5,...] +# +# Each word in a vector consists a member of a "r/s" in [a * r/s]. +# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/asm-compat.h> +#include <linux/linkage.h> + +.machine "any" + +.text + +.macro SAVE_GPR GPR OFFSET FRAME + std \GPR,\OFFSET(\FRAME) +.endm + +.macro SAVE_VRS VRS OFFSET FRAME + li 16, \OFFSET + stvx \VRS, 16, \FRAME +.endm + +.macro SAVE_VSX VSX OFFSET FRAME + li 16, \OFFSET + stxvx \VSX, 16, \FRAME +.endm + +.macro RESTORE_GPR GPR OFFSET FRAME + ld \GPR,\OFFSET(\FRAME) +.endm + +.macro RESTORE_VRS VRS OFFSET FRAME + li 16, \OFFSET + lvx \VRS, 16, \FRAME +.endm + +.macro RESTORE_VSX VSX OFFSET FRAME + li 16, \OFFSET + lxvx \VSX, 16, \FRAME +.endm + +.macro SAVE_REGS + mflr 0 + std 0, 16(1) + stdu 1,-752(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + addi 9, 1, 256 + SAVE_VRS 20, 0, 9 + SAVE_VRS 21, 16, 9 + SAVE_VRS 22, 32, 9 + SAVE_VRS 23, 48, 9 + SAVE_VRS 24, 64, 9 + SAVE_VRS 25, 80, 9 + SAVE_VRS 26, 96, 9 + SAVE_VRS 27, 112, 9 + SAVE_VRS 28, 128, 9 + SAVE_VRS 29, 144, 9 + SAVE_VRS 30, 160, 9 + SAVE_VRS 31, 176, 9 + + SAVE_VSX 14, 192, 9 + SAVE_VSX 15, 208, 9 + SAVE_VSX 16, 224, 9 + SAVE_VSX 17, 240, 9 + SAVE_VSX 18, 256, 9 + SAVE_VSX 19, 272, 9 + SAVE_VSX 20, 288, 9 + SAVE_VSX 21, 304, 9 + SAVE_VSX 22, 320, 9 + SAVE_VSX 23, 336, 9 + SAVE_VSX 24, 352, 9 + SAVE_VSX 25, 368, 9 + SAVE_VSX 26, 384, 9 + SAVE_VSX 27, 400, 9 + SAVE_VSX 28, 416, 9 + SAVE_VSX 29, 432, 9 + SAVE_VSX 30, 448, 9 + SAVE_VSX 31, 464, 9 +.endm # SAVE_REGS + +.macro RESTORE_REGS + addi 9, 1, 256 + RESTORE_VRS 20, 0, 9 + RESTORE_VRS 21, 16, 9 + RESTORE_VRS 22, 32, 9 + RESTORE_VRS 23, 48, 9 + RESTORE_VRS 24, 64, 9 + RESTORE_VRS 25, 80, 9 + RESTORE_VRS 26, 96, 9 + RESTORE_VRS 27, 112, 9 + RESTORE_VRS 28, 128, 9 + RESTORE_VRS 29, 144, 9 + RESTORE_VRS 30, 160, 9 + RESTORE_VRS 31, 176, 9 + + RESTORE_VSX 14, 192, 9 + RESTORE_VSX 15, 208, 9 + RESTORE_VSX 16, 224, 9 + RESTORE_VSX 17, 240, 9 + RESTORE_VSX 18, 256, 9 + RESTORE_VSX 19, 272, 9 + RESTORE_VSX 20, 288, 9 + RESTORE_VSX 21, 304, 9 + RESTORE_VSX 22, 320, 9 + RESTORE_VSX 23, 336, 9 + RESTORE_VSX 24, 352, 9 + RESTORE_VSX 25, 368, 9 + RESTORE_VSX 26, 384, 9 + RESTORE_VSX 27, 400, 9 + RESTORE_VSX 28, 416, 9 + RESTORE_VSX 29, 432, 9 + RESTORE_VSX 30, 448, 9 + 
RESTORE_VSX 31, 464, 9 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 752 + ld 0, 16(1) + mtlr 0 +.endm # RESTORE_REGS + +# +# p[0] = a0*r0 + a1*r4*5 + a2*r3*5 + a3*r2*5 + a4*r1*5; +# p[1] = a0*r1 + a1*r0 + a2*r4*5 + a3*r3*5 + a4*r2*5; +# p[2] = a0*r2 + a1*r1 + a2*r0 + a3*r4*5 + a4*r3*5; +# p[3] = a0*r3 + a1*r2 + a2*r1 + a3*r0 + a4*r4*5; +# p[4] = a0*r4 + a1*r3 + a2*r2 + a3*r1 + a4*r0 ; +# +# [r^2, r^3, r^1, r^4] +# [m3, m2, m4, m1] +# +# multiply odd and even words +.macro mul_odd + vmulouw 14, 4, 26 + vmulouw 10, 5, 3 + vmulouw 11, 6, 2 + vmulouw 12, 7, 1 + vmulouw 13, 8, 0 + vmulouw 15, 4, 27 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vmulouw 10, 5, 26 + vmulouw 11, 6, 3 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vmulouw 12, 7, 2 + vmulouw 13, 8, 1 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + vmulouw 16, 4, 28 + vmulouw 10, 5, 27 + vmulouw 11, 6, 26 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vmulouw 12, 7, 3 + vmulouw 13, 8, 2 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + vmulouw 17, 4, 29 + vmulouw 10, 5, 28 + vmulouw 11, 6, 27 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vmulouw 12, 7, 26 + vmulouw 13, 8, 3 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + vmulouw 18, 4, 30 + vmulouw 10, 5, 29 + vmulouw 11, 6, 28 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vmulouw 12, 7, 27 + vmulouw 13, 8, 26 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +.macro mul_even + vmuleuw 9, 4, 26 + vmuleuw 10, 5, 3 + vmuleuw 11, 6, 2 + vmuleuw 12, 7, 1 + vmuleuw 13, 8, 0 + vaddudm 14, 14, 9 + vaddudm 14, 14, 10 + vaddudm 14, 14, 11 + vaddudm 14, 14, 12 + vaddudm 14, 14, 13 # x0 + + vmuleuw 9, 4, 27 + vmuleuw 10, 5, 26 + vmuleuw 11, 6, 3 + vmuleuw 12, 7, 2 + vmuleuw 13, 8, 1 + vaddudm 15, 15, 9 + vaddudm 15, 15, 10 + vaddudm 15, 15, 11 + vaddudm 15, 15, 12 + vaddudm 15, 15, 13 # x1 + + vmuleuw 9, 4, 28 + vmuleuw 10, 5, 27 + vmuleuw 11, 6, 26 + vmuleuw 12, 7, 3 + vmuleuw 13, 8, 2 + vaddudm 16, 16, 9 + vaddudm 16, 16, 10 + vaddudm 16, 16, 11 + vaddudm 16, 16, 12 + vaddudm 16, 16, 13 # x2 + + vmuleuw 9, 4, 29 + vmuleuw 10, 5, 28 + vmuleuw 11, 6, 27 + vmuleuw 12, 7, 26 + vmuleuw 13, 8, 3 + vaddudm 17, 17, 9 + vaddudm 17, 17, 10 + vaddudm 17, 17, 11 + vaddudm 17, 17, 12 + vaddudm 17, 17, 13 # x3 + + vmuleuw 9, 4, 30 + vmuleuw 10, 5, 29 + vmuleuw 11, 6, 28 + vmuleuw 12, 7, 27 + vmuleuw 13, 8, 26 + vaddudm 18, 18, 9 + vaddudm 18, 18, 10 + vaddudm 18, 18, 11 + vaddudm 18, 18, 12 + vaddudm 18, 18, 13 # x4 +.endm + +# +# poly1305_setup_r +# +# setup r^4, r^3, r^2, r vectors +# [r, r^3, r^2, r^4] +# vs0 = [r0,...] +# vs1 = [r1,...] +# vs2 = [r2,...] +# vs3 = [r3,...] +# vs4 = [r4,...] +# vs5 = [r4*5,...] +# vs6 = [r3*5,...] +# vs7 = [r2*5,...] +# vs8 = [r1*5,...] 
+# +# r0, r4*5, r3*5, r2*5, r1*5; +# r1, r0, r4*5, r3*5, r2*5; +# r2, r1, r0, r4*5, r3*5; +# r3, r2, r1, r0, r4*5; +# r4, r3, r2, r1, r0 ; +# +.macro poly1305_setup_r + + # save r + xxlor 26, 58, 58 + xxlor 27, 59, 59 + xxlor 28, 60, 60 + xxlor 29, 61, 61 + xxlor 30, 62, 62 + + xxlxor 31, 31, 31 + +# [r, r^3, r^2, r^4] + # compute r^2 + vmr 4, 26 + vmr 5, 27 + vmr 6, 28 + vmr 7, 29 + vmr 8, 30 + bl do_mul # r^2 r^1 + xxpermdi 58, 58, 36, 0x3 # r0 + xxpermdi 59, 59, 37, 0x3 # r1 + xxpermdi 60, 60, 38, 0x3 # r2 + xxpermdi 61, 61, 39, 0x3 # r3 + xxpermdi 62, 62, 40, 0x3 # r4 + xxpermdi 36, 36, 36, 0x3 + xxpermdi 37, 37, 37, 0x3 + xxpermdi 38, 38, 38, 0x3 + xxpermdi 39, 39, 39, 0x3 + xxpermdi 40, 40, 40, 0x3 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + bl do_mul # r^4 r^3 + vmrgow 26, 26, 4 + vmrgow 27, 27, 5 + vmrgow 28, 28, 6 + vmrgow 29, 29, 7 + vmrgow 30, 30, 8 + vspltisb 13, 2 + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 + + # r^2 r^4 + xxlor 0, 58, 58 + xxlor 1, 59, 59 + xxlor 2, 60, 60 + xxlor 3, 61, 61 + xxlor 4, 62, 62 + xxlor 5, 32, 32 + xxlor 6, 33, 33 + xxlor 7, 34, 34 + xxlor 8, 35, 35 + + vspltw 9, 26, 3 + vspltw 10, 26, 2 + vmrgow 26, 10, 9 + vspltw 9, 27, 3 + vspltw 10, 27, 2 + vmrgow 27, 10, 9 + vspltw 9, 28, 3 + vspltw 10, 28, 2 + vmrgow 28, 10, 9 + vspltw 9, 29, 3 + vspltw 10, 29, 2 + vmrgow 29, 10, 9 + vspltw 9, 30, 3 + vspltw 10, 30, 2 + vmrgow 30, 10, 9 + + vsld 9, 27, 13 + vsld 10, 28, 13 + vsld 11, 29, 13 + vsld 12, 30, 13 + vaddudm 0, 9, 27 + vaddudm 1, 10, 28 + vaddudm 2, 11, 29 + vaddudm 3, 12, 30 +.endm + +SYM_FUNC_START_LOCAL(do_mul) + mul_odd + + # do reduction ( h %= p ) + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + blr +SYM_FUNC_END(do_mul) + +# +# init key +# +.macro do_poly1305_init + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + + ld 11, 0(10) + ld 12, 8(10) + + li 14, 16 + li 15, 32 + addis 10, 2, cnum@toc@ha + addi 10, 10, cnum@toc@l + lvx 25, 0, 10 # v25 - mask + lvx 31, 14, 10 # v31 = 1a + lvx 19, 15, 10 # v19 = 1 << 24 + lxv 24, 48(10) # vs24 + lxv 25, 64(10) # vs25 + + # initialize + # load key from r3 to vectors + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 + and. 
10, 10, 12 + + # break 26 bits + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 58, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 59, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 60, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 61, 0, 17 + mtvsrdd 62, 0, 18 + + # r1 = r1 * 5, r2 = r2 * 5, r3 = r3 * 5, r4 = r4 * 5 + li 9, 5 + mtvsrdd 36, 0, 9 + vmulouw 0, 27, 4 # v0 = rr0 + vmulouw 1, 28, 4 # v1 = rr1 + vmulouw 2, 29, 4 # v2 = rr2 + vmulouw 3, 30, 4 # v3 = rr3 +.endm + +# +# poly1305_p10le_4blocks( uint8_t *k, uint32_t mlen, uint8_t *m) +# k = 32 bytes key +# r3 = k (r, s) +# r4 = mlen +# r5 = m +# +SYM_FUNC_START(poly1305_p10le_4blocks) +.align 5 + cmpdi 5, 64 + blt Out_no_poly1305 + + SAVE_REGS + + do_poly1305_init + + li 21, 0 # counter to message + + poly1305_setup_r + + # load previous H state + # break/convert r6 to 26 bits + ld 9, 0(3) + ld 10, 8(3) + ld 19, 16(3) + sldi 19, 19, 24 + mtvsrdd 41, 0, 19 + extrdi 14, 9, 26, 38 + extrdi 15, 9, 26, 12 + extrdi 16, 9, 12, 0 + mtvsrdd 36, 0, 14 + insrdi 16, 10, 14, 38 + mtvsrdd 37, 0, 15 + extrdi 17, 10, 26, 24 + mtvsrdd 38, 0, 16 + extrdi 18, 10, 24, 0 + mtvsrdd 39, 0, 17 + mtvsrdd 40, 0, 18 + vor 8, 8, 9 + + # input m1 m2 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 20, 4, 9 + vaddudm 21, 5, 10 + vaddudm 22, 6, 11 + vaddudm 23, 7, 12 + vaddudm 24, 8, 13 + + # m3 m4 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + vand 9, 14, 25 # a0 + vsrd 10, 14, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + vand 10, 10, 25 # a1 + vspltisb 13, 12 + vand 16, 15, 25 + vsld 12, 16, 13 + vspltisb 13, 14 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vsrd 12, 15, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 20 + vmrgow 5, 10, 21 + vmrgow 6, 11, 22 + vmrgow 7, 12, 23 + vmrgow 8, 13, 24 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + li 9, 64 + divdu 31, 5, 9 + + cmpdi 31, 0 + ble Skip_block_loop + + mtctr 31 + +# h4 = m1 * r⁴ + m2 * r³ + m3 * r² + m4 * r +# Rewrite the polynominal sum of product as follows, +# h1 = (h0 + m1) * r^2, h2 = (h0 + m2) * r^2 +# h3 = (h1 + m3) * r^2, h4 = (h2 + m4) * r^2 --> (h0 + m1) r*4 + (h3 + m3) r^2, (h0 + m2) r^4 + (h0 + m4) r^2 +# .... 
Repeat +# h5 = (h3 + m5) * r^2, h6 = (h4 + m6) * r^2 --> +# h7 = (h5 + m7) * r^2, h8 = (h6 + m8) * r^1 --> m5 * r^4 + m6 * r^3 + m7 * r^2 + m8 * r +# +loop_4blocks: + + # Multiply odd words and even words + mul_odd + mul_even + # carry reduction + vspltisb 9, 2 + vsrd 10, 14, 31 + vsrd 11, 17, 31 + vand 7, 17, 25 + vand 4, 14, 25 + vaddudm 18, 18, 11 + vsrd 12, 18, 31 + vaddudm 15, 15, 10 + + vsrd 11, 15, 31 + vand 8, 18, 25 + vand 5, 15, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 16, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vaddudm 8, 8, 11 + + # input m1 m2 m3 m4 + add 20, 4, 21 + xxlor 49, 24, 24 + xxlor 50, 25, 25 + lxvw4x 43, 0, 20 + addi 17, 20, 16 + lxvw4x 44, 0, 17 + vperm 14, 11, 12, 17 + vperm 15, 11, 12, 18 + addi 17, 17, 16 + lxvw4x 43, 0, 17 + addi 17, 17, 16 + lxvw4x 44, 0, 17 + vperm 17, 11, 12, 17 + vperm 18, 11, 12, 18 + + vand 20, 14, 25 # a0 + vand 9, 17, 25 # a0 + vsrd 21, 14, 31 # >> 26 + vsrd 22, 21, 31 # 12 bits left + vsrd 10, 17, 31 # >> 26 + vsrd 11, 10, 31 # 12 bits left + + vand 21, 21, 25 # a1 + vand 10, 10, 25 # a1 + + vspltisb 13, 12 + vand 16, 15, 25 + vsld 23, 16, 13 + vor 22, 22, 23 + vand 22, 22, 25 # a2 + vand 16, 18, 25 + vsld 12, 16, 13 + vor 11, 11, 12 + vand 11, 11, 25 # a2 + vspltisb 13, 14 + vsrd 23, 15, 13 # >> 14 + vsrd 24, 23, 31 # >> 26, a4 + vand 23, 23, 25 # a3 + vsrd 12, 18, 13 # >> 14 + vsrd 13, 12, 31 # >> 26, a4 + vand 12, 12, 25 # a3 + + vaddudm 4, 4, 20 + vaddudm 5, 5, 21 + vaddudm 6, 6, 22 + vaddudm 7, 7, 23 + vaddudm 8, 8, 24 + + # Smash 4 message blocks into 5 vectors of [m4, m2, m3, m1] + vmrgow 4, 9, 4 + vmrgow 5, 10, 5 + vmrgow 6, 11, 6 + vmrgow 7, 12, 7 + vmrgow 8, 13, 8 + vaddudm 8, 8, 19 + + addi 5, 5, -64 # len -= 64 + addi 21, 21, 64 # offset += 64 + + bdnz loop_4blocks + +Skip_block_loop: + xxlor 58, 0, 0 + xxlor 59, 1, 1 + xxlor 60, 2, 2 + xxlor 61, 3, 3 + xxlor 62, 4, 4 + xxlor 32, 5, 5 + xxlor 33, 6, 6 + xxlor 34, 7, 7 + xxlor 35, 8, 8 + + # Multiply odd words and even words + mul_odd + mul_even + + # Sum the products. 
+ xxpermdi 41, 31, 46, 0 + xxpermdi 42, 31, 47, 0 + vaddudm 4, 14, 9 + xxpermdi 36, 31, 36, 3 + vaddudm 5, 15, 10 + xxpermdi 37, 31, 37, 3 + xxpermdi 43, 31, 48, 0 + vaddudm 6, 16, 11 + xxpermdi 38, 31, 38, 3 + xxpermdi 44, 31, 49, 0 + vaddudm 7, 17, 12 + xxpermdi 39, 31, 39, 3 + xxpermdi 45, 31, 50, 0 + vaddudm 8, 18, 13 + xxpermdi 40, 31, 40, 3 + + # carry reduction + vspltisb 9, 2 + vsrd 10, 4, 31 + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 8, 8, 11 + vsrd 12, 8, 31 + vaddudm 5, 5, 10 + + vsrd 11, 5, 31 + vand 8, 8, 25 + vand 5, 5, 25 + vaddudm 4, 4, 12 + vsld 10, 12, 9 + vaddudm 6, 6, 11 + + vsrd 13, 6, 31 + vand 6, 6, 25 + vaddudm 4, 4, 10 + vsrd 10, 4, 31 + vaddudm 7, 7, 13 + + vsrd 11, 7, 31 + vand 7, 7, 25 + vand 4, 4, 25 + vaddudm 5, 5, 10 + vsrd 10, 5, 31 + vand 5, 5, 25 + vaddudm 6, 6, 10 + vaddudm 8, 8, 11 + + b do_final_update + +do_final_update: + # combine 26 bit limbs + # v4, v5, v6, v7 and v8 are 26 bit vectors + vsld 5, 5, 31 + vor 20, 4, 5 + vspltisb 11, 12 + vsrd 12, 6, 11 + vsld 6, 6, 31 + vsld 6, 6, 31 + vor 20, 20, 6 + vspltisb 11, 14 + vsld 7, 7, 11 + vor 21, 7, 12 + mfvsrld 16, 40 # save last 2 bytes + vsld 8, 8, 11 + vsld 8, 8, 31 + vor 21, 21, 8 + mfvsrld 17, 52 + mfvsrld 19, 53 + srdi 16, 16, 24 + + std 17, 0(3) + std 19, 8(3) + stw 16, 16(3) + +Out_loop: + li 3, 0 + + RESTORE_REGS + + blr + +Out_no_poly1305: + li 3, 0 + blr +SYM_FUNC_END(poly1305_p10le_4blocks) + +# +# ======================================================================= +# The following functions implement 64 x 64 bits multiplication poly1305. +# +SYM_FUNC_START_LOCAL(Poly1305_init_64) + # mask 0x0FFFFFFC0FFFFFFC + # mask 0x0FFFFFFC0FFFFFFF + addis 10, 2, rmask@toc@ha + addi 10, 10, rmask@toc@l + ld 11, 0(10) + ld 12, 8(10) + + # initialize + # load key from r3 + ld 9, 24(3) + ld 10, 32(3) + and. 9, 9, 11 # cramp mask r0 + and. 10, 10, 12 # cramp mask r1 + + srdi 21, 10, 2 + add 19, 21, 10 # s1: r19 - (r1 >> 2) *5 + + # setup r and s + li 25, 0 + mtvsrdd 32+0, 9, 19 # r0, s1 + mtvsrdd 32+1, 10, 9 # r1, r0 + mtvsrdd 32+2, 19, 25 # s1 + mtvsrdd 32+3, 9, 25 # r0 + + blr +SYM_FUNC_END(Poly1305_init_64) + +# Poly1305_mult +# v6 = (h0, h1), v8 = h2 +# v0 = (r0, s1), v1 = (r1, r0), v2 = s1, v3 = r0 +# +# Output: v7, v10, v11 +# +SYM_FUNC_START_LOCAL(Poly1305_mult) + # + # d0 = h0 * r0 + h1 * s1 + vmsumudm 7, 6, 0, 9 # h0 * r0, h1 * s1 + + # d1 = h0 * r1 + h1 * r0 + h2 * s1 + vmsumudm 11, 6, 1, 9 # h0 * r1, h1 * r0 + vmsumudm 10, 8, 2, 11 # d1 += h2 * s1 + + # d2 = r0 + vmsumudm 11, 8, 3, 9 # d2 = h2 * r0 + blr +SYM_FUNC_END(Poly1305_mult) + +# +# carry reduction +# h %=p +# +# Input: v7, v10, v11 +# Output: r27, r28, r29 +# +SYM_FUNC_START_LOCAL(Carry_reduction) + mfvsrld 27, 32+7 + mfvsrld 28, 32+10 + mfvsrld 29, 32+11 + mfvsrd 20, 32+7 # h0.h + mfvsrd 21, 32+10 # h1.h + + addc 28, 28, 20 + adde 29, 29, 21 + srdi 22, 29, 0x2 + sldi 23, 22, 0x2 + add 23, 23, 22 # (h2 & 3) * 5 + addc 27, 27, 23 # h0 + addze 28, 28 # h1 + andi. 
29, 29, 0x3 # h2 + blr +SYM_FUNC_END(Carry_reduction) + +# +# poly1305 multiplication +# h *= r, h %= p +# d0 = h0 * r0 + h1 * s1 +# d1 = h0 * r1 + h1 * r0 + h2 * s1 +# d2 = h0 * r0 +# +# +# unsigned int poly1305_test_64s(unisgned char *state, const byte *src, size_t len, highbit) +# - no highbit if final leftover block (highbit = 0) +# +SYM_FUNC_START(poly1305_64s) + cmpdi 5, 0 + ble Out_no_poly1305_64 + + mflr 0 + std 0, 16(1) + stdu 1,-400(1) + + SAVE_GPR 14, 112, 1 + SAVE_GPR 15, 120, 1 + SAVE_GPR 16, 128, 1 + SAVE_GPR 17, 136, 1 + SAVE_GPR 18, 144, 1 + SAVE_GPR 19, 152, 1 + SAVE_GPR 20, 160, 1 + SAVE_GPR 21, 168, 1 + SAVE_GPR 22, 176, 1 + SAVE_GPR 23, 184, 1 + SAVE_GPR 24, 192, 1 + SAVE_GPR 25, 200, 1 + SAVE_GPR 26, 208, 1 + SAVE_GPR 27, 216, 1 + SAVE_GPR 28, 224, 1 + SAVE_GPR 29, 232, 1 + SAVE_GPR 30, 240, 1 + SAVE_GPR 31, 248, 1 + + # Init poly1305 + bl Poly1305_init_64 + + li 25, 0 # offset to inp and outp + + add 11, 25, 4 + + # load h + # h0, h1, h2? + ld 27, 0(3) + ld 28, 8(3) + lwz 29, 16(3) + + li 30, 16 + divdu 31, 5, 30 + + mtctr 31 + + mr 24, 6 # highbit + +Loop_block_64: + vxor 9, 9, 9 + + ld 20, 0(11) + ld 21, 8(11) + addi 11, 11, 16 + + addc 27, 27, 20 + adde 28, 28, 21 + adde 29, 29, 24 + + li 22, 0 + mtvsrdd 32+6, 27, 28 # h0, h1 + mtvsrdd 32+8, 29, 22 # h2 + + bl Poly1305_mult + + bl Carry_reduction + + bdnz Loop_block_64 + + std 27, 0(3) + std 28, 8(3) + stw 29, 16(3) + + li 3, 0 + + RESTORE_GPR 14, 112, 1 + RESTORE_GPR 15, 120, 1 + RESTORE_GPR 16, 128, 1 + RESTORE_GPR 17, 136, 1 + RESTORE_GPR 18, 144, 1 + RESTORE_GPR 19, 152, 1 + RESTORE_GPR 20, 160, 1 + RESTORE_GPR 21, 168, 1 + RESTORE_GPR 22, 176, 1 + RESTORE_GPR 23, 184, 1 + RESTORE_GPR 24, 192, 1 + RESTORE_GPR 25, 200, 1 + RESTORE_GPR 26, 208, 1 + RESTORE_GPR 27, 216, 1 + RESTORE_GPR 28, 224, 1 + RESTORE_GPR 29, 232, 1 + RESTORE_GPR 30, 240, 1 + RESTORE_GPR 31, 248, 1 + + addi 1, 1, 400 + ld 0, 16(1) + mtlr 0 + + blr + +Out_no_poly1305_64: + li 3, 0 + blr +SYM_FUNC_END(poly1305_64s) + +# +# Input: r3 = h, r4 = s, r5 = mac +# mac = h + s +# +SYM_FUNC_START(poly1305_emit_64) + ld 10, 0(3) + ld 11, 8(3) + ld 12, 16(3) + + # compare modulus + # h + 5 + (-p) + mr 6, 10 + mr 7, 11 + mr 8, 12 + addic. 6, 6, 5 + addze 7, 7 + addze 8, 8 + srdi 9, 8, 2 # overflow? 
+ cmpdi 9, 0 + beq Skip_h64 + mr 10, 6 + mr 11, 7 + mr 12, 8 + +Skip_h64: + ld 6, 0(4) + ld 7, 8(4) + addc 10, 10, 6 + adde 11, 11, 7 + addze 12, 12 + + std 10, 0(5) + std 11, 8(5) + blr +SYM_FUNC_END(poly1305_emit_64) + +SYM_DATA_START_LOCAL(RMASK) +.align 5 +rmask: +.byte 0xff, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f, 0xfc, 0xff, 0xff, 0x0f +cnum: +.long 0x03ffffff, 0x00000000, 0x03ffffff, 0x00000000 +.long 0x1a, 0x00, 0x1a, 0x00 +.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 +.long 0x00010203, 0x04050607, 0x10111213, 0x14151617 +.long 0x08090a0b, 0x0c0d0e0f, 0x18191a1b, 0x1c1d1e1f +SYM_DATA_END(RMASK) diff --git a/arch/powerpc/lib/crypto/sha256-spe-asm.S b/arch/powerpc/lib/crypto/sha256-spe-asm.S new file mode 100644 index 000000000000..cd99d71dae34 --- /dev/null +++ b/arch/powerpc/lib/crypto/sha256-spe-asm.S @@ -0,0 +1,318 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Fast SHA-256 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash values in memory */ +#define rKP r24 /* pointer to round constants */ +#define rWP r4 /* pointer to input data */ + +#define rH0 r5 /* 8 32 bit hash values in 8 registers */ +#define rH1 r6 +#define rH2 r7 +#define rH3 r8 +#define rH4 r9 +#define rH5 r10 +#define rH6 r11 +#define rH7 r12 + +#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rT0 r22 /* 64 bit temporaries */ +#define rT1 r23 +#define rT2 r0 /* 32 bit temporaries */ +#define rT3 r25 + +#define CMP_KN_LOOP +#define CMP_KC_LOOP \ + cmpwi rT1,0; + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); \ + stw r24,88(r1); /* save normal registers */ \ + stw r25,92(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + lwz r24,88(r1); /* restore normal registers */ \ + lwz r25,92(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. 
Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* was already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ + LOAD_DATA(w, off) /* 1: W */ \ + rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ + rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ + rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ + and rT3,e,f; /* 1: ch = e and f */ \ + xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ + andc rT1,g,e; /* 1: ch' = ~e and g */ \ + lwz rT2,off(rKP); /* 1: K */ \ + xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ + add h,h,rT0; /* 1: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 1: temp1' = ch + w */ \ + rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ + add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ + rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + K */ \ + rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ + evmergelo w,w,w; /* shift W */ \ + or rT2,a,b; /* 1: maj = a or b */ \ + and rT1,a,b; /* 1: maj' = a and b */ \ + and rT2,rT2,c; /* 1: maj = maj and c */ \ + LOAD_DATA(w, off+4) /* 2: W */ \ + or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + add h,h,rT3; /* 1: h = temp1 + temp2 */ \ + rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + lwz rT2,off+4(rKP); /* 2: K */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 2: temp1' = ch + w */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + add g,g,rT2; /* 2: temp1 = temp1 + K */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ + rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ + evmergelohi rT0,w0,w1; /* w[-15] */ \ + rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ + evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ + rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ + evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ + add h,h,rT2; /* 1: temp1 = h + S1 */ \ + evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ + and 
rT2,e,f; /* 1: ch = e and f */ \ + evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ + andc rT3,g,e; /* 1: ch' = ~e and g */ \ + evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ + xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ + evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ + evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ + rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evldw rT1,off(rKP); /* k */ \ + rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ + evaddw w0,w0,rT0; /* w = w + s1 */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evmergelohi rT0,w4,w5; /* w[-7] */ \ + and rT3,a,b; /* 1: maj = a and b */ \ + evaddw w0,w0,rT0; /* w = w + w[-7] */ \ + CMP_K##k##_LOOP \ + add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ + evaddw rT1,rT1,w0; /* wk = w + k */ \ + xor rT3,a,b; /* 1: maj = a xor b */ \ + evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ + and rT3,rT3,c; /* 1: maj = maj and c */ \ + add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ + add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ + add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add h,h,rT2; /* 1: h = temp1 + temp2 */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +_GLOBAL(ppc_spe_sha256_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + lwz rH4,16(rHP) + lwz rH5,20(rHP) + lwz rH6,24(rHP) + lwz rH7,28(rHP) + +ppc_spe_sha256_main: + lis rKP,PPC_SPE_SHA256_K@ha + addi rKP,rKP,PPC_SPE_SHA256_K@l + + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) +ppc_spe_sha256_16_rounds: + addi rKP,rKP,64 + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW0, rW1, rW4, rW5, rW7, N, 0) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW1, rW2, rW5, rW6, rW0, N, 8) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW2, rW3, rW6, rW7, rW1, N, 16) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW3, rW4, rW7, rW0, rW2, N, 24) + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW4, rW5, rW0, rW1, rW3, N, 32) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, 
+	R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5,
+		 rW5, rW6, rW1, rW2, rW4, N, 40)
+	R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3,
+		 rW6, rW7, rW2, rW3, rW5, N, 48)
+	R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1,
+		 rW7, rW0, rW3, rW4, rW6, C, 56)
+	bt	gt,ppc_spe_sha256_16_rounds
+
+	lwz	rW0,0(rHP)
+	NEXT_BLOCK
+	lwz	rW1,4(rHP)
+	lwz	rW2,8(rHP)
+	lwz	rW3,12(rHP)
+	lwz	rW4,16(rHP)
+	lwz	rW5,20(rHP)
+	lwz	rW6,24(rHP)
+	lwz	rW7,28(rHP)
+
+	add	rH0,rH0,rW0
+	stw	rH0,0(rHP)
+	add	rH1,rH1,rW1
+	stw	rH1,4(rHP)
+	add	rH2,rH2,rW2
+	stw	rH2,8(rHP)
+	add	rH3,rH3,rW3
+	stw	rH3,12(rHP)
+	add	rH4,rH4,rW4
+	stw	rH4,16(rHP)
+	add	rH5,rH5,rW5
+	stw	rH5,20(rHP)
+	add	rH6,rH6,rW6
+	stw	rH6,24(rHP)
+	add	rH7,rH7,rW7
+	stw	rH7,28(rHP)
+
+	bdnz	ppc_spe_sha256_main
+
+	FINALIZE
+	blr
+
+.data
+.align 5
+PPC_SPE_SHA256_K:
+	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
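For reference, the R_LOAD_W and R_CALC_W macros above interleave two rounds of the standard FIPS 180-4 SHA-256 compression function per invocation, using the equivalent formulation maj = (a & b) | ((a | b) & c). Below is a minimal C sketch of one round and one message-schedule step, illustrative only and not part of the patch; the helper names are made up here.

#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One compression round: what the "1:"/"2:" comment columns compute. */
void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint32_t S1    = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
	uint32_t ch    = (e & f) ^ (~e & g);
	uint32_t temp1 = h + S1 + ch + k + w;

	uint32_t S0    = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
	uint32_t maj   = (a & b) | ((a | b) & c);   /* == (a&b)^(a&c)^(b&c) */
	uint32_t temp2 = S0 + maj;

	/* The asm avoids this shuffle by renaming registers per macro call. */
	s[7] = g; s[6] = f; s[5] = e; s[4] = d + temp1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = temp1 + temp2;
}

/* One schedule step over a rolling 16-word window (caller stores the
 * result back into w[t & 15]): what R_CALC_W's s0/s1 lines compute. */
uint32_t sha256_schedule(const uint32_t w[16], unsigned int t)
{
	uint32_t w15 = w[(t - 15) & 15];
	uint32_t w2  = w[(t - 2) & 15];
	uint32_t s0  = ror32(w15, 7) ^ ror32(w15, 18) ^ (w15 >> 3);
	uint32_t s1  = ror32(w2, 17) ^ ror32(w2, 19) ^ (w2 >> 10);

	return w[t & 15] + s0 + w[(t - 7) & 15] + s1;
}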
diff --git a/arch/powerpc/lib/crypto/sha256.c b/arch/powerpc/lib/crypto/sha256.c
new file mode 100644
index 000000000000..6b0f079587eb
--- /dev/null
+++ b/arch/powerpc/lib/crypto/sha256.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 Secure Hash Algorithm, SPE optimized
+ *
+ * Based on generic implementation. The assembler module takes care
+ * about the SPE registers so it can run from interrupt context.
+ *
+ * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
+ */
+
+#include <asm/switch_to.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+
+/*
+ * MAX_BYTES defines the number of bytes that are allowed to be processed
+ * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000
+ * operations per 64 bytes. e500 cores can issue two arithmetic instructions
+ * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2).
+ * Thus 1KB of input data will need an estimated maximum of 18,000 cycles.
+ * Headroom for cache misses included. Even with the low end model clocked
+ * at 667 MHz this equals to a critical time window of less than 27us.
+ *
+ */
+#define MAX_BYTES 1024
+
+extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks);
+
+static void spe_begin(void)
+{
+	/* We just start SPE operations and will save SPE registers later. */
+	preempt_disable();
+	enable_kernel_spe();
+}
+
+static void spe_end(void)
+{
+	disable_kernel_spe();
+	/* reenable preemption */
+	preempt_enable();
+}
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+			const u8 *data, size_t nblocks)
+{
+	do {
+		/* cut input data into smaller blocks */
+		u32 unit = min_t(size_t, nblocks,
+				 MAX_BYTES / SHA256_BLOCK_SIZE);
+
+		spe_begin();
+		ppc_spe_sha256_transform(state, data, unit);
+		spe_end();
+
+		data += unit * SHA256_BLOCK_SIZE;
+		nblocks -= unit;
+	} while (nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+	return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 Secure Hash Algorithm, SPE optimized");
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index 58ed6bd613a6..54340912398f 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -45,7 +45,7 @@ int exit_vmx_usercopy(void)
 	 * set and we are preemptible. The hack here is to schedule a
 	 * decrementer to fire here and reschedule for us if necessary.
 	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION) && need_resched())
+	if (need_irq_preemption() && need_resched())
 		set_dec(1);
 	return 0;
 }
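The MAX_BYTES budget in sha256.c above follows from a simple worst-case estimate. A standalone sketch of that arithmetic is shown below; it is illustrative only, and the 12.5% headroom factor is an assumption chosen here to reproduce the ~18,000-cycle figure quoted in the comment.

/* Back-of-the-envelope check of the MAX_BYTES preempt-off budget. */
#include <stdio.h>

int main(void)
{
	const double ops_per_block = 2000.0;   /* ~ops per 64-byte SHA-256 block */
	const double issue_width   = 2.0;      /* e500: SU1 + SU2, two ops/cycle */
	const double clock_hz      = 667e6;    /* low-end core clock             */
	const double max_bytes     = 1024.0;   /* MAX_BYTES                      */

	double blocks = max_bytes / 64.0;                      /* 16             */
	double cycles = blocks * ops_per_block / issue_width;  /* 16,000         */
	cycles *= 1.125;                /* assumed headroom for cache misses etc. */

	printf("~%.0f cycles, ~%.1f us with preemption disabled\n",
	       cycles, cycles / clock_hz * 1e6);               /* ~18,000, ~27.0 */
	return 0;
}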