Diffstat (limited to 'arch')
177 files changed, 5302 insertions, 5223 deletions
diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h index d37559762180..be08a8da046f 100644 --- a/arch/arm/include/asm/simd.h +++ b/arch/arm/include/asm/simd.h @@ -8,7 +8,8 @@ static __must_check inline bool may_use_simd(void) { - return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq(); + return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq() + && !irqs_disabled(); } #endif /* _ASM_SIMD_H */ diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 748698e91a4b..27e64f782cb3 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -515,7 +515,5 @@ void __init early_ioremap_init(void) bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags) { - unsigned long pfn = PHYS_PFN(offset); - - return memblock_is_map_memory(pfn); + return memblock_is_map_memory(offset); } diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 7803d50b90f8..e559ad3cd148 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -877,6 +877,7 @@ void kernel_neon_begin(void) * the kernel mode NEON register contents never need to be preserved. */ BUG_ON(in_hardirq()); + BUG_ON(irqs_disabled()); cpu = __smp_processor_id(); fpexc = fmrx(FPEXC) | FPEXC_EN; diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 1e7c7475e43f..ba5df0df02a4 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -298,19 +298,6 @@ .Lskip_gcs_\@: .endm -.macro __init_el2_mpam - /* Memory Partitioning And Monitoring: disable EL2 traps */ - mrs x1, id_aa64pfr0_el1 - ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 - cbz x0, .Lskip_mpam_\@ // skip if no MPAM - msr_s SYS_MPAM2_EL2, xzr // use the default partition - // and disable lower traps - mrs_s x0, SYS_MPAMIDR_EL1 - tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg - msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 -.Lskip_mpam_\@: -.endm - /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. 
Note that everything gets initialised as @@ -328,7 +315,6 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr - __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt @@ -375,6 +361,16 @@ #endif .macro finalise_el2_state + check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 + +.Linit_mpam_\@: + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 + +.Lskip_mpam_\@: check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 .Linit_sve_\@: /* SVE register access */ diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 9e93733523f6..74a4f738c5f5 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -58,7 +58,7 @@ #define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(SWAPPER_PGTABLE_LEVELS, KIMAGE_VADDR, _end, EXTRA_PAGE) \ + EARLY_SEGMENT_EXTRA_PAGES)) -#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, _end, 1)) +#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, kimage_limit, 1)) #define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + EARLY_IDMAP_EXTRA_PAGES) * PAGE_SIZE) #define INIT_IDMAP_FDT_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, 0UL, UL(MAX_FDT_SIZE), 1) - 1) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index eba1a98657f1..aa9efee17277 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -323,13 +323,14 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) } /* - * If mprotect/munmap/etc occurs during TLB batched flushing, we need to - * synchronise all the TLBI issued with a DSB to avoid the race mentioned in - * flush_tlb_batched_pending(). + * If mprotect/munmap/etc occurs during TLB batched flushing, we need to ensure + * all the previously issued TLBIs targeting mm have completed. But since we + * can be executing on a remote CPU, a DSB cannot guarantee this like it can + * for arch_tlbbatch_flush(). Our only option is to flush the entire mm. 
*/ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { - dsb(ish); + flush_tlb_mm(mm); } /* diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 45ea79cacf46..b34044e20128 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1199,8 +1199,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1453,7 +1455,8 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, info->reg_mpamidr, boot->reg_mpamidr); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 94525abd1c22..c1f2b6b04b41 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -496,8 +496,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) - info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. + */ if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5a69b6eb4090..714b0b5ec5ac 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,10 @@ #error This file should only be included in vmlinux.lds.S #endif +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) +#endif + #define PI_EXPORT_SYM(sym) \ __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) #define __PI_EXPORT_SYM(sym, pisym, msg)\ @@ -140,4 +144,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. 
+ */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + +#undef ASSERT + #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index c6b185b885f7..bc57b290e5e7 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -127,6 +127,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), {} }, }; @@ -154,6 +155,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), {} }, }; @@ -246,6 +248,7 @@ static const struct { { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, }; static int __init parse_hexdigit(const char *p, u64 *v) diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index bca37ddf974b..41a00fa860c1 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -11,6 +11,7 @@ platform-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon/ platform-$(CONFIG_EYEQ) += mobileye/ platform-$(CONFIG_MIPS_COBALT) += cobalt/ platform-$(CONFIG_MACH_DECSTATION) += dec/ +platform-$(CONFIG_ECONET) += econet/ platform-$(CONFIG_MIPS_GENERIC) += generic/ platform-$(CONFIG_MACH_JAZZ) += jazz/ platform-$(CONFIG_LANTIQ) += lantiq/ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index fc0772c1bad4..1e48184ecf1e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -391,6 +391,31 @@ config MACH_DECSTATION otherwise choose R3000. +config ECONET + bool "EcoNet MIPS family" + select BOOT_RAW + select CPU_BIG_ENDIAN + select DEBUG_ZBOOT if DEBUG_KERNEL + select EARLY_PRINTK_8250 + select ECONET_EN751221_TIMER + select SERIAL_8250 + select SERIAL_OF_PLATFORM + select SYS_SUPPORTS_BIG_ENDIAN + select SYS_HAS_CPU_MIPS32_R1 + select SYS_HAS_CPU_MIPS32_R2 + select SYS_HAS_EARLY_PRINTK + select SYS_SUPPORTS_32BIT_KERNEL + select SYS_SUPPORTS_MIPS16 + select SYS_SUPPORTS_ZBOOT_UART16550 + select USE_GENERIC_EARLY_PRINTK_8250 + select USE_OF + help + EcoNet EN75xx MIPS devices are big endian MIPS machines used + in XPON (fiber) and DSL applications. They have SPI, PCI, USB, + GPIO, and Ethernet, with optional XPON, DSL, and VoIP DSP cores. + Don't confuse these with the Airoha ARM devices sometimes referred + to as "EcoNet", this family is for MIPS based devices only. + config MACH_JAZZ bool "Jazz family of machines" select ARC_MEMORY @@ -617,6 +642,7 @@ config EYEQ select USB_UHCI_BIG_ENDIAN_DESC if CPU_BIG_ENDIAN select USB_UHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select USE_OF + select HOTPLUG_PARALLEL if SMP help Select this to build a kernel supporting EyeQ SoC from Mobileye. 
@@ -1020,6 +1046,7 @@ source "arch/mips/ath79/Kconfig" source "arch/mips/bcm47xx/Kconfig" source "arch/mips/bcm63xx/Kconfig" source "arch/mips/bmips/Kconfig" +source "arch/mips/econet/Kconfig" source "arch/mips/generic/Kconfig" source "arch/mips/ingenic/Kconfig" source "arch/mips/jazz/Kconfig" @@ -2287,6 +2314,7 @@ config MIPS_CPS select MIPS_CM select MIPS_CPS_PM if HOTPLUG_CPU select SMP + select HOTPLUG_SMT if HOTPLUG_PARALLEL select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU diff --git a/arch/mips/alchemy/common/gpiolib.c b/arch/mips/alchemy/common/gpiolib.c index 1b16daaa86ae..411f70ceb762 100644 --- a/arch/mips/alchemy/common/gpiolib.c +++ b/arch/mips/alchemy/common/gpiolib.c @@ -119,9 +119,11 @@ static int alchemy_gpic_get(struct gpio_chip *chip, unsigned int off) return !!au1300_gpio_get_value(off + AU1300_GPIO_BASE); } -static void alchemy_gpic_set(struct gpio_chip *chip, unsigned int off, int v) +static int alchemy_gpic_set(struct gpio_chip *chip, unsigned int off, int v) { au1300_gpio_set_value(off + AU1300_GPIO_BASE, v); + + return 0; } static int alchemy_gpic_dir_input(struct gpio_chip *chip, unsigned int off) @@ -145,7 +147,7 @@ static struct gpio_chip au1300_gpiochip = { .direction_input = alchemy_gpic_dir_input, .direction_output = alchemy_gpic_dir_output, .get = alchemy_gpic_get, - .set = alchemy_gpic_set, + .set_rv = alchemy_gpic_set, .to_irq = alchemy_gpic_gpio_to_irq, .base = AU1300_GPIO_BASE, .ngpio = AU1300_GPIO_NUM, diff --git a/arch/mips/bcm63xx/boards/board_bcm963xx.c b/arch/mips/bcm63xx/boards/board_bcm963xx.c index 9cc8fbf218a5..c5617b889b1c 100644 --- a/arch/mips/bcm63xx/boards/board_bcm963xx.c +++ b/arch/mips/bcm63xx/boards/board_bcm963xx.c @@ -764,7 +764,7 @@ void __init board_prom_init(void) snprintf(cfe_version, 12, "%s", (char *) &cfe[4]); } } else { - strcpy(cfe_version, "unknown"); + strscpy(cfe_version, "unknown"); } pr_info("CFE version: %s\n", cfe_version); diff --git a/arch/mips/bcm63xx/gpio.c b/arch/mips/bcm63xx/gpio.c index 5c4a233db55f..e7a53cd0dec5 100644 --- a/arch/mips/bcm63xx/gpio.c +++ b/arch/mips/bcm63xx/gpio.c @@ -35,8 +35,7 @@ static void bcm63xx_gpio_out_low_reg_init(void) static DEFINE_SPINLOCK(bcm63xx_gpio_lock); static u32 gpio_out_low, gpio_out_high; -static void bcm63xx_gpio_set(struct gpio_chip *chip, - unsigned gpio, int val) +static int bcm63xx_gpio_set(struct gpio_chip *chip, unsigned int gpio, int val) { u32 reg; u32 mask; @@ -62,6 +61,8 @@ static void bcm63xx_gpio_set(struct gpio_chip *chip, *v &= ~mask; bcm_gpio_writel(*v, reg); spin_unlock_irqrestore(&bcm63xx_gpio_lock, flags); + + return 0; } static int bcm63xx_gpio_get(struct gpio_chip *chip, unsigned gpio) @@ -130,7 +131,7 @@ static struct gpio_chip bcm63xx_gpio_chip = { .direction_input = bcm63xx_gpio_direction_input, .direction_output = bcm63xx_gpio_direction_output, .get = bcm63xx_gpio_get, - .set = bcm63xx_gpio_set, + .set_rv = bcm63xx_gpio_set, .base = 0, }; diff --git a/arch/mips/boot/compressed/uart-16550.c b/arch/mips/boot/compressed/uart-16550.c index db618e72a0c4..529e77a6487c 100644 --- a/arch/mips/boot/compressed/uart-16550.c +++ b/arch/mips/boot/compressed/uart-16550.c @@ -20,6 +20,11 @@ #define PORT(offset) (CKSEG1ADDR(INGENIC_UART_BASE_ADDR) + (4 * offset)) #endif +#ifdef CONFIG_ECONET +#define EN75_UART_BASE 0x1fbf0003 +#define PORT(offset) (CKSEG1ADDR(EN75_UART_BASE) + (4 * (offset))) +#endif + #ifndef IOTYPE #define IOTYPE char #endif diff --git a/arch/mips/boot/dts/Makefile 
b/arch/mips/boot/dts/Makefile index ff468439a8c4..7375c6ced82b 100644 --- a/arch/mips/boot/dts/Makefile +++ b/arch/mips/boot/dts/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 subdir-$(CONFIG_BMIPS_GENERIC) += brcm subdir-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon +subdir-$(CONFIG_ECONET) += econet subdir-$(CONFIG_EYEQ) += mobileye subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += img subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img diff --git a/arch/mips/boot/dts/econet/Makefile b/arch/mips/boot/dts/econet/Makefile new file mode 100644 index 000000000000..b467d5624e39 --- /dev/null +++ b/arch/mips/boot/dts/econet/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +dtb-$(CONFIG_DTB_ECONET_SMARTFIBER_XP8421_B) += en751221_smartfiber_xp8421-b.dtb diff --git a/arch/mips/boot/dts/econet/en751221.dtsi b/arch/mips/boot/dts/econet/en751221.dtsi new file mode 100644 index 000000000000..66197e73d4f0 --- /dev/null +++ b/arch/mips/boot/dts/econet/en751221.dtsi @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/dts-v1/; + +/ { + compatible = "econet,en751221"; + #address-cells = <1>; + #size-cells = <1>; + + hpt_clock: clock { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <200000000>; /* 200 MHz */ + }; + + cpus: cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "mips,mips24KEc"; + reg = <0>; + }; + }; + + cpuintc: interrupt-controller { + compatible = "mti,cpu-interrupt-controller"; + interrupt-controller; + #address-cells = <0>; + #interrupt-cells = <1>; + }; + + intc: interrupt-controller@1fb40000 { + compatible = "econet,en751221-intc"; + reg = <0x1fb40000 0x100>; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + + interrupt-controller; + #interrupt-cells = <1>; + econet,shadow-interrupts = <7 2>, <8 3>, <13 12>, <30 29>; + }; + + uart: serial@1fbf0000 { + compatible = "ns16550"; + reg = <0x1fbf0000 0x30>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&intc>; + interrupts = <0>; + /* + * Conversion of baud rate to clock frequency requires a + * computation that is not in the ns16550 driver, so this + * uart is fixed at 115200 baud. 
+ */ + clock-frequency = <1843200>; + }; + + timer_hpt: timer@1fbf0400 { + compatible = "econet,en751221-timer"; + reg = <0x1fbf0400 0x100>; + + interrupt-parent = <&intc>; + interrupts = <30>; + clocks = <&hpt_clock>; + }; +}; diff --git a/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts b/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts new file mode 100644 index 000000000000..8223c5bce67f --- /dev/null +++ b/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/dts-v1/; + +#include "en751221.dtsi" + +/ { + model = "SmartFiber XP8421-B"; + compatible = "smartfiber,xp8421-b", "econet,en751221"; + + memory@0 { + device_type = "memory"; + reg = <0x00000000 0x1c000000>; + }; + + chosen { + stdout-path = "/serial@1fbf0000:115200"; + linux,usable-memory-range = <0x00020000 0x1bfe0000>; + }; +}; diff --git a/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts b/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts index c7ea4f1c0bb2..6c277ab83d4b 100644 --- a/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts +++ b/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts @@ -29,6 +29,7 @@ compatible = "loongson,pch-msi-1.0"; reg = <0 0x2ff00000 0 0x8>; interrupt-controller; + #interrupt-cells = <1>; msi-controller; loongson,msi-base-vec = <64>; loongson,msi-num-vecs = <64>; diff --git a/arch/mips/boot/dts/pic32/pic32mzda.dtsi b/arch/mips/boot/dts/pic32/pic32mzda.dtsi index fdc721b414a8..feca35ba56a4 100644 --- a/arch/mips/boot/dts/pic32/pic32mzda.dtsi +++ b/arch/mips/boot/dts/pic32/pic32mzda.dtsi @@ -225,7 +225,7 @@ gpio-ranges = <&pic32_pinctrl 0 144 16>; }; - sdhci: sdhci@1f8ec000 { + sdhci: mmc@1f8ec000 { compatible = "microchip,pic32mzda-sdhci"; reg = <0x1f8ec000 0x100>; interrupts = <191 IRQ_TYPE_LEVEL_HIGH>; diff --git a/arch/mips/boot/dts/realtek/rtl930x.dtsi b/arch/mips/boot/dts/realtek/rtl930x.dtsi index f2e57ea3a60c..101bab72a95f 100644 --- a/arch/mips/boot/dts/realtek/rtl930x.dtsi +++ b/arch/mips/boot/dts/realtek/rtl930x.dtsi @@ -69,6 +69,39 @@ #size-cells = <0>; status = "disabled"; }; + + mdio_controller: mdio-controller@ca00 { + compatible = "realtek,rtl9301-mdio"; + reg = <0xca00 0x200>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + + mdio0: mdio-bus@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + mdio1: mdio-bus@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + mdio2: mdio-bus@2 { + reg = <2>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + mdio3: mdio-bus@3 { + reg = <3>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + }; }; soc: soc@18000000 { diff --git a/arch/mips/econet/Kconfig b/arch/mips/econet/Kconfig new file mode 100644 index 000000000000..fd69884cc9a8 --- /dev/null +++ b/arch/mips/econet/Kconfig @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0 +if ECONET + +choice + prompt "EcoNet SoC selection" + default SOC_ECONET_EN751221 + help + Select EcoNet MIPS SoC type. Individual SoCs within a family are + very similar, so is it enough to select the right family, and + then customize to the specific SoC using the device tree only. + + config SOC_ECONET_EN751221 + bool "EN751221 family" + select COMMON_CLK + select ECONET_EN751221_INTC + select IRQ_MIPS_CPU + select SMP + select SMP_UP + select SYS_SUPPORTS_SMP + help + The EN751221 family includes EN7512, RN7513, EN7521, EN7526. 
+ They are based on single core MIPS 34Kc processors. To boot + this kernel, you will need a device tree such as + MIPS_RAW_APPENDED_DTB=y, and a root filesystem. +endchoice + +choice + prompt "Devicetree selection" + default DTB_ECONET_NONE + help + Select the devicetree. + + config DTB_ECONET_NONE + bool "None" + + config DTB_ECONET_SMARTFIBER_XP8421_B + bool "EN751221 SmartFiber XP8421-B" + depends on SOC_ECONET_EN751221 + select BUILTIN_DTB + help + The SmartFiber XP8421-B is a device based on the EN751221 SoC. + It has 512MB of memory and 256MB of NAND flash. This kernel + needs only an appended initramfs to boot. It can be loaded + through XMODEM and booted from memory in the bootloader, or + it can be packed in tclinux.trx format and written to flash. +endchoice + +endif diff --git a/arch/mips/econet/Makefile b/arch/mips/econet/Makefile new file mode 100644 index 000000000000..7e4529e7d3d7 --- /dev/null +++ b/arch/mips/econet/Makefile @@ -0,0 +1,2 @@ + +obj-y := init.o diff --git a/arch/mips/econet/Platform b/arch/mips/econet/Platform new file mode 100644 index 000000000000..ea5616447bcd --- /dev/null +++ b/arch/mips/econet/Platform @@ -0,0 +1,5 @@ +# To address a 7.2MB kernel size limit in the EcoNet SDK bootloader, +# we put the load address well above where the bootloader loads and then use +# zboot. So please set CONFIG_ZBOOT_LOAD_ADDRESS to the address where your +# bootloader actually places the kernel. +load-$(CONFIG_ECONET) += 0xffffffff81000000 diff --git a/arch/mips/econet/init.c b/arch/mips/econet/init.c new file mode 100644 index 000000000000..6f43ffb209cb --- /dev/null +++ b/arch/mips/econet/init.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * EcoNet setup code + * + * Copyright (C) 2025 Caleb James DeLisle <cjd@cjdns.fr> + */ + +#include <linux/init.h> +#include <linux/of_clk.h> +#include <linux/irqchip.h> + +#include <asm/addrspace.h> +#include <asm/io.h> +#include <asm/bootinfo.h> +#include <asm/time.h> +#include <asm/prom.h> +#include <asm/smp-ops.h> +#include <asm/reboot.h> + +#define CR_AHB_RSTCR ((void __iomem *)CKSEG1ADDR(0x1fb00040)) +#define RESET BIT(31) + +#define UART_BASE CKSEG1ADDR(0x1fbf0003) +#define UART_REG_SHIFT 2 + +static void hw_reset(char *command) +{ + iowrite32(RESET, CR_AHB_RSTCR); +} + +/* 1. Bring up early printk. */ +void __init prom_init(void) +{ + setup_8250_early_printk_port(UART_BASE, UART_REG_SHIFT, 0); + _machine_restart = hw_reset; +} + +/* 2. Parse the DT and find memory */ +void __init plat_mem_setup(void) +{ + void *dtb; + + set_io_port_base(KSEG1); + + dtb = get_fdt(); + if (!dtb) + panic("no dtb found"); + + __dt_setup_arch(dtb); + + early_init_dt_scan_memory(); +} + +/* 3. Overload __weak device_tree_init(), add SMP_UP ops */ +void __init device_tree_init(void) +{ + unflatten_and_copy_device_tree(); + + register_up_smp_ops(); +} + +const char *get_system_type(void) +{ + return "EcoNet-EN75xx"; +} + +/* 4. Initialize the IRQ subsystem */ +void __init arch_init_irq(void) +{ + irqchip_init(); +} + +/* 5. 
Timers */ +void __init plat_time_init(void) +{ + of_clk_init(NULL); + timer_probe(); +} diff --git a/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h b/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h index a0d4b752899e..5dbc9b13d15b 100644 --- a/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h +++ b/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h @@ -12,12 +12,32 @@ #ifndef _CS5536_PCI_H #define _CS5536_PCI_H +#include <linux/init.h> #include <linux/types.h> #include <linux/pci_regs.h> extern void cs5536_pci_conf_write4(int function, int reg, u32 value); extern u32 cs5536_pci_conf_read4(int function, int reg); +extern void pci_ehci_write_reg(int reg, u32 value); +extern u32 pci_ehci_read_reg(int reg); + +extern void pci_ide_write_reg(int reg, u32 value); +extern u32 pci_ide_read_reg(int reg); + +extern void pci_acc_write_reg(int reg, u32 value); +extern u32 pci_acc_read_reg(int reg); + +extern void pci_ohci_write_reg(int reg, u32 value); +extern u32 pci_ohci_read_reg(int reg); + +extern void pci_isa_write_bar(int n, u32 value); +extern u32 pci_isa_read_bar(int n); +extern void pci_isa_write_reg(int reg, u32 value); +extern u32 pci_isa_read_reg(int reg); + +extern int __init init_mfgpt_clocksource(void); + #define CS5536_ACC_INTR 9 #define CS5536_IDE_INTR 14 #define CS5536_USB_INTR 11 diff --git a/arch/mips/include/asm/mach-loongson2ef/loongson.h b/arch/mips/include/asm/mach-loongson2ef/loongson.h index ca039b8dcde3..4a098fb10232 100644 --- a/arch/mips/include/asm/mach-loongson2ef/loongson.h +++ b/arch/mips/include/asm/mach-loongson2ef/loongson.h @@ -18,6 +18,9 @@ extern void bonito_irq_init(void); extern void mach_prepare_reboot(void); extern void mach_prepare_shutdown(void); +/* machine-specific PROM functions */ +extern void __init mach_prom_init_machtype(void); + /* environment arguments from bootloader */ extern u32 cpu_clock_freq; extern u32 memsize, highmemsize; @@ -45,6 +48,12 @@ extern void __init mach_init_irq(void); extern void mach_irq_dispatch(unsigned int pending); extern int mach_i8259_irq(void); +/* power management functions */ +extern void setup_wakeup_events(void); +extern int wakeup_loongson(void); +extern void __weak mach_suspend(void); +extern void __weak mach_resume(void); + /* We need this in some places... 
*/ #define delay() ({ \ int x; \ diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h index 0673d2d0f2e6..5158c802eb65 100644 --- a/arch/mips/include/asm/topology.h +++ b/arch/mips/include/asm/topology.h @@ -16,6 +16,9 @@ #define topology_core_id(cpu) (cpu_core(&cpu_data[cpu])) #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) #define topology_sibling_cpumask(cpu) (&cpu_sibling_map[cpu]) + +extern struct cpumask __cpu_primary_thread_mask; +#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) #endif #endif /* __ASM_TOPOLOGY_H */ diff --git a/arch/mips/kernel/gpio_txx9.c b/arch/mips/kernel/gpio_txx9.c index 8c083612df9d..027fb57d0d79 100644 --- a/arch/mips/kernel/gpio_txx9.c +++ b/arch/mips/kernel/gpio_txx9.c @@ -32,14 +32,16 @@ static void txx9_gpio_set_raw(unsigned int offset, int value) __raw_writel(val, &txx9_pioptr->dout); } -static void txx9_gpio_set(struct gpio_chip *chip, unsigned int offset, - int value) +static int txx9_gpio_set(struct gpio_chip *chip, unsigned int offset, + int value) { unsigned long flags; spin_lock_irqsave(&txx9_gpio_lock, flags); txx9_gpio_set_raw(offset, value); mmiowb(); spin_unlock_irqrestore(&txx9_gpio_lock, flags); + + return 0; } static int txx9_gpio_dir_in(struct gpio_chip *chip, unsigned int offset) @@ -68,7 +70,7 @@ static int txx9_gpio_dir_out(struct gpio_chip *chip, unsigned int offset, static struct gpio_chip txx9_gpio_chip = { .get = txx9_gpio_get, - .set = txx9_gpio_set, + .set_rv = txx9_gpio_set, .direction_input = txx9_gpio_dir_in, .direction_output = txx9_gpio_dir_out, .label = "TXx9", diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index f7107479c7fa..b890d64d352c 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -922,11 +922,13 @@ static const struct pt_regs_offset regoffset_table[] = { */ int regs_query_register_offset(const char *name) { - const struct pt_regs_offset *roff; - for (roff = regoffset_table; roff->name != NULL; roff++) - if (!strcmp(roff->name, name)) - return roff->offset; - return -EINVAL; + const struct pt_regs_offset *roff; + + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + + return -EINVAL; } #if defined(CONFIG_32BIT) || defined(CONFIG_MIPS32_O32) @@ -937,7 +939,7 @@ static const struct user_regset mips_regsets[] = { .n = ELF_NGREG, .size = sizeof(unsigned int), .align = sizeof(unsigned int), - .regset_get = gpr32_get, + .regset_get = gpr32_get, .set = gpr32_set, }, [REGSET_DSP] = { @@ -945,7 +947,7 @@ static const struct user_regset mips_regsets[] = { .n = NUM_DSP_REGS + 1, .size = sizeof(u32), .align = sizeof(u32), - .regset_get = dsp32_get, + .regset_get = dsp32_get, .set = dsp32_set, .active = dsp_active, }, @@ -955,7 +957,7 @@ static const struct user_regset mips_regsets[] = { .n = ELF_NFPREG, .size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t), - .regset_get = fpr_get, + .regset_get = fpr_get, .set = fpr_set, }, [REGSET_FP_MODE] = { @@ -963,7 +965,7 @@ static const struct user_regset mips_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .regset_get = fp_mode_get, + .regset_get = fp_mode_get, .set = fp_mode_set, }, #endif @@ -973,7 +975,7 @@ static const struct user_regset mips_regsets[] = { .n = NUM_FPU_REGS + 1, .size = 16, .align = 16, - .regset_get = msa_get, + .regset_get = msa_get, .set = msa_set, }, #endif @@ -997,7 +999,7 @@ static const struct user_regset mips64_regsets[] = { .n = ELF_NGREG, .size = 
sizeof(unsigned long), .align = sizeof(unsigned long), - .regset_get = gpr64_get, + .regset_get = gpr64_get, .set = gpr64_set, }, [REGSET_DSP] = { @@ -1005,7 +1007,7 @@ static const struct user_regset mips64_regsets[] = { .n = NUM_DSP_REGS + 1, .size = sizeof(u64), .align = sizeof(u64), - .regset_get = dsp64_get, + .regset_get = dsp64_get, .set = dsp64_set, .active = dsp_active, }, @@ -1015,7 +1017,7 @@ static const struct user_regset mips64_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .regset_get = fp_mode_get, + .regset_get = fp_mode_get, .set = fp_mode_set, }, [REGSET_FPR] = { @@ -1023,7 +1025,7 @@ static const struct user_regset mips64_regsets[] = { .n = ELF_NFPREG, .size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t), - .regset_get = fpr_get, + .regset_get = fpr_get, .set = fpr_set, }, #endif @@ -1033,7 +1035,7 @@ static const struct user_regset mips64_regsets[] = { .n = NUM_FPU_REGS + 1, .size = 16, .align = 16, - .regset_get = msa_get, + .regset_get = msa_get, .set = msa_set, }, #endif @@ -1351,7 +1353,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs) */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - /* + /* * We may come here right after calling schedule_user() * or do_notify_resume(), in which case we can be in RCU * user mode. diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index cc26d56f3ab6..7b0e69af4097 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -236,6 +236,7 @@ static void __init cps_smp_setup(void) /* Use the number of VPEs in cluster 0 core 0 for smp_num_siblings */ if (!cl && !c) smp_num_siblings = core_vpes; + cpumask_set_cpu(nvpes, &__cpu_primary_thread_mask); for (v = 0; v < min_t(int, core_vpes, NR_CPUS - nvpes); v++) { cpu_set_cluster(&cpu_data[nvpes + v], cl); @@ -368,6 +369,7 @@ static void __init cps_prepare_cpus(unsigned int max_cpus) cl = cpu_cluster(&current_cpu_data); c = cpu_core(&current_cpu_data); cluster_bootcfg = &mips_cps_cluster_bootcfg[cl]; + cpu_smt_set_num_threads(core_vpes, core_vpes); core_bootcfg = &cluster_bootcfg->core_config[c]; bitmap_set(cluster_bootcfg->core_power, cpu_core(&current_cpu_data), 1); atomic_set(&core_bootcfg->vpe_mask, 1 << cpu_vpe_id(&current_cpu_data)); diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 39e193cad2b9..4868e79f3b30 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -56,8 +56,10 @@ EXPORT_SYMBOL(cpu_sibling_map); cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); +#ifndef CONFIG_HOTPLUG_PARALLEL static DECLARE_COMPLETION(cpu_starting); static DECLARE_COMPLETION(cpu_running); +#endif /* * A logical cpu mask containing only one VPE per core to @@ -74,6 +76,8 @@ static cpumask_t cpu_core_setup_map; cpumask_t cpu_coherent_mask; +struct cpumask __cpu_primary_thread_mask __read_mostly; + unsigned int smp_max_threads __initdata = UINT_MAX; static int __init early_nosmt(char *s) @@ -367,6 +371,9 @@ asmlinkage void start_secondary(void) * to an option instead of something based on .cputype */ +#ifdef CONFIG_HOTPLUG_PARALLEL + cpuhp_ap_sync_alive(); +#endif calibrate_delay(); cpu_data[cpu].udelay_val = loops_per_jiffy; @@ -376,8 +383,10 @@ asmlinkage void start_secondary(void) cpumask_set_cpu(cpu, &cpu_coherent_mask); notify_cpu_starting(cpu); +#ifndef CONFIG_HOTPLUG_PARALLEL /* Notify boot CPU that we're starting & ready to sync counters */ complete(&cpu_starting); +#endif synchronise_count_slave(cpu); @@ -386,11 +395,13 @@ 
calculate_cpu_foreign_map(); +#ifndef CONFIG_HOTPLUG_PARALLEL /* * Notify boot CPU that we're up & online and it can safely return * from __cpu_up */ complete(&cpu_running); +#endif /* * irq will be enabled in ->smp_finish(), enabling it too early @@ -447,6 +458,12 @@ void __init smp_prepare_boot_cpu(void) set_cpu_online(0, true); } +#ifdef CONFIG_HOTPLUG_PARALLEL +int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) +{ + return mp_ops->boot_secondary(cpu, tidle); +} +#else int __cpu_up(unsigned int cpu, struct task_struct *tidle) { int err; @@ -466,6 +483,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) wait_for_completion(&cpu_running); return 0; } +#endif #ifdef CONFIG_PROFILING /* Not really SMP stuff ... */ diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index 737d0d4fdcd3..2b67c44adab9 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -22,6 +22,7 @@ #include <linux/vmalloc.h> #include <linux/elf.h> #include <linux/seq_file.h> +#include <linux/string.h> #include <linux/syscalls.h> #include <linux/moduleloader.h> #include <linux/interrupt.h> @@ -582,7 +583,7 @@ static int vpe_elfload(struct vpe *v) struct module mod; /* so we can re-use the relocations code */ memset(&mod, 0, sizeof(struct module)); - strcpy(mod.name, "VPE loader"); + strscpy(mod.name, "VPE loader"); hdr = (Elf_Ehdr *) v->pbuffer; len = v->plen; diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c index ea6ebfea4a67..0e47cd59b6cb 100644 --- a/arch/mips/rb532/gpio.c +++ b/arch/mips/rb532/gpio.c @@ -105,13 +105,15 @@ static int rb532_gpio_get(struct gpio_chip *chip, unsigned offset) /* * Set output GPIO level */ -static void rb532_gpio_set(struct gpio_chip *chip, - unsigned offset, int value) +static int rb532_gpio_set(struct gpio_chip *chip, unsigned int offset, + int value) { struct rb532_gpio_chip *gpch; gpch = gpiochip_get_data(chip); rb532_set_bit(value, offset, gpch->regbase + GPIOD); + + return 0; } /* @@ -162,7 +164,7 @@ static struct rb532_gpio_chip rb532_gpio_chip[] = { .direction_input = rb532_gpio_direction_input, .direction_output = rb532_gpio_direction_output, .get = rb532_gpio_get, - .set = rb532_gpio_set, + .set_rv = rb532_gpio_set, .to_irq = rb532_gpio_to_irq, .base = 0, .ngpio = 32, diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c index 1e67fecd466e..0586ca7668b4 100644 --- a/arch/mips/txx9/generic/setup.c +++ b/arch/mips/txx9/generic/setup.c @@ -603,8 +603,8 @@ static int txx9_iocled_get(struct gpio_chip *chip, unsigned int offset) return !!(data->cur_val & (1 << offset)); } -static void txx9_iocled_set(struct gpio_chip *chip, unsigned int offset, - int value) +static int txx9_iocled_set(struct gpio_chip *chip, unsigned int offset, + int value) { struct txx9_iocled_data *data = gpiochip_get_data(chip); unsigned long flags; @@ -616,6 +616,8 @@ static void txx9_iocled_set(struct gpio_chip *chip, unsigned int offset, writeb(data->cur_val, data->mmioaddr); mmiowb(); spin_unlock_irqrestore(&txx9_iocled_lock, flags); + + return 0; } static int txx9_iocled_dir_in(struct gpio_chip *chip, unsigned int offset) @@ -653,7 +655,7 @@ void __init txx9_iocled_init(unsigned long baseaddr, if (!iocled->mmioaddr) goto out_free; iocled->chip.get = txx9_iocled_get; - iocled->chip.set = txx9_iocled_set; + iocled->chip.set_rv = txx9_iocled_set; iocled->chip.direction_input = txx9_iocled_dir_in; iocled->chip.direction_output = txx9_iocled_dir_out; iocled->chip.label = "iocled"; diff --git a/arch/mips/vdso/Makefile 
b/arch/mips/vdso/Makefile index fb4c493aaffa..69d4593f64fe 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -27,6 +27,7 @@ endif # offsets. cflags-vdso := $(ccflags-vdso) \ $(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \ + $(filter -std=%,$(KBUILD_CFLAGS)) \ -O3 -g -fPIC -fno-strict-aliasing -fno-common -fno-builtin -G 0 \ -mrelax-pic-calls $(call cc-option, -mexplicit-relocs) \ -fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \ diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bbec87b79309..36061f4732b7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -70,6 +70,7 @@ config RISCV # LLD >= 14: https://github.com/llvm/llvm-project/issues/50505 select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000 select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000 + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS if 64BIT && MMU select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT @@ -99,6 +100,7 @@ config RISCV select EDAC_SUPPORT select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE + select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS select GENERIC_ARCH_TOPOLOGY select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS_BROADCAST if SMP @@ -143,6 +145,7 @@ config RISCV select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if 64BIT && MMU select HAVE_ARCH_USERFAULTFD_MINOR if 64BIT && USERFAULTFD select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS @@ -150,13 +153,15 @@ config RISCV select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) - select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS select HAVE_FUNCTION_GRAPH_FREGS - select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION + select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_EBPF_JIT if MMU select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API @@ -218,6 +223,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT @@ -236,6 +242,7 @@ config CLANG_SUPPORTS_DYNAMIC_FTRACE config GCC_SUPPORTS_DYNAMIC_FTRACE def_bool CC_IS_GCC depends on $(cc-option,-fpatchable-function-entry=8) + depends on CC_HAS_MIN_FUNCTION_ALIGNMENT || !RISCV_ISA_C config HAVE_SHADOW_CALL_STACK def_bool $(cc-option,-fsanitize=shadow-call-stack) @@ -664,12 +671,12 @@ config RISCV_ISA_V_PREEMPTIVE default y help Usually, in-kernel SIMD routines are run with preemption disabled. - Functions which envoke long running SIMD thus must yield core's + Functions which invoke long running SIMD thus must yield the core's vector unit to prevent blocking other tasks for too long. 
- This config allows kernel to run SIMD without explicitly disable - preemption. Enabling this config will result in higher memory - consumption due to the allocation of per-task's kernel Vector context. + This config allows the kernel to run SIMD without explicitly disabling + preemption. Enabling this config will result in higher memory consumption + due to the allocation of per-task's kernel Vector context. config RISCV_ISA_ZAWRS bool "Zawrs extension support for more efficient busy waiting" @@ -842,6 +849,21 @@ config RISCV_ISA_ZICBOZ If you don't know what to do here, say Y. +config RISCV_ISA_ZICBOP + bool "Zicbop extension support for cache block prefetch" + depends on MMU + depends on RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the ZICBOP + extension (Cache Block Prefetch Operations) and enable its + usage. + + The Zicbop extension can be used to prefetch cache blocks for + read/write fetch. + + If you don't know what to do here, say Y. + config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI def_bool y # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=aed44286efa8ae8717a77d94b51ac3614e2ca6dc @@ -1171,8 +1193,8 @@ config CMDLINE_FALLBACK config CMDLINE_EXTEND bool "Extend bootloader kernel arguments" help - The command-line arguments provided during boot will be - appended to the built-in command line. This is useful in + The built-in command line will be appended to the command- + line arguments provided during boot. This is useful in cases where the provided arguments are insufficient and you don't want to or cannot modify them. diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor index b096548fe0ff..e14f26368963 100644 --- a/arch/riscv/Kconfig.vendor +++ b/arch/riscv/Kconfig.vendor @@ -16,6 +16,19 @@ config RISCV_ISA_VENDOR_EXT_ANDES If you don't know what to do here, say Y. endmenu +menu "SiFive" +config RISCV_ISA_VENDOR_EXT_SIFIVE + bool "SiFive vendor extension support" + select RISCV_ISA_VENDOR_EXT + default y + help + Say N here if you want to disable all SiFive vendor extension + support. This will cause any SiFive vendor extensions that are + requested by hardware probing to be ignored. + + If you don't know what to do here, say Y. 
+endmenu + menu "T-Head" config RISCV_ISA_VENDOR_EXT_THEAD bool "T-Head vendor extension support" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 539d2aef5cab..df57654a615e 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -15,9 +15,9 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE),y) LDFLAGS_vmlinux += --no-relax KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY ifeq ($(CONFIG_RISCV_ISA_C),y) - CC_FLAGS_FTRACE := -fpatchable-function-entry=4 + CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4 else - CC_FLAGS_FTRACE := -fpatchable-function-entry=2 + CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2 endif endif diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index eea825ee58e1..fe8bd8afb418 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -18,12 +18,9 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y -CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_ARCH_MICROCHIP=y CONFIG_ARCH_SIFIVE=y @@ -182,6 +179,7 @@ CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_REGULATOR_AXP20X=y CONFIG_REGULATOR_GPIO=y CONFIG_MEDIA_SUPPORT=m +CONFIG_MEDIA_PLATFORM_SUPPORT=y CONFIG_VIDEO_CADENCE_CSI2RX=m CONFIG_DRM=m CONFIG_DRM_RADEON=m @@ -297,25 +295,7 @@ CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_CRYPTO_USER_API_HASH=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_PAGEALLOC=y -CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_DEBUG_VM=y -CONFIG_DEBUG_VM_PGFLAGS=y -CONFIG_DEBUG_MEMORY_INIT=y -CONFIG_DEBUG_PER_CPU_MAPS=y -CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_WQ_WATCHDOG=y -CONFIG_DEBUG_RT_MUTEXES=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_MUTEXES=y -CONFIG_DEBUG_RWSEMS=y -CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_PLIST=y -CONFIG_DEBUG_SG=y -# CONFIG_RCU_TRACE is not set -CONFIG_RCU_EQS_DEBUG=y -# CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index bfc8ea5f9319..a9988bf21ec8 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -12,7 +12,7 @@ long long __ashlti3(long long a, int b); #ifdef CONFIG_RISCV_ISA_V #ifdef CONFIG_MMU -asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n); +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n, bool enable_sum); #endif /* CONFIG_MMU */ void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1, diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index e1d9bf1deca6..b8c5726d86ac 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -14,11 +14,6 @@ #include <asm/cmpxchg.h> #include <asm/fence.h> -#define nop() __asm__ __volatile__ ("nop") -#define __nops(n) ".rept " #n "\nnop\n.endr\n" -#define nops(n) __asm__ __volatile__ (__nops(n)) - - /* These barriers need to enforce ordering on both devices or memory. 
*/ #define __mb() RISCV_FENCE(iorw, iorw) #define __rmb() RISCV_FENCE(ir, ir) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index b59ffeb668d6..6086b38d5427 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -85,6 +85,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) extern unsigned int riscv_cbom_block_size; extern unsigned int riscv_cboz_block_size; +extern unsigned int riscv_cbop_block_size; void riscv_init_cbo_blocksizes(void); #ifdef CONFIG_RISCV_DMA_NONCOHERENT diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 2ec119eb147b..0b749e710216 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -13,6 +13,7 @@ #include <asm/hwcap.h> #include <asm/insn-def.h> #include <asm/cpufeature-macros.h> +#include <asm/processor.h> #define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append, \ swap_append, r, p, n) \ @@ -37,6 +38,7 @@ \ __asm__ __volatile__ ( \ prepend \ + PREFETCHW_ASM(%5) \ "0: lr.w %0, %2\n" \ " and %1, %0, %z4\n" \ " or %1, %1, %z3\n" \ @@ -44,7 +46,7 @@ " bnez %1, 0b\n" \ sc_append \ : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ - : "rJ" (__newx), "rJ" (~__mask) \ + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \ : "memory"); \ \ r = (__typeof__(*(p)))((__retx & __mask) >> __s); \ diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index f56b409361fb..fbd0e4306c93 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -67,11 +67,11 @@ void __init riscv_user_isa_enable(void); _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate) bool __init check_unaligned_access_emulated_all_cpus(void); +void unaligned_access_init(void); +int cpu_online_unaligned_access_init(unsigned int cpu); #if defined(CONFIG_RISCV_SCALAR_MISALIGNED) -void check_unaligned_access_emulated(struct work_struct *work __always_unused); void unaligned_emulation_finish(void); bool unaligned_ctl_available(void); -DECLARE_PER_CPU(long, misaligned_access_speed); #else static inline bool unaligned_ctl_available(void) { @@ -79,6 +79,16 @@ static inline bool unaligned_ctl_available(void) } #endif +#if defined(CONFIG_RISCV_MISALIGNED) +DECLARE_PER_CPU(long, misaligned_access_speed); +bool misaligned_traps_can_delegate(void); +#else +static inline bool misaligned_traps_can_delegate(void) +{ + return false; +} +#endif + bool __init check_vector_unaligned_access_emulated_all_cpus(void); #if defined(CONFIG_RISCV_VECTOR_MISALIGNED) void check_vector_unaligned_access_emulated(struct work_struct *work __always_unused); diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index d627f63ee289..22ebea3c2b26 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -20,10 +20,9 @@ extern void *return_address(unsigned int level); #define ftrace_return_address(n) return_address(n) void _mcount(void); -static inline unsigned long ftrace_call_adjust(unsigned long addr) -{ - return addr; -} +unsigned long ftrace_call_adjust(unsigned long addr); +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip); +#define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) /* * Let's do like x86/arm64 and ignore the compat syscalls. 
@@ -57,12 +56,21 @@ struct dyn_arch_ftrace { * 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to * return address (original pc + 4) * + * The first 2 instructions for each tracable function is compiled to 2 nop + * instructions. Then, the kernel initializes the first instruction to auipc at + * boot time (<ftrace disable>). The second instruction is patched to jalr to + * start the trace. + * + *<Image>: + * 0: nop + * 4: nop + * *<ftrace enable>: - * 0: auipc t0/ra, 0x? - * 4: jalr t0/ra, ?(t0/ra) + * 0: auipc t0, 0x? + * 4: jalr t0, ?(t0) * *<ftrace disable>: - * 0: nop + * 0: auipc t0, 0x? * 4: nop * * Dynamic ftrace generates probes to call sites, so we must deal with @@ -75,10 +83,9 @@ struct dyn_arch_ftrace { #define AUIPC_OFFSET_MASK (0xfffff000) #define AUIPC_PAD (0x00001000) #define JALR_SHIFT 20 -#define JALR_RA (0x000080e7) -#define AUIPC_RA (0x00000097) #define JALR_T0 (0x000282e7) #define AUIPC_T0 (0x00000297) +#define JALR_RANGE (JALR_SIGN_MASK - 1) #define to_jalr_t0(offset) \ (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_T0) @@ -96,26 +103,14 @@ do { \ call[1] = to_jalr_t0(offset); \ } while (0) -#define to_jalr_ra(offset) \ - (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_RA) - -#define to_auipc_ra(offset) \ - ((offset & JALR_SIGN_MASK) ? \ - (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_RA) : \ - ((offset & AUIPC_OFFSET_MASK) | AUIPC_RA)) - -#define make_call_ra(caller, callee, call) \ -do { \ - unsigned int offset = \ - (unsigned long) (callee) - (unsigned long) (caller); \ - call[0] = to_auipc_ra(offset); \ - call[1] = to_jalr_ra(offset); \ -} while (0) - /* - * Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here. + * Only the jalr insn in the auipc+jalr is patched, so we make it 4 + * bytes here. 
*/ -#define MCOUNT_INSN_SIZE 8 +#define MCOUNT_INSN_SIZE 4 +#define MCOUNT_AUIPC_SIZE 4 +#define MCOUNT_JALR_SIZE 4 +#define MCOUNT_NOP4_SIZE 4 #ifndef __ASSEMBLY__ struct dyn_ftrace; @@ -135,6 +130,9 @@ struct __arch_ftrace_regs { unsigned long sp; unsigned long s0; unsigned long t1; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + unsigned long direct_tramp; +#endif union { unsigned long args[8]; struct { @@ -146,6 +144,13 @@ struct __arch_ftrace_regs { unsigned long a5; unsigned long a6; unsigned long a7; +#ifdef CONFIG_CC_IS_CLANG + unsigned long t2; + unsigned long t3; + unsigned long t4; + unsigned long t5; + unsigned long t6; +#endif }; }; }; @@ -221,10 +226,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #define ftrace_graph_func ftrace_graph_func +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { arch_ftrace_regs(fregs)->t1 = addr; } +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index e3cbf203cdde..affd63e11b0a 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -105,6 +105,7 @@ #define RISCV_ISA_EXT_ZVFBFWMA 96 #define RISCV_ISA_EXT_ZAAMO 97 #define RISCV_ISA_EXT_ZALRSC 98 +#define RISCV_ISA_EXT_ZICBOP 99 #define RISCV_ISA_EXT_XLINUXENVCFG 127 diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 1f690fea0e03..7fe0a379474a 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include <uapi/asm/hwprobe.h> -#define RISCV_HWPROBE_MAX_KEY 12 +#define RISCV_HWPROBE_MAX_KEY 13 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { @@ -22,6 +22,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key) case RISCV_HWPROBE_KEY_IMA_EXT_0: case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: + case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: return true; } diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h index e0b319af3681..8927a6ea1127 100644 --- a/arch/riscv/include/asm/image.h +++ b/arch/riscv/include/asm/image.h @@ -30,6 +30,8 @@ RISCV_HEADER_VERSION_MINOR) #ifndef __ASSEMBLY__ +#define riscv_image_flag_field(flags, field)\ + (((flags) >> field##_SHIFT) & field##_MASK) /** * struct riscv_image_header - riscv kernel image header * @code0: Executable code diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index 71060a2f838e..d5adbaec1d01 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -18,6 +18,13 @@ #define INSN_I_RD_SHIFT 7 #define INSN_I_OPCODE_SHIFT 0 +#define INSN_S_SIMM7_SHIFT 25 +#define INSN_S_RS2_SHIFT 20 +#define INSN_S_RS1_SHIFT 15 +#define INSN_S_FUNC3_SHIFT 12 +#define INSN_S_SIMM5_SHIFT 7 +#define INSN_S_OPCODE_SHIFT 0 + #ifdef __ASSEMBLY__ #ifdef CONFIG_AS_HAS_INSN @@ -30,6 +37,10 @@ .insn i \opcode, \func3, \rd, \rs1, \simm12 .endm + .macro insn_s, opcode, func3, rs2, simm12, rs1 + .insn s \opcode, \func3, \rs2, \simm12(\rs1) + .endm + #else #include <asm/gpr-num.h> @@ -51,10 +62,20 @@ (\simm12 << INSN_I_SIMM12_SHIFT)) .endm + .macro insn_s, opcode, func3, rs2, simm12, rs1 + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \ + (\func3 << INSN_S_FUNC3_SHIFT) | \ + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \ + 
(.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \ + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \ + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT)) + .endm + #endif #define __INSN_R(...) insn_r __VA_ARGS__ #define __INSN_I(...) insn_i __VA_ARGS__ +#define __INSN_S(...) insn_s __VA_ARGS__ #else /* ! __ASSEMBLY__ */ @@ -66,6 +87,9 @@ #define __INSN_I(opcode, func3, rd, rs1, simm12) \ ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \ + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n" + #else #include <linux/stringify.h> @@ -92,12 +116,26 @@ " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \ " .endm\n" +#define DEFINE_INSN_S \ + __DEFINE_ASM_GPR_NUMS \ +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \ +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \ +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \ +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \ +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \ +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \ +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \ +" .endm\n" + #define UNDEFINE_INSN_R \ " .purgem insn_r\n" #define UNDEFINE_INSN_I \ " .purgem insn_i\n" +#define UNDEFINE_INSN_S \ +" .purgem insn_s\n" + #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \ DEFINE_INSN_R \ "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \ @@ -108,6 +146,11 @@ "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \ UNDEFINE_INSN_I +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \ + DEFINE_INSN_S \ + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \ + UNDEFINE_INSN_S + #endif #endif /* ! __ASSEMBLY__ */ @@ -120,6 +163,10 @@ __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \ RV_##rs1, RV_##simm12) +#define INSN_S(opcode, func3, rs2, simm12, rs1) \ + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \ + RV_##simm12, RV_##rs1) + #define RV_OPCODE(v) __ASM_STR(v) #define RV_FUNC3(v) __ASM_STR(v) #define RV_FUNC7(v) __ASM_STR(v) @@ -133,6 +180,7 @@ #define RV___RS2(v) __RV_REG(v) #define RV_OPCODE_MISC_MEM RV_OPCODE(15) +#define RV_OPCODE_OP_IMM RV_OPCODE(19) #define RV_OPCODE_SYSTEM RV_OPCODE(115) #define HFENCE_VVMA(vaddr, asid) \ @@ -196,6 +244,18 @@ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ RS1(base), SIMM12(4)) +#define PREFETCH_I(base, offset) \ + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \ + SIMM12((offset) & 0xfe0), RS1(base)) + +#define PREFETCH_R(base, offset) \ + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \ + SIMM12((offset) & 0xfe0), RS1(base)) + +#define PREFETCH_W(base, offset) \ + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \ + SIMM12((offset) & 0xfe0), RS1(base)) + #define RISCV_PAUSE ".4byte 0x100000f" #define ZAWRS_WRS_NTO ".4byte 0x00d00073" #define ZAWRS_WRS_STO ".4byte 0x01d00073" @@ -203,4 +263,10 @@ #define RISCV_INSN_NOP4 _AC(0x00000013, U) +#ifndef __ASSEMBLY__ +#define nop() __asm__ __volatile__ ("nop") +#define __nops(n) ".rept " #n "\nnop\n.endr\n" +#define nops(n) __asm__ __volatile__ (__nops(n)) +#endif + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h index 2b56769cb530..b9ee8346cc8c 100644 --- a/arch/riscv/include/asm/kexec.h +++ b/arch/riscv/include/asm/kexec.h @@ -56,6 +56,7 @@ extern riscv_kexec_method riscv_kexec_norelocate; #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops elf_kexec_ops; +extern const struct kexec_file_ops image_kexec_ops; struct 
purgatory_info; int arch_kexec_apply_relocations_add(struct purgatory_info *pi, @@ -67,6 +68,11 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, struct kimage; int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup + +int load_extra_segments(struct kimage *image, unsigned long kernel_start, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); #endif #endif diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 188fadc1c21f..7de05db7d3bd 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -184,7 +184,7 @@ static inline int pud_none(pud_t pud) static inline int pud_bad(pud_t pud) { - return !pud_present(pud); + return !pud_present(pud) || (pud_val(pud) & _PAGE_LEAF); } #define pud_leaf pud_leaf @@ -399,6 +399,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pte_devmap(pte_t pte); static inline pte_t pmd_pte(pmd_t pmd); +static inline pte_t pud_pte(pud_t pud); static inline int pmd_devmap(pmd_t pmd) { @@ -407,7 +408,7 @@ static inline int pmd_devmap(pmd_t pmd) static inline int pud_devmap(pud_t pud) { - return 0; + return pte_devmap(pud_pte(pud)); } static inline int pgd_devmap(pgd_t pgd) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index f19240fd018e..a11816bbf9e7 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -900,6 +900,103 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define pmdp_collapse_flush pmdp_collapse_flush extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); + +static inline pud_t pud_wrprotect(pud_t pud) +{ + return pte_pud(pte_wrprotect(pud_pte(pud))); +} + +static inline int pud_trans_huge(pud_t pud) +{ + return pud_leaf(pud); +} + +static inline int pud_dirty(pud_t pud) +{ + return pte_dirty(pud_pte(pud)); +} + +static inline pud_t pud_mkyoung(pud_t pud) +{ + return pte_pud(pte_mkyoung(pud_pte(pud))); +} + +static inline pud_t pud_mkold(pud_t pud) +{ + return pte_pud(pte_mkold(pud_pte(pud))); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + return pte_pud(pte_mkdirty(pud_pte(pud))); +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + return pte_pud(pte_mkclean(pud_pte(pud))); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + return pte_pud(pte_mkwrite_novma(pud_pte(pud))); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud; +} + +static inline pud_t pud_mkdevmap(pud_t pud) +{ + return pte_pud(pte_mkdevmap(pud_pte(pud))); +} + +static inline int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty) +{ + return ptep_set_access_flags(vma, address, (pte_t *)pudp, pud_pte(entry), dirty); +} + +static inline int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return ptep_test_and_clear_young(vma, address, (pte_t *)pudp); +} + +static inline int pud_young(pud_t pud) +{ + return pte_young(pud_pte(pud)); +} + +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + pte_t *ptep = (pte_t *)pudp; + + update_mmu_cache(vma, address, ptep); +} + +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + 
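+	/*
+	 * Record the new entry for page_table_check, then atomically
+	 * install it and hand back the previous PUD.
+	 */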
page_table_check_pud_set(vma->vm_mm, pudp, pud); + return __pud(atomic_long_xchg((atomic_long_t *)pudp, pud_val(pud))); +} + +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return __pud(pud_val(pud) & ~(_PAGE_PRESENT | _PAGE_PROT_NONE)); +} + +extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + return pte_pud(pte_modify(pud_pte(pud), newprot)); +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 5f56eb9d114a..24d3af4d3807 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -13,6 +13,9 @@ #include <vdso/processor.h> #include <asm/ptrace.h> +#include <asm/insn-def.h> +#include <asm/alternative-macros.h> +#include <asm/hwcap.h> #define arch_get_mmap_end(addr, len, flags) \ ({ \ @@ -52,7 +55,6 @@ #endif #ifndef __ASSEMBLY__ -#include <linux/cpumask.h> struct task_struct; struct pt_regs; @@ -79,6 +81,10 @@ struct pt_regs; * Thus, the task does not own preempt_v. Any use of Vector will have to * save preempt_v, if dirty, and fallback to non-preemptible kernel-mode * Vector. + * - bit 29: The thread voluntarily calls schedule() while holding an active + * preempt_v. All preempt_v context should be dropped in such case because + * V-regs are caller-saved. Only sstatus.VS=ON is persisted across a + * schedule() call. * - bit 30: The in-kernel preempt_v context is saved, and requries to be * restored when returning to the context that owns the preempt_v. * - bit 31: The in-kernel preempt_v context is dirty, as signaled by the @@ -93,6 +99,7 @@ struct pt_regs; #define RISCV_PREEMPT_V 0x00000100 #define RISCV_PREEMPT_V_DIRTY 0x80000000 #define RISCV_PREEMPT_V_NEED_RESTORE 0x40000000 +#define RISCV_PREEMPT_V_IN_SCHEDULE 0x20000000 /* CPU-specific state of a task */ struct thread_struct { @@ -103,6 +110,7 @@ struct thread_struct { struct __riscv_d_ext_state fstate; unsigned long bad_cause; unsigned long envcfg; + unsigned long sum; u32 riscv_v_flags; u32 vstate_ctrl; struct __riscv_v_ext_state vstate; @@ -136,6 +144,27 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp) +#define PREFETCH_ASM(x) \ + ALTERNATIVE(__nops(1), PREFETCH_R(x, 0), 0, \ + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP) + +#define PREFETCHW_ASM(x) \ + ALTERNATIVE(__nops(1), PREFETCH_W(x, 0), 0, \ + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP) + +#ifdef CONFIG_RISCV_ISA_ZICBOP +#define ARCH_HAS_PREFETCH +static inline void prefetch(const void *x) +{ + __asm__ __volatile__(PREFETCH_ASM(%0) : : "r" (x) : "memory"); +} + +#define ARCH_HAS_PREFETCHW +static inline void prefetchw(const void *x) +{ + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory"); +} +#endif /* CONFIG_RISCV_ISA_ZICBOP */ /* Do necessary setup to start up a newly executed thread. 
*/ extern void start_thread(struct pt_regs *regs, diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 2910231977cb..a7dc0e330757 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -175,7 +175,7 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, return 0; } -static inline int regs_irqs_disabled(struct pt_regs *regs) +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { return !(regs->status & SR_PIE); } diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 3d250824178b..341e74238aa0 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -35,6 +35,7 @@ enum sbi_ext_id { SBI_EXT_DBCN = 0x4442434E, SBI_EXT_STA = 0x535441, SBI_EXT_NACL = 0x4E41434C, + SBI_EXT_FWFT = 0x46574654, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -402,6 +403,33 @@ enum sbi_ext_nacl_feature { #define SBI_NACL_SHMEM_SRET_X(__i) ((__riscv_xlen / 8) * (__i)) #define SBI_NACL_SHMEM_SRET_X_LAST 31 +/* SBI function IDs for FW feature extension */ +#define SBI_EXT_FWFT_SET 0x0 +#define SBI_EXT_FWFT_GET 0x1 + +enum sbi_fwft_feature_t { + SBI_FWFT_MISALIGNED_EXC_DELEG = 0x0, + SBI_FWFT_LANDING_PAD = 0x1, + SBI_FWFT_SHADOW_STACK = 0x2, + SBI_FWFT_DOUBLE_TRAP = 0x3, + SBI_FWFT_PTE_AD_HW_UPDATING = 0x4, + SBI_FWFT_POINTER_MASKING_PMLEN = 0x5, + SBI_FWFT_LOCAL_RESERVED_START = 0x6, + SBI_FWFT_LOCAL_RESERVED_END = 0x3fffffff, + SBI_FWFT_LOCAL_PLATFORM_START = 0x40000000, + SBI_FWFT_LOCAL_PLATFORM_END = 0x7fffffff, + + SBI_FWFT_GLOBAL_RESERVED_START = 0x80000000, + SBI_FWFT_GLOBAL_RESERVED_END = 0xbfffffff, + SBI_FWFT_GLOBAL_PLATFORM_START = 0xc0000000, + SBI_FWFT_GLOBAL_PLATFORM_END = 0xffffffff, +}; + +#define SBI_FWFT_PLATFORM_FEATURE_BIT BIT(30) +#define SBI_FWFT_GLOBAL_FEATURE_BIT BIT(31) + +#define SBI_FWFT_SET_FLAG_LOCK BIT(0) + /* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 @@ -419,6 +447,11 @@ enum sbi_ext_nacl_feature { #define SBI_ERR_ALREADY_STARTED -7 #define SBI_ERR_ALREADY_STOPPED -8 #define SBI_ERR_NO_SHMEM -9 +#define SBI_ERR_INVALID_STATE -10 +#define SBI_ERR_BAD_RANGE -11 +#define SBI_ERR_TIMEOUT -12 +#define SBI_ERR_IO -13 +#define SBI_ERR_DENIED_LOCKED -14 extern unsigned long sbi_spec_version; struct sbiret { @@ -470,6 +503,23 @@ int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long asid); long sbi_probe_extension(int ext); +int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags); +int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature, + unsigned long value, unsigned long flags); +/** + * sbi_fwft_set_online_cpus() - Set a feature on all online cpus + * @feature: The feature to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. 
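+ *
+ * This is a thin wrapper around sbi_fwft_set_cpumask() using cpu_online_mask.
+ * Illustrative use only (assumes the SBI implementation supports the feature):
+ *	sbi_fwft_set_online_cpus(SBI_FWFT_PTE_AD_HW_UPDATING, 1, 0);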
+ */ +static inline int sbi_fwft_set_online_cpus(u32 feature, unsigned long value, + unsigned long flags) +{ + return sbi_fwft_set_cpumask(cpu_online_mask, feature, value, flags); +} + /* Check if current SBI specification version is 0.1 or not */ static inline int sbi_spec_is_0_1(void) { @@ -503,11 +553,21 @@ static inline int sbi_err_map_linux_errno(int err) case SBI_SUCCESS: return 0; case SBI_ERR_DENIED: + case SBI_ERR_DENIED_LOCKED: return -EPERM; case SBI_ERR_INVALID_PARAM: + case SBI_ERR_INVALID_STATE: return -EINVAL; + case SBI_ERR_BAD_RANGE: + return -ERANGE; case SBI_ERR_INVALID_ADDRESS: return -EFAULT; + case SBI_ERR_NO_SHMEM: + return -ENOMEM; + case SBI_ERR_TIMEOUT: + return -ETIMEDOUT; + case SBI_ERR_IO: + return -EIO; case SBI_ERR_NOT_SUPPORTED: case SBI_ERR_FAILURE: default: diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index ce0dd0fed764..1a20dd746a49 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -56,6 +56,8 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end); #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); #endif bool arch_tlbbatch_should_defer(struct mm_struct *mm); diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index fee56b0c8058..d472da4450e6 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -62,6 +62,19 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigne __asm__ __volatile__ ("csrc sstatus, %0" : : "r" (SR_SUM) : "memory") /* + * This is the smallest unsigned integer type that can fit a value + * (up to 'long long') + */ +#define __inttype(x) __typeof__( \ + __typefits(x, char, \ + __typefits(x, short, \ + __typefits(x, int, \ + __typefits(x, long, 0ULL))))) + +#define __typefits(x, type, not) \ + __builtin_choose_expr(sizeof(x) <= sizeof(type), (unsigned type)0, not) + +/* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is * the address at which the program should continue. No registers are @@ -83,27 +96,58 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigne * call. 
*/ -#define __get_user_asm(insn, x, ptr, err) \ +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define __get_user_asm(insn, x, ptr, label) \ + asm_goto_output( \ + "1:\n" \ + " " insn " %0, %1\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, %l2, %0) \ + : "=&r" (x) \ + : "m" (*(ptr)) : : label) +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ +#define __get_user_asm(insn, x, ptr, label) \ do { \ - __typeof__(x) __x; \ + long __gua_err = 0; \ __asm__ __volatile__ ( \ "1:\n" \ " " insn " %1, %2\n" \ "2:\n" \ _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 2b, %0, %1) \ - : "+r" (err), "=&r" (__x) \ + : "+r" (__gua_err), "=&r" (x) \ : "m" (*(ptr))); \ - (x) = __x; \ + if (__gua_err) \ + goto label; \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ #ifdef CONFIG_64BIT -#define __get_user_8(x, ptr, err) \ - __get_user_asm("ld", x, ptr, err) +#define __get_user_8(x, ptr, label) \ + __get_user_asm("ld", x, ptr, label) #else /* !CONFIG_64BIT */ -#define __get_user_8(x, ptr, err) \ + +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define __get_user_8(x, ptr, label) \ + u32 __user *__ptr = (u32 __user *)(ptr); \ + u32 __lo, __hi; \ + asm_goto_output( \ + "1:\n" \ + " lw %0, %2\n" \ + "2:\n" \ + " lw %1, %3\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, %l4, %0) \ + _ASM_EXTABLE_UACCESS_ERR(2b, %l4, %0) \ + : "=&r" (__lo), "=r" (__hi) \ + : "m" (__ptr[__LSW]), "m" (__ptr[__MSW]) \ + : : label); \ + (x) = (__typeof__(x))((__typeof__((x) - (x)))( \ + (((u64)__hi << 32) | __lo))); \ + +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ +#define __get_user_8(x, ptr, label) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u32 __lo, __hi; \ + long __gu8_err = 0; \ __asm__ __volatile__ ( \ "1:\n" \ " lw %1, %3\n" \ @@ -112,35 +156,62 @@ do { \ "3:\n" \ _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 3b, %0, %1) \ _ASM_EXTABLE_UACCESS_ERR_ZERO(2b, 3b, %0, %1) \ - : "+r" (err), "=&r" (__lo), "=r" (__hi) \ + : "+r" (__gu8_err), "=&r" (__lo), "=r" (__hi) \ : "m" (__ptr[__LSW]), "m" (__ptr[__MSW])); \ - if (err) \ + if (__gu8_err) { \ __hi = 0; \ - (x) = (__typeof__(x))((__typeof__((x)-(x)))( \ + goto label; \ + } \ + (x) = (__typeof__(x))((__typeof__((x) - (x)))( \ (((u64)__hi << 32) | __lo))); \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + #endif /* CONFIG_64BIT */ -#define __get_user_nocheck(x, __gu_ptr, __gu_err) \ +unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to, + const void *from, unsigned long n); +unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to, + const void __user *from, unsigned long n); + +#define __get_user_nocheck(x, __gu_ptr, label) \ do { \ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ + !IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) { \ + if (__asm_copy_from_user_sum_enabled(&(x), __gu_ptr, sizeof(*__gu_ptr))) \ + goto label; \ + break; \ + } \ switch (sizeof(*__gu_ptr)) { \ case 1: \ - __get_user_asm("lb", (x), __gu_ptr, __gu_err); \ + __get_user_asm("lb", (x), __gu_ptr, label); \ break; \ case 2: \ - __get_user_asm("lh", (x), __gu_ptr, __gu_err); \ + __get_user_asm("lh", (x), __gu_ptr, label); \ break; \ case 4: \ - __get_user_asm("lw", (x), __gu_ptr, __gu_err); \ + __get_user_asm("lw", (x), __gu_ptr, label); \ break; \ case 8: \ - __get_user_8((x), __gu_ptr, __gu_err); \ + __get_user_8((x), __gu_ptr, label); \ break; \ default: \ BUILD_BUG(); \ } \ } while (0) +#define __get_user_error(x, ptr, err) \ +do { \ + __label__ __gu_failed; \ + \ + __get_user_nocheck(x, ptr, __gu_failed); \ + err = 0; \ + break; \ +__gu_failed: \ + x = 0; \ + err = -EFAULT; \ +} while (0) + /** * 
__get_user: - Get a simple variable from user space, with less checking. * @x: Variable to store result. @@ -165,13 +236,16 @@ do { \ ({ \ const __typeof__(*(ptr)) __user *__gu_ptr = untagged_addr(ptr); \ long __gu_err = 0; \ + __typeof__(x) __gu_val; \ \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __get_user_nocheck(x, __gu_ptr, __gu_err); \ + __get_user_error(__gu_val, __gu_ptr, __gu_err); \ __disable_user_access(); \ \ + (x) = __gu_val; \ + \ __gu_err; \ }) @@ -201,61 +275,73 @@ do { \ ((x) = (__force __typeof__(x))0, -EFAULT); \ }) -#define __put_user_asm(insn, x, ptr, err) \ +#define __put_user_asm(insn, x, ptr, label) \ do { \ __typeof__(*(ptr)) __x = x; \ - __asm__ __volatile__ ( \ + asm goto( \ "1:\n" \ - " " insn " %z2, %1\n" \ - "2:\n" \ - _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %0) \ - : "+r" (err), "=m" (*(ptr)) \ - : "rJ" (__x)); \ + " " insn " %z0, %1\n" \ + _ASM_EXTABLE(1b, %l2) \ + : : "rJ" (__x), "m"(*(ptr)) : : label); \ } while (0) #ifdef CONFIG_64BIT -#define __put_user_8(x, ptr, err) \ - __put_user_asm("sd", x, ptr, err) +#define __put_user_8(x, ptr, label) \ + __put_user_asm("sd", x, ptr, label) #else /* !CONFIG_64BIT */ -#define __put_user_8(x, ptr, err) \ +#define __put_user_8(x, ptr, label) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u64 __x = (__typeof__((x)-(x)))(x); \ - __asm__ __volatile__ ( \ + asm goto( \ "1:\n" \ - " sw %z3, %1\n" \ + " sw %z0, %2\n" \ "2:\n" \ - " sw %z4, %2\n" \ - "3:\n" \ - _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %0) \ - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %0) \ - : "+r" (err), \ - "=m" (__ptr[__LSW]), \ - "=m" (__ptr[__MSW]) \ - : "rJ" (__x), "rJ" (__x >> 32)); \ + " sw %z1, %3\n" \ + _ASM_EXTABLE(1b, %l4) \ + _ASM_EXTABLE(2b, %l4) \ + : : "rJ" (__x), "rJ" (__x >> 32), \ + "m" (__ptr[__LSW]), \ + "m" (__ptr[__MSW]) : : label); \ } while (0) #endif /* CONFIG_64BIT */ -#define __put_user_nocheck(x, __gu_ptr, __pu_err) \ +#define __put_user_nocheck(x, __gu_ptr, label) \ do { \ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ + !IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) { \ + __inttype(x) val = (__inttype(x))x; \ + if (__asm_copy_to_user_sum_enabled(__gu_ptr, &(val), sizeof(*__gu_ptr))) \ + goto label; \ + break; \ + } \ switch (sizeof(*__gu_ptr)) { \ case 1: \ - __put_user_asm("sb", (x), __gu_ptr, __pu_err); \ + __put_user_asm("sb", (x), __gu_ptr, label); \ break; \ case 2: \ - __put_user_asm("sh", (x), __gu_ptr, __pu_err); \ + __put_user_asm("sh", (x), __gu_ptr, label); \ break; \ case 4: \ - __put_user_asm("sw", (x), __gu_ptr, __pu_err); \ + __put_user_asm("sw", (x), __gu_ptr, label); \ break; \ case 8: \ - __put_user_8((x), __gu_ptr, __pu_err); \ + __put_user_8((x), __gu_ptr, label); \ break; \ default: \ BUILD_BUG(); \ } \ } while (0) +#define __put_user_error(x, ptr, err) \ +do { \ + __label__ err_label; \ + __put_user_nocheck(x, ptr, err_label); \ + break; \ +err_label: \ + (err) = -EFAULT; \ +} while (0) + /** * __put_user: - Write a simple value into user space, with less checking. * @x: Value to copy to user space. 
@@ -286,7 +372,7 @@ do { \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __put_user_nocheck(__val, __gu_ptr, __pu_err); \ + __put_user_error(__val, __gu_ptr, __pu_err); \ __disable_user_access(); \ \ __pu_err; \ @@ -351,23 +437,45 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) } #define __get_kernel_nofault(dst, src, type, err_label) \ -do { \ - long __kr_err = 0; \ - \ - __get_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err); \ - if (unlikely(__kr_err)) \ - goto err_label; \ -} while (0) + __get_user_nocheck(*((type *)(dst)), (type *)(src), err_label) #define __put_kernel_nofault(dst, src, type, err_label) \ -do { \ - long __kr_err = 0; \ - \ - __put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err); \ - if (unlikely(__kr_err)) \ - goto err_label; \ + __put_user_nocheck(*((type *)(src)), (type *)(dst), err_label) + +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) +{ + if (unlikely(!access_ok(ptr, len))) + return 0; + __enable_user_access(); + return 1; +} +#define user_access_begin user_access_begin +#define user_access_end __disable_user_access + +static inline unsigned long user_access_save(void) { return 0UL; } +static inline void user_access_restore(unsigned long enabled) { } + +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_put_user(x, ptr, label) \ + __put_user_nocheck(x, (ptr), label) + +#define unsafe_get_user(x, ptr, label) do { \ + __inttype(*(ptr)) __gu_val; \ + __get_user_nocheck(__gu_val, (ptr), label); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) +#define unsafe_copy_to_user(_dst, _src, _len, label) \ + if (__asm_copy_to_user_sum_enabled(_dst, _src, _len)) \ + goto label; + +#define unsafe_copy_from_user(_dst, _src, _len, label) \ + if (__asm_copy_from_user_sum_enabled(_dst, _src, _len)) \ + goto label; + #else /* CONFIG_MMU */ #include <asm-generic/uaccess.h> #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..8dc92441702a --- /dev/null +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. 
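+ *
+ * getrandom_syscall() below provides the syscall fallback used by the
+ * generic vDSO getrandom() implementation.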
+ */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include <asm/unistd.h> + +static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags) +{ + register long ret asm("a0"); + register long nr asm("a7") = __NR_getrandom; + register void *buffer asm("a0") = _buffer; + register size_t len asm("a1") = _len; + register unsigned int flags asm("a2") = _flags; + + asm volatile ("ecall\n" + : "+r" (ret) + : "r" (nr), "r" (buffer), "r" (len), "r" (flags) + : "memory"); + + return ret; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index e8a83f55be2b..45c9b426fcc5 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -120,6 +120,11 @@ static __always_inline void riscv_v_disable(void) csr_clear(CSR_SSTATUS, SR_VS); } +static __always_inline bool riscv_v_is_on(void) +{ + return !!(csr_read(CSR_SSTATUS) & SR_VS); +} + static __always_inline void __vstate_csr_save(struct __riscv_v_ext_state *dest) { asm volatile ( @@ -366,6 +371,11 @@ static inline void __switch_to_vector(struct task_struct *prev, struct pt_regs *regs; if (riscv_preempt_v_started(prev)) { + if (riscv_v_is_on()) { + WARN_ON(prev->thread.riscv_v_flags & RISCV_V_CTX_DEPTH_MASK); + riscv_v_disable(); + prev->thread.riscv_v_flags |= RISCV_PREEMPT_V_IN_SCHEDULE; + } if (riscv_preempt_v_dirty(prev)) { __riscv_v_vstate_save(&prev->thread.kernel_vstate, prev->thread.kernel_vstate.datap); @@ -376,10 +386,16 @@ static inline void __switch_to_vector(struct task_struct *prev, riscv_v_vstate_save(&prev->thread.vstate, regs); } - if (riscv_preempt_v_started(next)) - riscv_preempt_v_set_restore(next); - else + if (riscv_preempt_v_started(next)) { + if (next->thread.riscv_v_flags & RISCV_PREEMPT_V_IN_SCHEDULE) { + next->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_IN_SCHEDULE; + riscv_v_enable(); + } else { + riscv_preempt_v_set_restore(next); + } + } else { riscv_v_vstate_set_restore(next, task_pt_regs(next)); + } } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h new file mode 100644 index 000000000000..ac00e500361c --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/sifive.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_H + +#include <asm/vendor_extensions.h> + +#include <linux/types.h> + +#define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD 0 +#define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ 1 +#define RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF 2 +#define RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ 3 + +extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive; + +#endif diff --git a/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h b/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h new file mode 100644 index 000000000000..90a61abd033c --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_HWPROBE_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_HWPROBE_H + +#include <linux/cpumask.h> + +#include <uapi/asm/hwprobe.h> + +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE +void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus); +#else +static inline void 
hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + pair->value = 0; +} +#endif + +#endif diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 3c2fce939673..aaf6ad970499 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -81,6 +81,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_EXT_ZICBOM (1ULL << 55) #define RISCV_HWPROBE_EXT_ZAAMO (1ULL << 56) #define RISCV_HWPROBE_EXT_ZALRSC (1ULL << 57) +#define RISCV_HWPROBE_EXT_ZABHA (1ULL << 58) #define RISCV_HWPROBE_KEY_CPUPERF_0 5 #define RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) #define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) @@ -104,6 +105,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED 4 #define RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0 11 #define RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE 12 +#define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h new file mode 100644 index 000000000000..9f3278a4b298 --- /dev/null +++ b/arch/riscv/include/uapi/asm/vendor/sifive.h @@ -0,0 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD (1 << 0) +#define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ (1 << 1) +#define RISCV_HWPROBE_VENDOR_EXT_XSFVFNRCLIPXFQF (1 << 2) +#define RISCV_HWPROBE_VENDOR_EXT_XSFVFWMACCQQQ (1 << 3) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f7480c9c6f8d..7ce2307738c2 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -107,7 +107,7 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o -obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o +obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o kexec_image.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 16490755304e..6e8c0d6feae9 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -34,6 +34,7 @@ void asm_offsets(void) OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]); OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]); OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]); + OFFSET(TASK_THREAD_SUM, task_struct, thread.sum); OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu); OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count); @@ -346,6 +347,10 @@ void asm_offsets(void) offsetof(struct task_struct, thread.s[11]) - offsetof(struct task_struct, thread.ra) ); + DEFINE(TASK_THREAD_SUM_RA, + offsetof(struct task_struct, thread.sum) + - offsetof(struct task_struct, thread.ra) + ); DEFINE(TASK_THREAD_F0_F0, offsetof(struct task_struct, thread.fstate.f[0]) @@ -493,6 +498,12 @@ void asm_offsets(void) DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN)); OFFSET(STACKFRAME_FP, stackframe, fp); OFFSET(STACKFRAME_RA, stackframe, ra); +#ifdef CONFIG_FUNCTION_TRACER + DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +#endif #ifdef 
CONFIG_DYNAMIC_FTRACE_WITH_ARGS DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct __arch_ftrace_regs), STACK_ALIGN)); @@ -501,6 +512,13 @@ void asm_offsets(void) DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); DEFINE(FREGS_S0, offsetof(struct __arch_ftrace_regs, s0)); DEFINE(FREGS_T1, offsetof(struct __arch_ftrace_regs, t1)); +#ifdef CONFIG_CC_IS_CLANG + DEFINE(FREGS_T2, offsetof(struct __arch_ftrace_regs, t2)); + DEFINE(FREGS_T3, offsetof(struct __arch_ftrace_regs, t3)); + DEFINE(FREGS_T4, offsetof(struct __arch_ftrace_regs, t4)); + DEFINE(FREGS_T5, offsetof(struct __arch_ftrace_regs, t5)); + DEFINE(FREGS_T6, offsetof(struct __arch_ftrace_regs, t6)); +#endif DEFINE(FREGS_A0, offsetof(struct __arch_ftrace_regs, a0)); DEFINE(FREGS_A1, offsetof(struct __arch_ftrace_regs, a1)); DEFINE(FREGS_A2, offsetof(struct __arch_ftrace_regs, a2)); diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 2054f6c4b0ae..743d53415572 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -32,6 +32,7 @@ #define NUM_ALPHA_EXTS ('z' - 'a' + 1) static bool any_cpu_has_zicboz; +static bool any_cpu_has_zicbop; static bool any_cpu_has_zicbom; unsigned long elf_hwcap __read_mostly; @@ -119,6 +120,21 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data, return 0; } +static int riscv_ext_zicbop_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!riscv_cbop_block_size) { + pr_err("Zicbop detected in ISA string, disabling as no cbop-block-size found\n"); + return -EINVAL; + } + if (!is_power_of_2(riscv_cbop_block_size)) { + pr_err("Zicbop disabled as cbop-block-size present, but is not a power-of-2\n"); + return -EINVAL; + } + any_cpu_has_zicbop = true; + return 0; +} + static int riscv_ext_f_validate(const struct riscv_isa_ext_data *data, const unsigned long *isa_bitmap) { @@ -442,6 +458,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_SUPERSET_VALIDATE(v, RISCV_ISA_EXT_v, riscv_v_exts, riscv_ext_vector_float_validate), __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h), __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicbom, RISCV_ISA_EXT_ZICBOM, riscv_xlinuxenvcfg_exts, riscv_ext_zicbom_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zicbop, RISCV_ISA_EXT_ZICBOP, riscv_ext_zicbop_validate), __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts, riscv_ext_zicboz_validate), __RISCV_ISA_EXT_DATA(ziccrse, RISCV_ISA_EXT_ZICCRSE), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), @@ -1112,6 +1129,10 @@ void __init riscv_user_isa_enable(void) current->thread.envcfg |= ENVCFG_CBCFE; else if (any_cpu_has_zicbom) pr_warn("Zicbom disabled as it is unavailable on some harts\n"); + + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOP) && + any_cpu_has_zicbop) + pr_warn("Zicbop disabled as it is unavailable on some harts\n"); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c deleted file mode 100644 index e783a72d051f..000000000000 --- a/arch/riscv/kernel/elf_kexec.c +++ /dev/null @@ -1,485 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Load ELF vmlinux file for the kexec_file_load syscall. - * - * Copyright (C) 2021 Huawei Technologies Co, Ltd. - * - * Author: Liao Chang (liaochang1@huawei.com) - * - * Based on kexec-tools' kexec-elf-riscv.c, heavily modified - * for kernel. 
- */ - -#define pr_fmt(fmt) "kexec_image: " fmt - -#include <linux/elf.h> -#include <linux/kexec.h> -#include <linux/slab.h> -#include <linux/of.h> -#include <linux/libfdt.h> -#include <linux/types.h> -#include <linux/memblock.h> -#include <linux/vmalloc.h> -#include <asm/setup.h> - -int arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - kvfree(image->arch.fdt); - image->arch.fdt = NULL; - - vfree(image->elf_headers); - image->elf_headers = NULL; - image->elf_headers_sz = 0; - - return kexec_image_post_load_cleanup_default(image); -} - -static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, - struct kexec_elf_info *elf_info, unsigned long old_pbase, - unsigned long new_pbase) -{ - int i; - int ret = 0; - size_t size; - struct kexec_buf kbuf; - const struct elf_phdr *phdr; - - kbuf.image = image; - - for (i = 0; i < ehdr->e_phnum; i++) { - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - size = phdr->p_filesz; - if (size > phdr->p_memsz) - size = phdr->p_memsz; - - kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; - kbuf.bufsz = size; - kbuf.buf_align = phdr->p_align; - kbuf.mem = phdr->p_paddr - old_pbase + new_pbase; - kbuf.memsz = phdr->p_memsz; - kbuf.top_down = false; - ret = kexec_add_buffer(&kbuf); - if (ret) - break; - } - - return ret; -} - -/* - * Go through the available phsyical memory regions and find one that hold - * an image of the specified size. - */ -static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, - struct elfhdr *ehdr, struct kexec_elf_info *elf_info, - unsigned long *old_pbase, unsigned long *new_pbase) -{ - int i; - int ret; - struct kexec_buf kbuf; - const struct elf_phdr *phdr; - unsigned long lowest_paddr = ULONG_MAX; - unsigned long lowest_vaddr = ULONG_MAX; - - for (i = 0; i < ehdr->e_phnum; i++) { - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - if (lowest_paddr > phdr->p_paddr) - lowest_paddr = phdr->p_paddr; - - if (lowest_vaddr > phdr->p_vaddr) - lowest_vaddr = phdr->p_vaddr; - } - - kbuf.image = image; - kbuf.buf_min = lowest_paddr; - kbuf.buf_max = ULONG_MAX; - - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 - * - */ - kbuf.buf_align = PMD_SIZE; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); - kbuf.top_down = false; - ret = arch_kexec_locate_mem_hole(&kbuf); - if (!ret) { - *old_pbase = lowest_paddr; - *new_pbase = kbuf.mem; - image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem; - } - return ret; -} - -#ifdef CONFIG_CRASH_DUMP -static int get_nr_ram_ranges_callback(struct resource *res, void *arg) -{ - unsigned int *nr_ranges = arg; - - (*nr_ranges)++; - return 0; -} - -static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) -{ - struct crash_mem *cmem = arg; - - cmem->ranges[cmem->nr_ranges].start = res->start; - cmem->ranges[cmem->nr_ranges].end = res->end; - cmem->nr_ranges++; - - return 0; -} - -static int prepare_elf_headers(void **addr, unsigned long *sz) -{ - struct crash_mem *cmem; - unsigned int nr_ranges; - int ret; - - nr_ranges = 1; /* For exclusion of crashkernel region */ - walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); - - cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); - if (!cmem) - return -ENOMEM; - - cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; - ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); - if (ret) - goto out; - - /* Exclude 
crashkernel region */ - ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); - if (!ret) - ret = crash_prepare_elf64_headers(cmem, true, addr, sz); - -out: - kfree(cmem); - return ret; -} - -static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, - unsigned long cmdline_len) -{ - int elfcorehdr_strlen; - char *cmdline_ptr; - - cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL); - if (!cmdline_ptr) - return NULL; - - elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", - image->elf_load_addr); - - if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) { - pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n"); - kfree(cmdline_ptr); - return NULL; - } - - memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len); - /* Ensure it's nul terminated */ - cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; - return cmdline_ptr; -} -#endif - -static void *elf_kexec_load(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len) -{ - int ret; - void *fdt; - unsigned long old_kernel_pbase = ULONG_MAX; - unsigned long new_kernel_pbase = 0UL; - unsigned long initrd_pbase = 0UL; - unsigned long kernel_start; - struct elfhdr ehdr; - struct kexec_buf kbuf; - struct kexec_elf_info elf_info; - char *modified_cmdline = NULL; - - ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info); - if (ret) - return ERR_PTR(ret); - - ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info, - &old_kernel_pbase, &new_kernel_pbase); - if (ret) - goto out; - kernel_start = image->start; - - /* Add the kernel binary to the image */ - ret = riscv_kexec_elf_load(image, &ehdr, &elf_info, - old_kernel_pbase, new_kernel_pbase); - if (ret) - goto out; - - kbuf.image = image; - kbuf.buf_min = new_kernel_pbase + kernel_len; - kbuf.buf_max = ULONG_MAX; - -#ifdef CONFIG_CRASH_DUMP - /* Add elfcorehdr */ - if (image->type == KEXEC_TYPE_CRASH) { - void *headers; - unsigned long headers_sz; - ret = prepare_elf_headers(&headers, &headers_sz); - if (ret) { - pr_err("Preparing elf core header failed\n"); - goto out; - } - - kbuf.buffer = headers; - kbuf.bufsz = headers_sz; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.memsz = headers_sz; - kbuf.buf_align = ELF_CORE_HEADER_ALIGN; - kbuf.top_down = true; - - ret = kexec_add_buffer(&kbuf); - if (ret) { - vfree(headers); - goto out; - } - image->elf_headers = headers; - image->elf_load_addr = kbuf.mem; - image->elf_headers_sz = headers_sz; - - kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.memsz); - - /* Setup cmdline for kdump kernel case */ - modified_cmdline = setup_kdump_cmdline(image, cmdline, - cmdline_len); - if (!modified_cmdline) { - pr_err("Setting up cmdline for kdump kernel failed\n"); - ret = -EINVAL; - goto out; - } - cmdline = modified_cmdline; - } -#endif - -#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY - /* Add purgatory to the image */ - kbuf.top_down = true; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_load_purgatory(image, &kbuf); - if (ret) { - pr_err("Error loading purgatory ret=%d\n", ret); - goto out; - } - kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem); - - ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry", - &kernel_start, - sizeof(kernel_start), 0); - if (ret) - pr_err("Error update purgatory ret=%d\n", ret); -#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */ - - /* Add the initrd to the image */ - if (initrd != NULL) { - 
kbuf.buffer = initrd; - kbuf.bufsz = kbuf.memsz = initrd_len; - kbuf.buf_align = PAGE_SIZE; - kbuf.top_down = true; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - initrd_pbase = kbuf.mem; - kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase); - } - - /* Add the DTB to the image */ - fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase, - initrd_len, cmdline, 0); - if (!fdt) { - pr_err("Error setting up the new device tree.\n"); - ret = -EINVAL; - goto out; - } - - fdt_pack(fdt); - kbuf.buffer = fdt; - kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt); - kbuf.buf_align = PAGE_SIZE; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.top_down = true; - ret = kexec_add_buffer(&kbuf); - if (ret) { - pr_err("Error add DTB kbuf ret=%d\n", ret); - goto out_free_fdt; - } - /* Cache the fdt buffer address for memory cleanup */ - image->arch.fdt = fdt; - kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem); - goto out; - -out_free_fdt: - kvfree(fdt); -out: - kfree(modified_cmdline); - kexec_free_elf_info(&elf_info); - return ret ? ERR_PTR(ret) : NULL; -} - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RISCV_IMM_BITS 12 -#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) -#define RISCV_CONST_HIGH_PART(x) \ - (((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1)) -#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x)) - -#define ENCODE_ITYPE_IMM(x) \ - (RV_X(x, 0, 12) << 20) -#define ENCODE_BTYPE_IMM(x) \ - ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \ - (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31)) -#define ENCODE_UTYPE_IMM(x) \ - (RV_X(x, 12, 20) << 12) -#define ENCODE_JTYPE_IMM(x) \ - ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \ - (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31)) -#define ENCODE_CBTYPE_IMM(x) \ - ((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \ - (RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12)) -#define ENCODE_CJTYPE_IMM(x) \ - ((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \ - (RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \ - (RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12)) -#define ENCODE_UJTYPE_IMM(x) \ - (ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \ - (ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32)) -#define ENCODE_UITYPE_IMM(x) \ - (ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32)) - -#define CLEAN_IMM(type, x) \ - ((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x)) - -int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab) -{ - const char *strtab, *name, *shstrtab; - const Elf_Shdr *sechdrs; - Elf64_Rela *relas; - int i, r_type; - - /* String & section header string table */ - sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; - shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; - - relas = (void *)pi->ehdr + relsec->sh_offset; - - for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { - const Elf_Sym *sym; /* symbol to relocate */ - unsigned long addr; /* final location after relocation */ - unsigned long val; /* relocated symbol value */ - unsigned long sec_base; /* relocated symbol value */ - void *loc; /* tmp location to modify */ - - sym = (void *)pi->ehdr + symtab->sh_offset; - sym += ELF64_R_SYM(relas[i].r_info); - - if (sym->st_name) - name = strtab + sym->st_name; - else - name = shstrtab + sechdrs[sym->st_shndx].sh_name; - - loc = pi->purgatory_buf; - 
loc += section->sh_offset; - loc += relas[i].r_offset; - - if (sym->st_shndx == SHN_ABS) - sec_base = 0; - else if (sym->st_shndx >= pi->ehdr->e_shnum) { - pr_err("Invalid section %d for symbol %s\n", - sym->st_shndx, name); - return -ENOEXEC; - } else - sec_base = pi->sechdrs[sym->st_shndx].sh_addr; - - val = sym->st_value; - val += sec_base; - val += relas[i].r_addend; - - addr = section->sh_addr + relas[i].r_offset; - - r_type = ELF64_R_TYPE(relas[i].r_info); - - switch (r_type) { - case R_RISCV_BRANCH: - *(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) | - ENCODE_BTYPE_IMM(val - addr); - break; - case R_RISCV_JAL: - *(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) | - ENCODE_JTYPE_IMM(val - addr); - break; - /* - * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I - * sym is expected to be next to R_RISCV_PCREL_HI20 - * in purgatory relsec. Handle it like R_RISCV_CALL - * sym, instead of searching the whole relsec. - */ - case R_RISCV_PCREL_HI20: - case R_RISCV_CALL_PLT: - case R_RISCV_CALL: - *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) | - ENCODE_UJTYPE_IMM(val - addr); - break; - case R_RISCV_RVC_BRANCH: - *(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) | - ENCODE_CBTYPE_IMM(val - addr); - break; - case R_RISCV_RVC_JUMP: - *(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) | - ENCODE_CJTYPE_IMM(val - addr); - break; - case R_RISCV_ADD16: - *(u16 *)loc += val; - break; - case R_RISCV_SUB16: - *(u16 *)loc -= val; - break; - case R_RISCV_ADD32: - *(u32 *)loc += val; - break; - case R_RISCV_SUB32: - *(u32 *)loc -= val; - break; - /* It has been applied by R_RISCV_PCREL_HI20 sym */ - case R_RISCV_PCREL_LO12_I: - case R_RISCV_ALIGN: - case R_RISCV_RELAX: - break; - case R_RISCV_64: - *(u64 *)loc = val; - break; - default: - pr_err("Unknown rela relocation: %d\n", r_type); - return -ENOEXEC; - } - } - return 0; -} - -const struct kexec_file_ops elf_kexec_ops = { - .probe = kexec_elf_probe, - .load = elf_kexec_load, -}; diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 0fb338000c6d..75656afa2d6b 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -401,9 +401,18 @@ SYM_FUNC_START(__switch_to) REG_S s9, TASK_THREAD_S9_RA(a3) REG_S s10, TASK_THREAD_S10_RA(a3) REG_S s11, TASK_THREAD_S11_RA(a3) + + /* save the user space access flag */ + csrr s0, CSR_STATUS + REG_S s0, TASK_THREAD_SUM_RA(a3) + /* Save the kernel shadow call stack pointer */ scs_save_current /* Restore context from next->thread */ + REG_L s0, TASK_THREAD_SUM_RA(a4) + li s1, SR_SUM + and s0, s0, s1 + csrs CSR_STATUS, s0 REG_L ra, TASK_THREAD_RA_RA(a4) REG_L sp, TASK_THREAD_SP_RA(a4) REG_L s0, TASK_THREAD_S0_RA(a4) diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 674dcdfae7a1..4c6c24380cfd 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -8,98 +8,129 @@ #include <linux/ftrace.h> #include <linux/uaccess.h> #include <linux/memory.h> +#include <linux/irqflags.h> #include <linux/stop_machine.h> #include <asm/cacheflush.h> #include <asm/text-patching.h> #ifdef CONFIG_DYNAMIC_FTRACE -void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) +unsigned long ftrace_call_adjust(unsigned long addr) { - mutex_lock(&text_mutex); + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return addr + 8 + MCOUNT_AUIPC_SIZE; - /* - * The code sequences we use for ftrace can't be patched while the - * kernel is running, so we need to use stop_machine() to modify them - * for now. This doesn't play nice with text_mutex, we use this flag - * to elide the check. 
- */ - riscv_patch_in_stop_machine = true; + return addr + MCOUNT_AUIPC_SIZE; +} + +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + return fentry_ip - MCOUNT_AUIPC_SIZE; } -void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) +void arch_ftrace_update_code(int command) { - riscv_patch_in_stop_machine = false; + mutex_lock(&text_mutex); + command |= FTRACE_MAY_SLEEP; + ftrace_modify_all_code(command); mutex_unlock(&text_mutex); + flush_icache_all(); } -static int ftrace_check_current_call(unsigned long hook_pos, - unsigned int *expected) +static int __ftrace_modify_call(unsigned long source, unsigned long target, bool validate) { + unsigned int call[2], offset; unsigned int replaced[2]; - unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4}; - /* we expect nops at the hook position */ - if (!expected) - expected = nops; + offset = target - source; + call[1] = to_jalr_t0(offset); - /* - * Read the text we want to modify; - * return must be -EFAULT on read error - */ - if (copy_from_kernel_nofault(replaced, (void *)hook_pos, - MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* - * Make sure it is what we expect it to be; - * return must be -EINVAL on failed comparison - */ - if (memcmp(expected, replaced, sizeof(replaced))) { - pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n", - (void *)hook_pos, expected[0], expected[1], replaced[0], - replaced[1]); - return -EINVAL; + if (validate) { + call[0] = to_auipc_t0(offset); + /* + * Read the text we want to modify; + * return must be -EFAULT on read error + */ + if (copy_from_kernel_nofault(replaced, (void *)source, 2 * MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (replaced[0] != call[0]) { + pr_err("%p: expected (%08x) but got (%08x)\n", + (void *)source, call[0], replaced[0]); + return -EINVAL; + } } + /* Replace the jalr at once. Return -EPERM on write error. */ + if (patch_insn_write((void *)(source + MCOUNT_AUIPC_SIZE), call + 1, MCOUNT_JALR_SIZE)) + return -EPERM; + return 0; } -static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, - bool enable, bool ra) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec) { - unsigned int call[2]; - unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4}; + const struct ftrace_ops *ops = NULL; - if (ra) - make_call_ra(hook_pos, target, call); - else - make_call_t0(hook_pos, target, call); + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + WARN_ON_ONCE(!ops); + } - /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */ - if (patch_insn_write((void *)hook_pos, enable ? 
call : nops, MCOUNT_INSN_SIZE)) - return -EPERM; + if (!ops) + ops = &ftrace_list_ops; - return 0; + return ops; +} + +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, const struct ftrace_ops *ops) +{ + unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8); + + return patch_text_nosync((void *)literal, &ops, sizeof(ops)); +} + +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, &ftrace_nop_ops); +} + +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec)); } +#else +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; } +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; } +#endif int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned int call[2]; + unsigned long distance, orig_addr, pc = rec->ip - MCOUNT_AUIPC_SIZE; + int ret; - make_call_t0(rec->ip, addr, call); + ret = ftrace_rec_update_ops(rec); + if (ret) + return ret; - if (patch_insn_write((void *)rec->ip, call, MCOUNT_INSN_SIZE)) - return -EPERM; + orig_addr = (unsigned long)&ftrace_caller; + distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr; + if (distance > JALR_RANGE) + addr = FTRACE_ADDR; - return 0; + return __ftrace_modify_call(pc, addr, false); } -int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, - unsigned long addr) +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4}; + u32 nop4 = RISCV_INSN_NOP4; + int ret; - if (patch_insn_write((void *)rec->ip, nops, MCOUNT_INSN_SIZE)) + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; + + if (patch_insn_write((void *)rec->ip, &nop4, MCOUNT_NOP4_SIZE)) return -EPERM; return 0; @@ -114,75 +145,71 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, */ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { - int out; + unsigned long pc = rec->ip - MCOUNT_AUIPC_SIZE; + unsigned int nops[2], offset; + int ret; - mutex_lock(&text_mutex); - out = ftrace_make_nop(mod, rec, MCOUNT_ADDR); - mutex_unlock(&text_mutex); + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; - return out; -} + offset = (unsigned long) &ftrace_caller - pc; + nops[0] = to_auipc_t0(offset); + nops[1] = RISCV_INSN_NOP4; -int ftrace_update_ftrace_func(ftrace_func_t func) -{ - int ret = __ftrace_modify_call((unsigned long)&ftrace_call, - (unsigned long)func, true, true); + mutex_lock(&text_mutex); + ret = patch_insn_write((void *)pc, nops, 2 * MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); return ret; } -struct ftrace_modify_param { - int command; - atomic_t cpu_count; -}; - -static int __ftrace_modify_code(void *data) +ftrace_func_t ftrace_call_dest = ftrace_stub; +int ftrace_update_ftrace_func(ftrace_func_t func) { - struct ftrace_modify_param *param = data; - - if (atomic_inc_return(¶m->cpu_count) == num_online_cpus()) { - ftrace_modify_all_code(param->command); - /* - * Make sure the patching store is effective *before* we - * increment the counter which releases all waiting CPUs - * by using the release variant of atomic increment. The - * release pairs with the call to local_flush_icache_all() - * on the waiting CPU. 
- */ - atomic_inc_return_release(¶m->cpu_count); - } else { - while (atomic_read(¶m->cpu_count) <= num_online_cpus()) - cpu_relax(); - - local_flush_icache_all(); - } + /* + * When using CALL_OPS, the function to call is associated with the + * call site, and we don't have a global function pointer to update. + */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return 0; + WRITE_ONCE(ftrace_call_dest, func); + /* + * The data fence ensure that the update to ftrace_call_dest happens + * before the write to function_trace_op later in the generic ftrace. + * If the sequence is not enforced, then an old ftrace_call_dest may + * race loading a new function_trace_op set in ftrace_modify_all_code + */ + smp_wmb(); + /* + * Updating ftrace dpes not take stop_machine path, so irqs should not + * be disabled. + */ + WARN_ON(irqs_disabled()); + smp_call_function(ftrace_sync_ipi, NULL, 1); return 0; } -void arch_ftrace_update_code(int command) +#else /* CONFIG_DYNAMIC_FTRACE */ +unsigned long ftrace_call_adjust(unsigned long addr) { - struct ftrace_modify_param param = { command, ATOMIC_INIT(0) }; - - stop_machine(__ftrace_modify_code, ¶m, cpu_online_mask); + return addr; } -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { - unsigned int call[2]; - unsigned long caller = rec->ip; + unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE; int ret; - make_call_t0(caller, old_addr, call); - ret = ftrace_check_current_call(caller, call); - + ret = ftrace_rec_update_ops(rec); if (ret) return ret; - return __ftrace_modify_call(caller, addr, true, false); + return __ftrace_modify_call(caller, FTRACE_ADDR, true); } #endif @@ -210,7 +237,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } #ifdef CONFIG_DYNAMIC_FTRACE -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { @@ -231,19 +257,5 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, if (!function_graph_enter_regs(old, ip, frame_pointer, parent, fregs)) *parent = return_hooker; } -#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ -extern void ftrace_graph_call(void); -int ftrace_enable_ftrace_graph_caller(void) -{ - return __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, true, true); -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - return __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, false, true); -} -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c new file mode 100644 index 000000000000..f4755d49b89e --- /dev/null +++ b/arch/riscv/kernel/kexec_elf.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Load ELF vmlinux file for the kexec_file_load syscall. + * + * Copyright (C) 2021 Huawei Technologies Co, Ltd. + * + * Author: Liao Chang (liaochang1@huawei.com) + * + * Based on kexec-tools' kexec-elf-riscv.c, heavily modified + * for kernel. 
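+ *
+ * Only the kernel's ELF program segments are placed here; the remaining
+ * segments (initrd, device tree, etc.) are added via load_extra_segments().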
+ */ + +#define pr_fmt(fmt) "kexec_image: " fmt + +#include <linux/elf.h> +#include <linux/kexec.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/libfdt.h> +#include <linux/types.h> +#include <linux/memblock.h> +#include <asm/setup.h> + +static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info, unsigned long old_pbase, + unsigned long new_pbase) +{ + int i; + int ret = 0; + size_t size; + struct kexec_buf kbuf; + const struct elf_phdr *phdr; + + kbuf.image = image; + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + size = phdr->p_filesz; + if (size > phdr->p_memsz) + size = phdr->p_memsz; + + kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; + kbuf.bufsz = size; + kbuf.buf_align = phdr->p_align; + kbuf.mem = phdr->p_paddr - old_pbase + new_pbase; + kbuf.memsz = phdr->p_memsz; + kbuf.top_down = false; + ret = kexec_add_buffer(&kbuf); + if (ret) + break; + } + + return ret; +} + +/* + * Go through the available phsyical memory regions and find one that hold + * an image of the specified size. + */ +static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, + struct elfhdr *ehdr, struct kexec_elf_info *elf_info, + unsigned long *old_pbase, unsigned long *new_pbase) +{ + int i; + int ret; + struct kexec_buf kbuf; + const struct elf_phdr *phdr; + unsigned long lowest_paddr = ULONG_MAX; + unsigned long lowest_vaddr = ULONG_MAX; + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + if (lowest_paddr > phdr->p_paddr) + lowest_paddr = phdr->p_paddr; + + if (lowest_vaddr > phdr->p_vaddr) + lowest_vaddr = phdr->p_vaddr; + } + + kbuf.image = image; + kbuf.buf_min = lowest_paddr; + kbuf.buf_max = ULONG_MAX; + + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 + * + */ + kbuf.buf_align = PMD_SIZE; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.top_down = false; + ret = arch_kexec_locate_mem_hole(&kbuf); + if (!ret) { + *old_pbase = lowest_paddr; + *new_pbase = kbuf.mem; + image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem; + } + return ret; +} + +static void *elf_kexec_load(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + int ret; + unsigned long old_kernel_pbase = ULONG_MAX; + unsigned long new_kernel_pbase = 0UL; + struct elfhdr ehdr; + struct kexec_elf_info elf_info; + + ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info); + if (ret) + return ERR_PTR(ret); + + ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info, + &old_kernel_pbase, &new_kernel_pbase); + if (ret) + goto out; + + /* Add the kernel binary to the image */ + ret = riscv_kexec_elf_load(image, &ehdr, &elf_info, + old_kernel_pbase, new_kernel_pbase); + if (ret) + goto out; + + ret = load_extra_segments(image, image->start, kernel_len, + initrd, initrd_len, cmdline, cmdline_len); +out: + kexec_free_elf_info(&elf_info); + return ret ? 
ERR_PTR(ret) : NULL; +} + +const struct kexec_file_ops elf_kexec_ops = { + .probe = kexec_elf_probe, + .load = elf_kexec_load, +}; diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c new file mode 100644 index 000000000000..26a81774a78a --- /dev/null +++ b/arch/riscv/kernel/kexec_image.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V Kexec image loader + * + */ + +#define pr_fmt(fmt) "kexec_file(Image): " fmt + +#include <linux/err.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/pe.h> +#include <linux/string.h> +#include <asm/byteorder.h> +#include <asm/image.h> + +static int image_probe(const char *kernel_buf, unsigned long kernel_len) +{ + const struct riscv_image_header *h = (const struct riscv_image_header *)kernel_buf; + + if (!h || kernel_len < sizeof(*h)) + return -EINVAL; + + /* According to Documentation/riscv/boot-image-header.rst, + * use "magic2" field to check when version >= 0.2. + */ + + if (h->version >= RISCV_HEADER_VERSION && + memcmp(&h->magic2, RISCV_IMAGE_MAGIC2, sizeof(h->magic2))) + return -EINVAL; + + return 0; +} + +static void *image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + struct riscv_image_header *h; + u64 flags; + bool be_image, be_kernel; + struct kexec_buf kbuf; + int ret; + + /* Check Image header */ + h = (struct riscv_image_header *)kernel; + if (!h->image_size) { + ret = -EINVAL; + goto out; + } + + /* Check endianness */ + flags = le64_to_cpu(h->flags); + be_image = riscv_image_flag_field(flags, RISCV_IMAGE_FLAG_BE); + be_kernel = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + if (be_image != be_kernel) { + ret = -EINVAL; + goto out; + } + + /* Load the kernel image */ + kbuf.image = image; + kbuf.buf_min = 0; + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = false; + + kbuf.buffer = kernel; + kbuf.bufsz = kernel_len; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = le64_to_cpu(h->image_size); + kbuf.buf_align = le64_to_cpu(h->text_offset); + + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error add kernel image ret=%d\n", ret); + goto out; + } + + image->start = kbuf.mem; + + pr_info("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); + + ret = load_extra_segments(image, kbuf.mem, kbuf.memsz, + initrd, initrd_len, cmdline, cmdline_len); + +out: + return ret ? 
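image_probe() above accepts a flat Image only when the buffer is at least a header long and, for header versions new enough to carry it, the "RSC\x05" magic2 field matches. A hedged userspace sketch of the same check follows; the struct layout is written out as described in the boot-image-header documentation and should be treated as illustrative rather than the authoritative UAPI header, and this simplified probe always insists on magic2:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative copy of the flat Image header layout (assumed). */
    struct riscv_image_header {
        uint32_t code0;
        uint32_t code1;
        uint64_t text_offset;
        uint64_t image_size;
        uint64_t flags;
        uint32_t version;
        uint32_t res1;
        uint64_t res2;
        uint64_t magic;     /* "RISCV\0\0\0" (deprecated) */
        uint32_t magic2;    /* "RSC\x05" */
        uint32_t res3;
    };

    /* Mirrors the length + magic2 test in image_probe(), simplified. */
    static int probe_image(const void *buf, size_t len)
    {
        const struct riscv_image_header *h = buf;

        if (!buf || len < sizeof(*h))
            return -1;
        if (memcmp(&h->magic2, "RSC\x05", sizeof(h->magic2)))
            return -1;
        return 0;
    }

    int main(int argc, char **argv)
    {
        unsigned char hdr[sizeof(struct riscv_image_header)];
        FILE *f = argc > 1 ? fopen(argv[1], "rb") : NULL;

        if (!f || fread(hdr, 1, sizeof(hdr), f) != sizeof(hdr))
            return 1;
        puts(probe_image(hdr, sizeof(hdr)) ? "not a RISC-V Image" : "looks like a RISC-V Image");
        return 0;
    }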
ERR_PTR(ret) : NULL; +} + +const struct kexec_file_ops image_kexec_ops = { + .probe = image_probe, + .load = image_load, +}; diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index b0bf8c1722c0..e36104af2e24 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -7,8 +7,369 @@ * Author: Liao Chang (liaochang1@huawei.com) */ #include <linux/kexec.h> +#include <linux/elf.h> +#include <linux/slab.h> +#include <linux/of.h> +#include <linux/libfdt.h> +#include <linux/types.h> +#include <linux/memblock.h> +#include <linux/vmalloc.h> +#include <asm/setup.h> const struct kexec_file_ops * const kexec_file_loaders[] = { &elf_kexec_ops, + &image_kexec_ops, NULL }; + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + kvfree(image->arch.fdt); + image->arch.fdt = NULL; + + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} + +#ifdef CONFIG_CRASH_DUMP +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) +{ + unsigned int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) +{ + struct crash_mem *cmem = arg; + + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; + + return 0; +} + +static int prepare_elf_headers(void **addr, unsigned long *sz) +{ + struct crash_mem *cmem; + unsigned int nr_ranges; + int ret; + + nr_ranges = 1; /* For exclusion of crashkernel region */ + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + + cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + if (!cmem) + return -ENOMEM; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); + if (ret) + goto out; + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (!ret) + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); + +out: + kfree(cmem); + return ret; +} + +static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, + unsigned long cmdline_len) +{ + int elfcorehdr_strlen; + char *cmdline_ptr; + + cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL); + if (!cmdline_ptr) + return NULL; + + elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", + image->elf_load_addr); + + if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) { + pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n"); + kfree(cmdline_ptr); + return NULL; + } + + memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len); + /* Ensure it's nul terminated */ + cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; + return cmdline_ptr; +} +#endif + +#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) +#define RISCV_IMM_BITS 12 +#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) +#define RISCV_CONST_HIGH_PART(x) \ + (((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1)) +#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x)) + +#define ENCODE_ITYPE_IMM(x) \ + (RV_X(x, 0, 12) << 20) +#define ENCODE_BTYPE_IMM(x) \ + ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \ + (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31)) +#define ENCODE_UTYPE_IMM(x) \ + (RV_X(x, 12, 20) << 12) +#define ENCODE_JTYPE_IMM(x) \ + ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \ + (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 
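RISCV_CONST_HIGH_PART()/RISCV_CONST_LOW_PART() above implement the usual %hi/%lo split: because the low immediate is a signed 12-bit field, the high part rounds up whenever bit 11 of the offset is set, and the two parts always sum back to the original offset. A standalone self-check of that arithmetic (local copies of the macros, not the kernel header):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same rounding scheme as the macros above: the low part must fit a
     * signed 12-bit immediate, so the high part rounds up when bit 11
     * of the offset is set. */
    #define IMM_REACH      (1LL << 12)
    #define HIGH_PART(x)   (((x) + (IMM_REACH >> 1)) & ~(IMM_REACH - 1))
    #define LOW_PART(x)    ((x) - HIGH_PART(x))

    int main(void)
    {
        int64_t samples[] = { 0, 4, 0x7ff, 0x800, 0x801, 0x12345, -0x801, -2048 };

        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            int64_t x = samples[i], hi = HIGH_PART(x), lo = LOW_PART(x);

            assert(hi + lo == x);            /* auipc-style hi + addi-style lo */
            assert(lo >= -2048 && lo <= 2047);
            printf("x=%lld -> hi=%lld lo=%lld\n",
                   (long long)x, (long long)hi, (long long)lo);
        }
        return 0;
    }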
31)) +#define ENCODE_CBTYPE_IMM(x) \ + ((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \ + (RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12)) +#define ENCODE_CJTYPE_IMM(x) \ + ((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \ + (RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \ + (RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12)) +#define ENCODE_UJTYPE_IMM(x) \ + (ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \ + (ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32)) +#define ENCODE_UITYPE_IMM(x) \ + (ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32)) + +#define CLEAN_IMM(type, x) \ + ((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x)) + +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab) +{ + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; + Elf64_Rela *relas; + int i, r_type; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; + + relas = (void *)pi->ehdr + relsec->sh_offset; + + for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { + const Elf_Sym *sym; /* symbol to relocate */ + unsigned long addr; /* final location after relocation */ + unsigned long val; /* relocated symbol value */ + unsigned long sec_base; /* relocated symbol value */ + void *loc; /* tmp location to modify */ + + sym = (void *)pi->ehdr + symtab->sh_offset; + sym += ELF64_R_SYM(relas[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + loc = pi->purgatory_buf; + loc += section->sh_offset; + loc += relas[i].r_offset; + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= pi->ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; + + val = sym->st_value; + val += sec_base; + val += relas[i].r_addend; + + addr = section->sh_addr + relas[i].r_offset; + + r_type = ELF64_R_TYPE(relas[i].r_info); + + switch (r_type) { + case R_RISCV_BRANCH: + *(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) | + ENCODE_BTYPE_IMM(val - addr); + break; + case R_RISCV_JAL: + *(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) | + ENCODE_JTYPE_IMM(val - addr); + break; + /* + * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I + * sym is expected to be next to R_RISCV_PCREL_HI20 + * in purgatory relsec. Handle it like R_RISCV_CALL + * sym, instead of searching the whole relsec. 
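ENCODE_JTYPE_IMM() above scatters a branch offset into the JAL immediate fields, imm[20|10:1|11|19:12]. A standalone round-trip test of that packing (again using local macro copies rather than the kernel ones), which encodes a few offsets into a jal x0 instruction and decodes them back:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Local copies of the helpers above, for a standalone round-trip test. */
    #define RV_X(x, s, n)  (((x) >> (s)) & ((1 << (n)) - 1))
    #define ENCODE_JTYPE_IMM(x) \
        ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \
         (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31))

    /* Undo the J-type scatter: imm[20|10:1|11|19:12] -> signed offset. */
    static int32_t decode_jtype(uint32_t insn)
    {
        uint32_t imm = ((insn >> 21) & 0x3ff) << 1 |
                       ((insn >> 20) & 0x1) << 11 |
                       ((insn >> 12) & 0xff) << 12 |
                       ((insn >> 31) & 0x1) << 20;
        int64_t v = imm;

        if (v & (1 << 20))       /* sign-extend from bit 20 */
            v -= (1 << 21);
        return (int32_t)v;
    }

    int main(void)
    {
        int32_t offsets[] = { 2048, -2048, 0x7fffe, -0x80000, 2468 };

        for (size_t i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
            uint32_t insn = 0x6f /* jal x0 opcode */ |
                            (uint32_t)ENCODE_JTYPE_IMM((uint32_t)offsets[i]);

            assert(decode_jtype(insn) == offsets[i]);
        }
        printf("J-type immediate packing round-trips\n");
        return 0;
    }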
+ */ + case R_RISCV_PCREL_HI20: + case R_RISCV_CALL_PLT: + case R_RISCV_CALL: + *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) | + ENCODE_UJTYPE_IMM(val - addr); + break; + case R_RISCV_RVC_BRANCH: + *(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) | + ENCODE_CBTYPE_IMM(val - addr); + break; + case R_RISCV_RVC_JUMP: + *(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) | + ENCODE_CJTYPE_IMM(val - addr); + break; + case R_RISCV_ADD16: + *(u16 *)loc += val; + break; + case R_RISCV_SUB16: + *(u16 *)loc -= val; + break; + case R_RISCV_ADD32: + *(u32 *)loc += val; + break; + case R_RISCV_SUB32: + *(u32 *)loc -= val; + break; + /* It has been applied by R_RISCV_PCREL_HI20 sym */ + case R_RISCV_PCREL_LO12_I: + case R_RISCV_ALIGN: + case R_RISCV_RELAX: + break; + case R_RISCV_64: + *(u64 *)loc = val; + break; + default: + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } + } + return 0; +} + + +int load_extra_segments(struct kimage *image, unsigned long kernel_start, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + int ret; + void *fdt; + unsigned long initrd_pbase = 0UL; + struct kexec_buf kbuf; + char *modified_cmdline = NULL; + + kbuf.image = image; + kbuf.buf_min = kernel_start + kernel_len; + kbuf.buf_max = ULONG_MAX; + +#ifdef CONFIG_CRASH_DUMP + /* Add elfcorehdr */ + if (image->type == KEXEC_TYPE_CRASH) { + void *headers; + unsigned long headers_sz; + ret = prepare_elf_headers(&headers, &headers_sz); + if (ret) { + pr_err("Preparing elf core header failed\n"); + goto out; + } + + kbuf.buffer = headers; + kbuf.bufsz = headers_sz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = headers_sz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree(headers); + goto out; + } + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; + + kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + + /* Setup cmdline for kdump kernel case */ + modified_cmdline = setup_kdump_cmdline(image, cmdline, + cmdline_len); + if (!modified_cmdline) { + pr_err("Setting up cmdline for kdump kernel failed\n"); + ret = -EINVAL; + goto out; + } + cmdline = modified_cmdline; + } +#endif + +#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY + /* Add purgatory to the image */ + kbuf.top_down = true; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_load_purgatory(image, &kbuf); + if (ret) { + pr_err("Error loading purgatory ret=%d\n", ret); + goto out; + } + kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem); + + ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry", + &kernel_start, + sizeof(kernel_start), 0); + if (ret) + pr_err("Error update purgatory ret=%d\n", ret); +#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */ + + /* Add the initrd to the image */ + if (initrd != NULL) { + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.top_down = true; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out; + initrd_pbase = kbuf.mem; + kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase); + } + + /* Add the DTB to the image */ + fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase, + initrd_len, cmdline, 0); + if (!fdt) { + pr_err("Error setting up the new device tree.\n"); + ret = -EINVAL; + goto out; + } + + fdt_pack(fdt); + kbuf.buffer = fdt; + kbuf.bufsz = 
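The device-tree step above (of_kexec_alloc_and_setup_fdt() followed by fdt_pack()) builds the /chosen properties the next kernel boots with and then shrinks the blob before it is queued as one more kexec buffer. A small userspace libfdt sketch of the same kind of blob construction; the property names follow the common kexec/boot conventions and the addresses are made up for illustration:

    #include <libfdt.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t bufsz = 4096;
        void *fdt = malloc(bufsz);
        int node;

        if (!fdt || fdt_create_empty_tree(fdt, bufsz))
            return 1;

        node = fdt_add_subnode(fdt, 0, "chosen");
        if (node < 0)
            return 1;

        /* Same kind of properties the kexec FDT carries for the new kernel */
        if (fdt_setprop_string(fdt, node, "bootargs", "console=ttyS0 earlycon") ||
            fdt_setprop_u64(fdt, node, "linux,initrd-start", 0x84000000ULL) ||
            fdt_setprop_u64(fdt, node, "linux,initrd-end", 0x85000000ULL))
            return 1;

        fdt_pack(fdt);  /* shrink totalsize to what is actually used */
        printf("packed fdt is %u bytes\n", (unsigned)fdt_totalsize(fdt));
        free(fdt);
        return 0;
    }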
kbuf.memsz = fdt_totalsize(fdt); + kbuf.buf_align = PAGE_SIZE; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.top_down = true; + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error add DTB kbuf ret=%d\n", ret); + goto out_free_fdt; + } + /* Cache the fdt buffer address for memory cleanup */ + image->arch.fdt = fdt; + kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem); + goto out; + +out_free_fdt: + kvfree(fdt); +out: + kfree(modified_cmdline); + return ret; +} diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 745dd4c4a69c..48f6c4f7dca0 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -13,7 +13,6 @@ .text -#define FENTRY_RA_OFFSET 8 #define ABI_SIZE_ON_STACK 80 #define ABI_A0 0 #define ABI_A1 8 @@ -56,16 +55,13 @@ addi sp, sp, ABI_SIZE_ON_STACK .endm -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - /** * SAVE_ABI_REGS - save regs against the ftrace_regs struct * * After the stack is established, * * 0(sp) stores the PC of the traced function which can be accessed -* by &(fregs)->epc in tracing function. Note that the real -* function entry address should be computed with -FENTRY_RA_OFFSET. +* by &(fregs)->epc in tracing function. * * 8(sp) stores the function return address (i.e. parent IP) that * can be accessed by &(fregs)->ra in tracing function. @@ -86,17 +82,20 @@ * +++++++++ **/ .macro SAVE_ABI_REGS - mv t4, sp // Save original SP in T4 addi sp, sp, -FREGS_SIZE_ON_STACK - REG_S t0, FREGS_EPC(sp) REG_S x1, FREGS_RA(sp) - REG_S t4, FREGS_SP(sp) // Put original SP on stack #ifdef HAVE_FUNCTION_GRAPH_FP_TEST REG_S x8, FREGS_S0(sp) #endif REG_S x6, FREGS_T1(sp) - +#ifdef CONFIG_CC_IS_CLANG + REG_S x7, FREGS_T2(sp) + REG_S x28, FREGS_T3(sp) + REG_S x29, FREGS_T4(sp) + REG_S x30, FREGS_T5(sp) + REG_S x31, FREGS_T6(sp) +#endif // save the arguments REG_S x10, FREGS_A0(sp) REG_S x11, FREGS_A1(sp) @@ -106,16 +105,25 @@ REG_S x15, FREGS_A5(sp) REG_S x16, FREGS_A6(sp) REG_S x17, FREGS_A7(sp) + mv a0, sp + addi a0, a0, FREGS_SIZE_ON_STACK + REG_S a0, FREGS_SP(sp) // Put original SP on stack .endm - .macro RESTORE_ABI_REGS, all=0 + .macro RESTORE_ABI_REGS REG_L t0, FREGS_EPC(sp) REG_L x1, FREGS_RA(sp) #ifdef HAVE_FUNCTION_GRAPH_FP_TEST REG_L x8, FREGS_S0(sp) #endif REG_L x6, FREGS_T1(sp) - +#ifdef CONFIG_CC_IS_CLANG + REG_L x7, FREGS_T2(sp) + REG_L x28, FREGS_T3(sp) + REG_L x29, FREGS_T4(sp) + REG_L x30, FREGS_T5(sp) + REG_L x31, FREGS_T6(sp) +#endif // restore the arguments REG_L x10, FREGS_A0(sp) REG_L x11, FREGS_A1(sp) @@ -130,60 +138,71 @@ .endm .macro PREPARE_ARGS - addi a0, t0, -FENTRY_RA_OFFSET + addi a0, t0, -MCOUNT_JALR_SIZE // ip (callsite's jalr insn) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + mv a1, ra // parent_ip + REG_L a2, -16(t0) // op + REG_L ra, FTRACE_OPS_FUNC(a2) // op->func +#else la a1, function_trace_op - REG_L a2, 0(a1) - mv a1, ra - mv a3, sp + REG_L a2, 0(a1) // op + mv a1, ra // parent_ip +#endif + mv a3, sp // regs .endm -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ - -#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS SYM_FUNC_START(ftrace_caller) - SAVE_ABI - - addi a0, t0, -FENTRY_RA_OFFSET - la a1, function_trace_op - REG_L a2, 0(a1) - mv a1, ra - mv a3, sp +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + /* + * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the + * function entry, these are later overwritten with the pointer to the + * associated struct ftrace_ops. + * + * -8: &ftrace_ops of the associated tracer function. + *<ftrace enable>: + * 0: auipc t0/ra, 0x? 
+ * 4: jalr t0/ra, ?(t0/ra) + * + * -8: &ftrace_nop_ops + *<ftrace disable>: + * 0: nop + * 4: nop + * + * t0 is set to ip+8 after the jalr is executed at the callsite, + * so we find the associated op at t0-16. + */ + REG_L t1, -16(t0) // op Should be SZ_REG instead of 16 -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) - call ftrace_stub - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi a0, sp, ABI_RA - REG_L a1, ABI_T0(sp) - addi a1, a1, -FENTRY_RA_OFFSET -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a2, s0 +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * If the op has a direct call, handle it immediately without + * saving/restoring registers. + */ + REG_L t1, FTRACE_OPS_DIRECT_CALL(t1) + bnez t1, ftrace_caller_direct #endif -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) - call ftrace_stub #endif - RESTORE_ABI - jr t0 -SYM_FUNC_END(ftrace_caller) - -#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ -SYM_FUNC_START(ftrace_caller) - mv t1, zero SAVE_ABI_REGS PREPARE_ARGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + jalr ra +#else SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) - call ftrace_stub - + REG_L ra, ftrace_call_dest + jalr ra, 0(ra) +#endif RESTORE_ABI_REGS - bnez t1, .Ldirect +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bnez t1, ftrace_caller_direct +#endif jr t0 -.Ldirect: +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL) jr t1 +#endif SYM_FUNC_END(ftrace_caller) -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ - #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS SYM_CODE_START(ftrace_stub_direct_tramp) jr t0 diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index 91d0b355ceef..75551ac6504c 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/sort.h> unsigned long module_emit_got_entry(struct module *mod, unsigned long val) { @@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val) return (unsigned long)&plt[i]; } -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) { - return x->r_info == y->r_info && x->r_addend == y->r_addend; + const Elf_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(x->r_info, y->r_info); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; } static bool duplicate_rela(const Elf_Rela *rela, int idx) { - int i; - for (i = 0; i < idx; i++) { - if (is_rela_equal(&rela[i], &rela[idx])) - return true; - } - return false; + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding slot. 
+ */ + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0; } -static void count_max_entries(Elf_Rela *relas, int num, +static void count_max_entries(const Elf_Rela *relas, size_t num, unsigned int *plts, unsigned int *gots) { - for (int i = 0; i < num; i++) { + for (size_t i = 0; i < num; i++) { + if (duplicate_rela(relas, i)) + continue; + switch (ELF_R_TYPE(relas[i].r_info)) { case R_RISCV_CALL_PLT: case R_RISCV_PLT32: - if (!duplicate_rela(relas, i)) - (*plts)++; + (*plts)++; break; case R_RISCV_GOT_HI20: - if (!duplicate_rela(relas, i)) - (*gots)++; + (*gots)++; break; + default: + unreachable(); } } } +static bool rela_needs_plt_got_entry(const Elf_Rela *rela) +{ + switch (ELF_R_TYPE(rela->r_info)) { + case R_RISCV_CALL_PLT: + case R_RISCV_GOT_HI20: + case R_RISCV_PLT32: + return true; + default: + return false; + } +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { + size_t num_scratch_relas = 0; unsigned int num_plts = 0; unsigned int num_gots = 0; + Elf_Rela *scratch = NULL; + size_t scratch_size = 0; int i; /* @@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, /* Calculate the maxinum number of entries */ for (i = 0; i < ehdr->e_shnum; i++) { + size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela); Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset; - int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela); Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info; + size_t scratch_size_needed; if (sechdrs[i].sh_type != SHT_RELA) continue; @@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dst_sec->sh_flags & SHF_EXECINSTR)) continue; - count_max_entries(relas, num_rela, &num_plts, &num_gots); + /* + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being + * close together, so sort a copy of the section to avoid interfering. 
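The sorted-copy trick above means duplicate PLT/GOT relocations are always adjacent, so deduplication becomes a single comparison with the previous entry instead of a quadratic scan. A tiny standalone illustration of the same idea with qsort() and a stripped-down rela struct (the r_info values are arbitrary):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Minimal stand-in for Elf_Rela: only the fields the comparator uses. */
    struct rela {
        uint64_t r_info;
        int64_t r_addend;
    };

    #define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))

    static int cmp_rela(const void *a, const void *b)
    {
        const struct rela *x = a, *y = b;
        int i = cmp_3way(x->r_info, y->r_info);

        return i ? i : cmp_3way(x->r_addend, y->r_addend);
    }

    int main(void)
    {
        struct rela relas[] = {
            { 19, 0 }, { 20, 8 }, { 19, 0 }, { 20, 8 }, { 20, 16 },
        };
        size_t n = sizeof(relas) / sizeof(relas[0]), unique = 0;

        qsort(relas, n, sizeof(relas[0]), cmp_rela);
        for (size_t i = 0; i < n; i++)
            /* same rule as duplicate_rela(): a dup is always adjacent */
            if (i == 0 || cmp_rela(&relas[i], &relas[i - 1]))
                unique++;

        printf("%zu unique of %zu relocations\n", unique, n);  /* 3 of 5 */
        return 0;
    }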
+ */ + scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch); + if (scratch_size_needed > scratch_size) { + scratch_size = scratch_size_needed; + scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + } + + for (size_t j = 0; j < num_relas; j++) + if (rela_needs_plt_got_entry(&relas[j])) + scratch[num_scratch_relas++] = relas[j]; + } + + if (scratch) { + /* sort the accumulated PLT/GOT relocations so duplicates are adjacent */ + sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL); + count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots); + kvfree(scratch); } mod->arch.plt.shdr->sh_type = SHT_NOBITS; diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index bbf7ec6a75c0..a0a40889d79a 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -60,7 +60,7 @@ int get_unalign_ctl(struct task_struct *tsk, unsigned long adr) if (!unaligned_ctl_available()) return -EINVAL; - return put_user(tsk->thread.align_ctl, (unsigned long __user *)adr); + return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr); } void __show_regs(struct pt_regs *regs) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 1989b8cade1b..53836a9235e3 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -299,6 +299,76 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, return 0; } +static bool sbi_fwft_supported; + +struct fwft_set_req { + u32 feature; + unsigned long value; + unsigned long flags; + atomic_t error; +}; + +static void cpu_sbi_fwft_set(void *arg) +{ + struct fwft_set_req *req = arg; + int ret; + + ret = sbi_fwft_set(req->feature, req->value, req->flags); + if (ret) + atomic_set(&req->error, ret); +} + +/** + * sbi_fwft_set() - Set a feature on the local hart + * @feature: The feature ID to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. + */ +int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags) +{ + struct sbiret ret; + + if (!sbi_fwft_supported) + return -EOPNOTSUPP; + + ret = sbi_ecall(SBI_EXT_FWFT, SBI_EXT_FWFT_SET, + feature, value, flags, 0, 0, 0); + + return sbi_err_map_linux_errno(ret.error); +} + +/** + * sbi_fwft_set_cpumask() - Set a feature for the specified cpumask + * @mask: CPU mask of cpus that need the feature to be set + * @feature: The feature ID to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. + */ +int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature, + unsigned long value, unsigned long flags) +{ + struct fwft_set_req req = { + .feature = feature, + .value = value, + .flags = flags, + .error = ATOMIC_INIT(0), + }; + + if (!sbi_fwft_supported) + return -EOPNOTSUPP; + + if (feature & SBI_FWFT_GLOBAL_FEATURE_BIT) + return -EINVAL; + + on_each_cpu_mask(mask, cpu_sbi_fwft_set, &req, 1); + + return atomic_read(&req.error); +} + /** * sbi_set_timer() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. 
@@ -609,7 +679,7 @@ void __init sbi_init(void) } else { __sbi_rfence = __sbi_rfence_v01; } - if ((sbi_spec_version >= sbi_mk_version(0, 3)) && + if (sbi_spec_version >= sbi_mk_version(0, 3) && sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); pm_power_off = sbi_srst_power_off; @@ -617,11 +687,16 @@ void __init sbi_init(void) sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); } - if ((sbi_spec_version >= sbi_mk_version(2, 0)) && - (sbi_probe_extension(SBI_EXT_DBCN) > 0)) { + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_DBCN) > 0) { pr_info("SBI DBCN extension detected\n"); sbi_debug_console_available = true; } + if (sbi_spec_version >= sbi_mk_version(3, 0) && + sbi_probe_extension(SBI_EXT_FWFT)) { + pr_info("SBI FWFT extension detected\n"); + sbi_fwft_supported = true; + } } else { __sbi_set_timer = __sbi_set_timer_v01; __sbi_send_ipi = __sbi_send_ipi_v01; diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 249aec8594a9..0b170e18a2be 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -15,6 +15,7 @@ #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/vector.h> +#include <asm/vendor_extensions/sifive_hwprobe.h> #include <asm/vendor_extensions/thead_hwprobe.h> #include <vdso/vsyscall.h> @@ -96,6 +97,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, * presence in the hart_isa bitmap, are made. */ EXT_KEY(ZAAMO); + EXT_KEY(ZABHA); EXT_KEY(ZACAS); EXT_KEY(ZALRSC); EXT_KEY(ZAWRS); @@ -300,6 +302,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, pair->value = riscv_timebase; break; + case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: + hwprobe_isa_vendor_ext_sifive_0(pair, cpus); + break; + case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: hwprobe_isa_vendor_ext_thead_0(pair, cpus); break; diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 77c788660223..dd8e4af6583f 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -16,6 +16,7 @@ #include <asm/entry-common.h> #include <asm/hwprobe.h> #include <asm/cpufeature.h> +#include <asm/sbi.h> #include <asm/vector.h> #define INSN_MATCH_LB 0x3 @@ -368,9 +369,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); -#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; -#endif if (!unaligned_enabled) return -1; @@ -455,7 +454,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) val.data_u64 = 0; if (user_mode(regs)) { - if (copy_from_user(&val, (u8 __user *)addr, len)) + if (copy_from_user_nofault(&val, (u8 __user *)addr, len)) return -1; } else { memcpy(&val, (u8 *)addr, len); @@ -556,7 +555,7 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs) return -EOPNOTSUPP; if (user_mode(regs)) { - if (copy_to_user((u8 __user *)addr, &val, len)) + if (copy_to_user_nofault((u8 __user *)addr, &val, len)) return -1; } else { memcpy((u8 *)addr, &val, len); @@ -626,6 +625,10 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void) { int cpu; + /* + * While being documented as very slow, schedule_on_each_cpu() is used since + * kernel_vector_begin() expects irqs to be enabled or it will panic() + */ schedule_on_each_cpu(check_vector_unaligned_access_emulated); for_each_online_cpu(cpu) @@ -642,11 +645,23 @@ bool __init 
check_vector_unaligned_access_emulated_all_cpus(void) } #endif +static bool all_cpus_unaligned_scalar_access_emulated(void) +{ + int cpu; + + for_each_online_cpu(cpu) + if (per_cpu(misaligned_access_speed, cpu) != + RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED) + return false; + + return true; +} + #ifdef CONFIG_RISCV_SCALAR_MISALIGNED static bool unaligned_ctl __read_mostly; -void check_unaligned_access_emulated(struct work_struct *work __always_unused) +static void check_unaligned_access_emulated(void *arg __always_unused) { int cpu = smp_processor_id(); long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); @@ -657,6 +672,13 @@ void check_unaligned_access_emulated(struct work_struct *work __always_unused) __asm__ __volatile__ ( " "REG_L" %[tmp], 1(%[ptr])\n" : [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory"); +} + +static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) +{ + long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); + + check_unaligned_access_emulated(NULL); /* * If unaligned_ctl is already set, this means that we detected that all @@ -665,26 +687,23 @@ void check_unaligned_access_emulated(struct work_struct *work __always_unused) */ if (unlikely(unaligned_ctl && (*mas_ptr != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED))) { pr_crit("CPU misaligned accesses non homogeneous (expected all emulated)\n"); - while (true) - cpu_relax(); + return -EINVAL; } + + return 0; } bool __init check_unaligned_access_emulated_all_cpus(void) { - int cpu; - /* * We can only support PR_UNALIGN controls if all CPUs have misaligned * accesses emulated since tasks requesting such control can run on any * CPU. */ - schedule_on_each_cpu(check_unaligned_access_emulated); + on_each_cpu(check_unaligned_access_emulated, NULL, 1); - for_each_online_cpu(cpu) - if (per_cpu(misaligned_access_speed, cpu) - != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED) - return false; + if (!all_cpus_unaligned_scalar_access_emulated()) + return false; unaligned_ctl = true; return true; @@ -699,4 +718,73 @@ bool __init check_unaligned_access_emulated_all_cpus(void) { return false; } +static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) +{ + return 0; +} +#endif + +static bool misaligned_traps_delegated; + +#ifdef CONFIG_RISCV_SBI + +static int cpu_online_sbi_unaligned_setup(unsigned int cpu) +{ + if (sbi_fwft_set(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0) && + misaligned_traps_delegated) { + pr_crit("Misaligned trap delegation non homogeneous (expected delegated)"); + return -EINVAL; + } + + return 0; +} + +void __init unaligned_access_init(void) +{ + int ret; + + ret = sbi_fwft_set_online_cpus(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0); + if (ret) + return; + + misaligned_traps_delegated = true; + pr_info("SBI misaligned access exception delegation ok\n"); + /* + * Note that we don't have to take any specific action here, if + * the delegation is successful, then + * check_unaligned_access_emulated() will verify that indeed the + * platform traps on misaligned accesses. 
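The per-CPU result recorded above is ultimately exported through the hwprobe interface, so the detected policy can be observed from userspace without poking at kernel internals. A hedged sketch of such a query via the riscv_hwprobe syscall; the key name is assumed from the hwprobe UAPI, the value macros are the ones referenced in this patch, and the program is RISC-V only:

    #include <asm/hwprobe.h>   /* struct riscv_hwprobe, key/value macros */
    #include <asm/unistd.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* Key name assumed from the hwprobe UAPI. */
        struct riscv_hwprobe pair = {
            .key = RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF,
        };

        if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
            return 1;

        switch (pair.value) {
        case RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED:
            puts("scalar misaligned accesses are trapped and emulated");
            break;
        case RISCV_HWPROBE_MISALIGNED_SCALAR_FAST:
            puts("scalar misaligned accesses are fast");
            break;
        default:
            printf("misaligned policy value: %llu\n",
                   (unsigned long long)pair.value);
        }
        return 0;
    }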
+ */ +} +#else +void __init unaligned_access_init(void) {} + +static int cpu_online_sbi_unaligned_setup(unsigned int cpu __always_unused) +{ + return 0; +} + #endif + +int cpu_online_unaligned_access_init(unsigned int cpu) +{ + int ret; + + ret = cpu_online_sbi_unaligned_setup(cpu); + if (ret) + return ret; + + return cpu_online_check_unaligned_access_emulated(cpu); +} + +bool misaligned_traps_can_delegate(void) +{ + /* + * Either we successfully requested misaligned traps delegation for all + * CPUs, or the SBI does not implement the FWFT extension but delegated + * the exception by default. + */ + return misaligned_traps_delegated || + all_cpus_unaligned_scalar_access_emulated(); +} +EXPORT_SYMBOL_GPL(misaligned_traps_can_delegate); diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index b8ba13819d05..ae2068425fbc 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -236,6 +236,11 @@ arch_initcall_sync(lock_and_set_unaligned_access_static_branch); static int riscv_online_cpu(unsigned int cpu) { + int ret = cpu_online_unaligned_access_init(cpu); + + if (ret) + return ret; + /* We are already set since the last check */ if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { goto exit; @@ -248,7 +253,6 @@ static int riscv_online_cpu(unsigned int cpu) { static struct page *buf; - check_unaligned_access_emulated(NULL); buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); if (!buf) { pr_warn("Allocation failure, not measuring misaligned performance\n"); @@ -439,6 +443,8 @@ static int __init check_unaligned_access_all_cpus(void) { int cpu; + unaligned_access_init(); + if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n", speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param); diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index cc2895d1fbc2..3a8e038b10a2 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -136,7 +136,7 @@ static int __setup_additional_pages(struct mm_struct *mm, ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), + (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_SEALED_SYSMAP), vdso_info->cm); if (IS_ERR(ret)) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index ad73607abc28..9ebb5e590f93 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -13,9 +13,17 @@ vdso-syms += flush_icache vdso-syms += hwprobe vdso-syms += sys_hwprobe +ifdef CONFIG_VDSO_GETRANDOM +vdso-syms += getrandom +endif + # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o +ifdef CONFIG_VDSO_GETRANDOM +obj-vdso += vgetrandom-chacha.o +endif + ccflags-y := -fno-stack-protector ccflags-y += -DDISABLE_BRANCH_PROFILING ccflags-y += -fno-builtin @@ -24,6 +32,10 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif +ifneq ($(c-getrandom-y),) + CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y) +endif + CFLAGS_hwprobe.o += -fPIC # Build rules @@ -38,6 +50,7 @@ endif # Disable -pg to prevent insert call site CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_getrandom.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) # 
Force dependency @@ -47,7 +60,7 @@ $(obj)/vdso.o: $(obj)/vdso.so $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_check) LDFLAGS_vdso.so.dbg = -shared -soname=linux-vdso.so.1 \ - --build-id=sha1 --hash-style=both --eh-frame-hdr + --build-id=sha1 --eh-frame-hdr # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c new file mode 100644 index 000000000000..f21922e8cebd --- /dev/null +++ b/arch/riscv/kernel/vdso/getrandom.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + */ +#include <linux/types.h> + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 8e86965a8aae..7c15b0f4ee3b 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -80,6 +80,9 @@ VERSION #ifndef COMPAT_VDSO __vdso_riscv_hwprobe; #endif +#if defined(CONFIG_VDSO_GETRANDOM) && !defined(COMPAT_VDSO) + __vdso_getrandom; +#endif local: *; }; } diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..5f0dad8f2373 --- /dev/null +++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + * + * Based on arch/loongarch/vdso/vgetrandom-chacha.S. + */ + +#include <asm/asm.h> +#include <linux/linkage.h> + +.text + +.macro ROTRI rd rs imm + slliw t0, \rs, 32 - \imm + srliw \rd, \rs, \imm + or \rd, \rd, t0 +.endm + +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s1 + \op \d2, \d2, \s2 + \op \d3, \d3, \s3 +.endm + +/* + * a0: output bytes + * a1: 32-byte key input + * a2: 8-byte counter input/output + * a3: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +#define output a0 +#define key a1 +#define counter a2 +#define nblocks a3 +#define i a4 +#define state0 s0 +#define state1 s1 +#define state2 s2 +#define state3 s3 +#define state4 s4 +#define state5 s5 +#define state6 s6 +#define state7 s7 +#define state8 s8 +#define state9 s9 +#define state10 s10 +#define state11 s11 +#define state12 a5 +#define state13 a6 +#define state14 a7 +#define state15 t1 +#define cnt t2 +#define copy0 t3 +#define copy1 t4 +#define copy2 t5 +#define copy3 t6 + +/* Packs to be used with OP_4REG */ +#define line0 state0, state1, state2, state3 +#define line1 state4, state5, state6, state7 +#define line2 state8, state9, state10, state11 +#define line3 state12, state13, state14, state15 + +#define line1_perm state5, state6, state7, state4 +#define line2_perm state10, state11, state8, state9 +#define line3_perm state15, state12, state13, state14 + +#define copy copy0, copy1, copy2, copy3 + +#define _16 16, 16, 16, 16 +#define _20 20, 20, 20, 20 +#define _24 24, 24, 24, 24 +#define _25 25, 25, 25, 25 + + /* + * The ABI requires s0-s9 saved. + * This does not violate the stack-less requirement: no sensitive data + * is spilled onto the stack. 
+ */ + addi sp, sp, -12*SZREG + REG_S s0, (sp) + REG_S s1, SZREG(sp) + REG_S s2, 2*SZREG(sp) + REG_S s3, 3*SZREG(sp) + REG_S s4, 4*SZREG(sp) + REG_S s5, 5*SZREG(sp) + REG_S s6, 6*SZREG(sp) + REG_S s7, 7*SZREG(sp) + REG_S s8, 8*SZREG(sp) + REG_S s9, 9*SZREG(sp) + REG_S s10, 10*SZREG(sp) + REG_S s11, 11*SZREG(sp) + + ld cnt, (counter) + + li copy0, 0x61707865 + li copy1, 0x3320646e + li copy2, 0x79622d32 + li copy3, 0x6b206574 + +.Lblock: + /* state[0,1,2,3] = "expand 32-byte k" */ + mv state0, copy0 + mv state1, copy1 + mv state2, copy2 + mv state3, copy3 + + /* state[4,5,..,11] = key */ + lw state4, (key) + lw state5, 4(key) + lw state6, 8(key) + lw state7, 12(key) + lw state8, 16(key) + lw state9, 20(key) + lw state10, 24(key) + lw state11, 28(key) + + /* state[12,13] = counter */ + mv state12, cnt + srli state13, cnt, 32 + + /* state[14,15] = 0 */ + mv state14, zero + mv state15, zero + + li i, 10 +.Lpermute: + /* odd round */ + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _16 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _20 + + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _24 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _25 + + /* even round */ + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _16 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _20 + + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _24 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _25 + + addi i, i, -1 + bnez i, .Lpermute + + /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ + OP_4REG addw line0, copy + sw state0, (output) + sw state1, 4(output) + sw state2, 8(output) + sw state3, 12(output) + + /* from now on state[0,1,2,3] are scratch registers */ + + /* state[0,1,2,3] = lo(key) */ + lw state0, (key) + lw state1, 4(key) + lw state2, 8(key) + lw state3, 12(key) + + /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ + OP_4REG addw line1, line0 + sw state4, 16(output) + sw state5, 20(output) + sw state6, 24(output) + sw state7, 28(output) + + /* state[0,1,2,3] = hi(key) */ + lw state0, 16(key) + lw state1, 20(key) + lw state2, 24(key) + lw state3, 28(key) + + /* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */ + OP_4REG addw line2, line0 + sw state8, 32(output) + sw state9, 36(output) + sw state10, 40(output) + sw state11, 44(output) + + /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ + addw state12, state12, cnt + srli state0, cnt, 32 + addw state13, state13, state0 + sw state12, 48(output) + sw state13, 52(output) + sw state14, 56(output) + sw state15, 60(output) + + /* ++counter */ + addi cnt, cnt, 1 + + /* output += 64 */ + addi output, output, 64 + /* --nblocks */ + addi nblocks, nblocks, -1 + bnez nblocks, .Lblock + + /* counter = [cnt_lo, cnt_hi] */ + sd cnt, (counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these + * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and + * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we + * only need to zero state[12,...,15]. 
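For cross-checking the register-level code above, a plain C version of the same per-block computation is handy: the standard ChaCha20 block function, laid out the way this generator uses it, i.e. the four "expand 32-byte k" words, eight key words, a 64-bit block counter and two zero words, with the feed-forward addition at the end. Standalone sketch, not kernel code:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    #define QR(a, b, c, d) do {                        \
        a += b; d ^= a; d = ROTL32(d, 16);             \
        c += d; b ^= c; b = ROTL32(b, 12);             \
        a += b; d ^= a; d = ROTL32(d, 8);              \
        c += d; b ^= c; b = ROTL32(b, 7);              \
    } while (0)

    /* One 64-byte ChaCha20 block: constants, 8 key words, 64-bit counter,
     * two zero words -- the same layout the assembly above builds. */
    static void chacha20_block(uint32_t out[16], const uint32_t key[8], uint64_t counter)
    {
        static const uint32_t c[4] = {  /* "expand 32-byte k" */
            0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
        };
        uint32_t x[16], s[16];

        memcpy(x, c, sizeof(c));
        memcpy(x + 4, key, 8 * sizeof(uint32_t));
        x[12] = (uint32_t)counter;
        x[13] = (uint32_t)(counter >> 32);
        x[14] = x[15] = 0;
        memcpy(s, x, sizeof(s));

        for (int i = 0; i < 10; i++) {  /* 10 double rounds = 20 rounds */
            QR(s[0], s[4], s[8],  s[12]);   /* column ("odd") rounds   */
            QR(s[1], s[5], s[9],  s[13]);
            QR(s[2], s[6], s[10], s[14]);
            QR(s[3], s[7], s[11], s[15]);
            QR(s[0], s[5], s[10], s[15]);   /* diagonal ("even") rounds */
            QR(s[1], s[6], s[11], s[12]);
            QR(s[2], s[7], s[8],  s[13]);
            QR(s[3], s[4], s[9],  s[14]);
        }

        for (int i = 0; i < 16; i++)
            out[i] = s[i] + x[i];           /* feed-forward, as in the asm */
    }

    int main(void)
    {
        uint32_t key[8] = { 0 }, out[16];

        chacha20_block(out, key, 0);
        for (int i = 0; i < 16; i++)
            printf("%08x%c", out[i], i % 4 == 3 ? '\n' : ' ');
        return 0;
    }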
+ */ + mv state12, zero + mv state13, zero + mv state14, zero + mv state15, zero + + REG_L s0, (sp) + REG_L s1, SZREG(sp) + REG_L s2, 2*SZREG(sp) + REG_L s3, 3*SZREG(sp) + REG_L s4, 4*SZREG(sp) + REG_L s5, 5*SZREG(sp) + REG_L s6, 6*SZREG(sp) + REG_L s7, 7*SZREG(sp) + REG_L s8, 8*SZREG(sp) + REG_L s9, 9*SZREG(sp) + REG_L s10, 10*SZREG(sp) + REG_L s11, 11*SZREG(sp) + addi sp, sp, 12*SZREG + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c index 9feb7f67a0a3..92d8ff81f42c 100644 --- a/arch/riscv/kernel/vendor_extensions.c +++ b/arch/riscv/kernel/vendor_extensions.c @@ -6,6 +6,7 @@ #include <asm/vendorid_list.h> #include <asm/vendor_extensions.h> #include <asm/vendor_extensions/andes.h> +#include <asm/vendor_extensions/sifive.h> #include <asm/vendor_extensions/thead.h> #include <linux/array_size.h> @@ -15,6 +16,9 @@ struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = { #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES &riscv_isa_vendor_ext_list_andes, #endif +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE + &riscv_isa_vendor_ext_list_sifive, +#endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD &riscv_isa_vendor_ext_list_thead, #endif @@ -45,6 +49,12 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; break; #endif + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE + case SIFIVE_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap; + break; + #endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD case THEAD_VENDOR_ID: bmap = &riscv_isa_vendor_ext_list_thead.all_harts_isa_bitmap; diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile index 866414c81a9f..a4eca96d1c8a 100644 --- a/arch/riscv/kernel/vendor_extensions/Makefile +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead_hwprobe.o diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c new file mode 100644 index 000000000000..1411337dc1e6 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/sifive.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/cpufeature.h> +#include <asm/vendor_extensions.h> +#include <asm/vendor_extensions/sifive.h> + +#include <linux/array_size.h> +#include <linux/types.h> + +/* All SiFive vendor extensions supported in Linux */ +const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { + __RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF), + __RISCV_ISA_EXT_DATA(xsfvfwmaccqqq, RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ), + __RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD), + __RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_sifive), + .ext_data = riscv_isa_vendor_ext_sifive, +}; diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c new file mode 
100644 index 000000000000..1f77f6309763 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <asm/vendor_extensions/sifive.h> +#include <asm/vendor_extensions/sifive_hwprobe.h> +#include <asm/vendor_extensions/vendor_hwprobe.h> + +#include <linux/cpumask.h> +#include <linux/types.h> + +#include <uapi/asm/hwprobe.h> +#include <uapi/asm/vendor/sifive.h> + +void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus) +{ + VENDOR_EXTENSION_SUPPORTED(pair, cpus, + riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap, { + VENDOR_EXT_KEY(XSFVQMACCDOD); + VENDOR_EXT_KEY(XSFVQMACCQOQ); + VENDOR_EXT_KEY(XSFVFNRCLIPXFQF); + VENDOR_EXT_KEY(XSFVFWMACCQQQ); + }); +} diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c index be38a93cedae..7bbdfc6d4552 100644 --- a/arch/riscv/lib/riscv_v_helpers.c +++ b/arch/riscv/lib/riscv_v_helpers.c @@ -16,8 +16,11 @@ #ifdef CONFIG_MMU size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD; int __asm_vector_usercopy(void *dst, void *src, size_t n); +int __asm_vector_usercopy_sum_enabled(void *dst, void *src, size_t n); int fallback_scalar_usercopy(void *dst, void *src, size_t n); -asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) +int fallback_scalar_usercopy_sum_enabled(void *dst, void *src, size_t n); +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n, + bool enable_sum) { size_t remain, copied; @@ -26,7 +29,8 @@ asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) goto fallback; kernel_vector_begin(); - remain = __asm_vector_usercopy(dst, src, n); + remain = enable_sum ? __asm_vector_usercopy(dst, src, n) : + __asm_vector_usercopy_sum_enabled(dst, src, n); kernel_vector_end(); if (remain) { @@ -40,6 +44,7 @@ asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) return remain; fallback: - return fallback_scalar_usercopy(dst, src, n); + return enable_sum ? 
fallback_scalar_usercopy(dst, src, n) : + fallback_scalar_usercopy_sum_enabled(dst, src, n); } #endif diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index 6a9f116bb545..4efea1b3326c 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -17,14 +17,43 @@ SYM_FUNC_START(__asm_copy_to_user) ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V) REG_L t0, riscv_v_usercopy_threshold bltu a2, t0, fallback_scalar_usercopy - tail enter_vector_usercopy + li a3, 1 + tail enter_vector_usercopy #endif -SYM_FUNC_START(fallback_scalar_usercopy) +SYM_FUNC_END(__asm_copy_to_user) +EXPORT_SYMBOL(__asm_copy_to_user) +SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) +EXPORT_SYMBOL(__asm_copy_from_user) +SYM_FUNC_START(fallback_scalar_usercopy) /* Enable access to user memory */ - li t6, SR_SUM - csrs CSR_STATUS, t6 + li t6, SR_SUM + csrs CSR_STATUS, t6 + mv t6, ra + call fallback_scalar_usercopy_sum_enabled + + /* Disable access to user memory */ + mv ra, t6 + li t6, SR_SUM + csrc CSR_STATUS, t6 + ret +SYM_FUNC_END(fallback_scalar_usercopy) + +SYM_FUNC_START(__asm_copy_to_user_sum_enabled) +#ifdef CONFIG_RISCV_ISA_V + ALTERNATIVE("j fallback_scalar_usercopy_sum_enabled", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V) + REG_L t0, riscv_v_usercopy_threshold + bltu a2, t0, fallback_scalar_usercopy_sum_enabled + li a3, 0 + tail enter_vector_usercopy +#endif +SYM_FUNC_END(__asm_copy_to_user_sum_enabled) +SYM_FUNC_ALIAS(__asm_copy_from_user_sum_enabled, __asm_copy_to_user_sum_enabled) +EXPORT_SYMBOL(__asm_copy_from_user_sum_enabled) +EXPORT_SYMBOL(__asm_copy_to_user_sum_enabled) + +SYM_FUNC_START(fallback_scalar_usercopy_sum_enabled) /* * Save the terminal address which will be used to compute the number * of bytes copied in case of a fixup exception. 
@@ -178,23 +207,12 @@ SYM_FUNC_START(fallback_scalar_usercopy) bltu a0, t0, 4b /* t0 - end of dst */ .Lout_copy_user: - /* Disable access to user memory */ - csrc CSR_STATUS, t6 li a0, 0 ret - - /* Exception fixup code */ 10: - /* Disable access to user memory */ - csrc CSR_STATUS, t6 sub a0, t5, a0 ret -SYM_FUNC_END(__asm_copy_to_user) -SYM_FUNC_END(fallback_scalar_usercopy) -EXPORT_SYMBOL(__asm_copy_to_user) -SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) -EXPORT_SYMBOL(__asm_copy_from_user) - +SYM_FUNC_END(fallback_scalar_usercopy_sum_enabled) SYM_FUNC_START(__clear_user) diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S index 7c45f26de4f7..03b5560609a2 100644 --- a/arch/riscv/lib/uaccess_vector.S +++ b/arch/riscv/lib/uaccess_vector.S @@ -24,7 +24,18 @@ SYM_FUNC_START(__asm_vector_usercopy) /* Enable access to user memory */ li t6, SR_SUM csrs CSR_STATUS, t6 + mv t6, ra + call __asm_vector_usercopy_sum_enabled + + /* Disable access to user memory */ + mv ra, t6 + li t6, SR_SUM + csrc CSR_STATUS, t6 + ret +SYM_FUNC_END(__asm_vector_usercopy) + +SYM_FUNC_START(__asm_vector_usercopy_sum_enabled) loop: vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma fixup vle8.v vData, (pSrc), 10f @@ -36,8 +47,6 @@ loop: /* Exception fixup for vector load is shared with normal exit */ 10: - /* Disable access to user memory */ - csrc CSR_STATUS, t6 mv a0, iNum ret @@ -49,4 +58,4 @@ loop: csrr t2, CSR_VSTART sub iNum, iNum, t2 j 10b -SYM_FUNC_END(__asm_vector_usercopy) +SYM_FUNC_END(__asm_vector_usercopy_sum_enabled) diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index b8e96dfff19d..4ca5aafce22e 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -24,7 +24,20 @@ void flush_icache_all(void) if (num_online_cpus() < 2) return; - else if (riscv_use_sbi_for_rfence()) + + /* + * Make sure all previous writes to the D$ are ordered before sending + * the IPI. The RISC-V spec states that a hart must execute a data fence + * before triggering a remote fence.i in order to make the modification + * visible to remote harts. + * + * IPIs on RISC-V are triggered by MMIO writes to either CLINT or + * S-IMSIC, so the fence ensures previous data writes "happen before" + * the MMIO.
+ */ + RISCV_FENCE(w, o); + + if (riscv_use_sbi_for_rfence()) sbi_remote_fence_i(NULL); else on_each_cpu(ipi_remote_fence_i, NULL, 1); @@ -101,6 +114,9 @@ EXPORT_SYMBOL_GPL(riscv_cbom_block_size); unsigned int riscv_cboz_block_size; EXPORT_SYMBOL_GPL(riscv_cboz_block_size); +unsigned int riscv_cbop_block_size; +EXPORT_SYMBOL_GPL(riscv_cbop_block_size); + static void __init cbo_get_block_size(struct device_node *node, const char *name, u32 *block_size, unsigned long *first_hartid) @@ -125,8 +141,8 @@ static void __init cbo_get_block_size(struct device_node *node, void __init riscv_init_cbo_blocksizes(void) { - unsigned long cbom_hartid, cboz_hartid; - u32 cbom_block_size = 0, cboz_block_size = 0; + unsigned long cbom_hartid, cboz_hartid, cbop_hartid; + u32 cbom_block_size = 0, cboz_block_size = 0, cbop_block_size = 0; struct device_node *node; struct acpi_table_header *rhct; acpi_status status; @@ -138,13 +154,15 @@ void __init riscv_init_cbo_blocksizes(void) &cbom_block_size, &cbom_hartid); cbo_get_block_size(node, "riscv,cboz-block-size", &cboz_block_size, &cboz_hartid); + cbo_get_block_size(node, "riscv,cbop-block-size", + &cbop_block_size, &cbop_hartid); } } else { status = acpi_get_table(ACPI_SIG_RHCT, 0, &rhct); if (ACPI_FAILURE(status)) return; - acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, NULL); + acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, &cbop_block_size); acpi_put_table((struct acpi_table_header *)rhct); } @@ -153,6 +171,9 @@ void __init riscv_init_cbo_blocksizes(void) if (cboz_block_size) riscv_cboz_block_size = cboz_block_size; + + if (cbop_block_size) + riscv_cbop_block_size = cbop_block_size; } #ifdef CONFIG_SMP diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index 4ae67324f992..8b6c0a112a8d 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -154,4 +154,14 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, flush_tlb_mm(vma->vm_mm); return pmd; } + +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + VM_WARN_ON_ONCE(!pud_present(*pudp)); + pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); + + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return old; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index f9e27ba1df99..e737ba7949b1 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -7,6 +7,27 @@ #include <linux/mmu_notifier.h> #include <asm/sbi.h> #include <asm/mmu_context.h> +#include <asm/cpufeature.h> + +#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) + +static inline void local_sfence_inval_ir(void) +{ + asm volatile(SFENCE_INVAL_IR() ::: "memory"); +} + +static inline void local_sfence_w_inval(void) +{ + asm volatile(SFENCE_W_INVAL() ::: "memory"); +} + +static inline void local_sinval_vma(unsigned long vma, unsigned long asid) +{ + if (asid != FLUSH_TLB_NO_ASID) + asm volatile(SINVAL_VMA(%0, %1) : : "r" (vma), "r" (asid) : "memory"); + else + asm volatile(SINVAL_VMA(%0, zero) : : "r" (vma) : "memory"); +} /* * Flush entire TLB if number of entries to be flushed is greater @@ -27,6 +48,16 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start, return; } + if (has_svinval()) { + local_sfence_w_inval(); + for (i = 0; i < nr_ptes_in_range; ++i) { + local_sinval_vma(start, asid); + start += stride; + } + local_sfence_inval_ir(); + return; + } + for (i = 0; i < nr_ptes_in_range; ++i) { 
local_flush_tlb_page_asid(start, asid); start += stride; @@ -182,6 +213,13 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm), start, end - start, PMD_SIZE); } + +void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm), + start, end - start, PUD_SIZE); +} #endif bool arch_tlbbatch_should_defer(struct mm_struct *mm) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index e23670e1949c..21c2e61fece4 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -319,7 +319,7 @@ enum prot_type { PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ - PROT_NONE, + PROT_TYPE_DUMMY, }; static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, @@ -335,7 +335,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { - case PROT_NONE: + case PROT_TYPE_DUMMY: /* We should never get here, acts like termination */ WARN_ON_ONCE(1); break; @@ -805,7 +805,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, gpa = kvm_s390_real_to_abs(vcpu, ga); if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; } } if (rc) @@ -963,7 +963,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc == PGM_PROTECTION) prot = PROT_TYPE_KEYC; else - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } out_unlock: diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 3829521450dd..e1ad05bfd28a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -441,6 +441,8 @@ void do_secure_storage_access(struct pt_regs *regs) if (rc) BUG(); } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 79509c7f39de..f08e8a7fac93 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -52,13 +52,7 @@ config NO_IOMEM config UML_IOMEM_EMULATION bool select INDIRECT_IOMEM - select HAS_IOPORT select GENERIC_PCI_IOMAP - select GENERIC_IOMAP - select NO_GENERIC_PCI_IOPORT_MAP - -config NO_IOPORT_MAP - def_bool !UML_IOMEM_EMULATION config ISA bool diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index 1ffa088739f4..29d9666eceae 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -52,13 +52,6 @@ CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set -CONFIG_UML_NET=y -CONFIG_UML_NET_ETHERTAP=y -CONFIG_UML_NET_TUNTAP=y -CONFIG_UML_NET_SLIP=y -CONFIG_UML_NET_DAEMON=y -CONFIG_UML_NET_MCAST=y -CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 03b10d3f6816..cf309c5406a2 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -51,13 +51,6 @@ CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set -CONFIG_UML_NET=y -CONFIG_UML_NET_ETHERTAP=y -CONFIG_UML_NET_TUNTAP=y -CONFIG_UML_NET_SLIP=y -CONFIG_UML_NET_DAEMON=y -CONFIG_UML_NET_MCAST=y -CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m diff --git 
a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 9cb196070614..34085bfc6d41 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -124,206 +124,18 @@ endmenu menu "UML Network Devices" depends on NET -# UML virtual driver -config UML_NET - bool "Virtual network device" - help - While the User-Mode port cannot directly talk to any physical - hardware devices, this choice and the following transport options - provide one or more virtual network devices through which the UML - kernels can talk to each other, the host, and with the host's help, - machines on the outside world. - - For more information, including explanations of the networking and - sample configurations, see - <http://user-mode-linux.sourceforge.net/old/networking.html>. - - If you'd like to be able to enable networking in the User-Mode - linux environment, say Y; otherwise say N. Note that you must - enable at least one of the following transport options to actually - make use of UML networking. - -config UML_NET_ETHERTAP - bool "Ethertap transport (obsolete)" - depends on UML_NET - help - The Ethertap User-Mode Linux network transport allows a single - running UML to exchange packets with its host over one of the - host's Ethertap devices, such as /dev/tap0. Additional running - UMLs can use additional Ethertap devices, one per running UML. - While the UML believes it's on a (multi-device, broadcast) virtual - Ethernet network, it's in fact communicating over a point-to-point - link with the host. - - To use this, your host kernel must have support for Ethertap - devices. Also, if your host kernel is 2.4.x, it must have - CONFIG_NETLINK_DEV configured as Y or M. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable Ethertap - networking. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_TUNTAP - bool "TUN/TAP transport (obsolete)" - depends on UML_NET - help - The UML TUN/TAP network transport allows a UML instance to exchange - packets with the host over a TUN/TAP device. This option will only - work with a 2.4 host, unless you've applied the TUN/TAP patch to - your 2.2 host kernel. - - To use this transport, your host kernel must have support for TUN/TAP - devices, either built-in or as a module. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_SLIP - bool "SLIP transport (obsolete)" - depends on UML_NET - help - The slip User-Mode Linux network transport allows a running UML to - network with its host over a point-to-point link. Unlike Ethertap, - which can carry any Ethernet frame (and hence even non-IP packets), - the slip transport can only carry IP packets. - - To use this, your host must support slip devices. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html>. - has examples of the UML command line to use to enable slip - networking, and details of a few quirks with it. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_DAEMON - bool "Daemon transport (obsolete)" - depends on UML_NET - help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other, but not to - the host. 
- - To use this form of networking, you'll need to run the UML - networking daemon on the host. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable Daemon - networking. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_DAEMON_DEFAULT_SOCK - string "Default socket for daemon transport" - default "/tmp/uml.ctl" - depends on UML_NET_DAEMON - help - This option allows setting the default socket for the daemon - transport, normally it defaults to /tmp/uml.ctl. - config UML_NET_VECTOR bool "Vector I/O high performance network devices" - depends on UML_NET select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network driver uses multi-message send and receive functions. The host running the UML guest must have a linux kernel version above 3.0 and a libc version > 2.13. - This driver provides tap, raw, gre and l2tpv3 network transports - with up to 4 times higher network throughput than the UML network - drivers. - -config UML_NET_VDE - bool "VDE transport (obsolete)" - depends on UML_NET - depends on !MODVERSIONS - select MAY_HAVE_RUNTIME_DEPS - help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other and also - with the rest of the world using Virtual Distributed Ethernet, - an improved fork of uml_switch. + This driver provides tap, raw, gre and l2tpv3 network transports. - You must have libvdeplug installed in order to build the vde - transport into UML. - - To use this form of networking, you will need to run vde_switch - on the host. - - For more information, see <http://wiki.virtualsquare.org/> - That site has a good overview of what VDE is and also examples - of the UML command line to use to enable VDE networking. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_MCAST - bool "Multicast transport (obsolete)" - depends on UML_NET - help - This Multicast User-Mode Linux network transport allows multiple - UMLs (even ones running on different host machines!) to talk to - each other over a virtual ethernet network. However, it requires - at least one UML with one of the other transports to act as a - bridge if any of them need to be able to talk to their hosts or any - other IP machines. - - To use this, your host kernel(s) must support IP Multicasting. - - For more information, see - <http://user-mode-linux.sourceforge.net/old/networking.html> That site - has examples of the UML command line to use to enable Multicast - networking, and notes about the security of this approach. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_SLIRP - bool "SLiRP transport (obsolete)" - depends on UML_NET - help - The SLiRP User-Mode Linux network transport allows a running UML - to network by invoking a program that can handle SLIP encapsulated - packets. This is commonly (but not limited to) the application - known as SLiRP, a program that can re-socket IP packets back onto - he host on which it is run. Only IP packets are supported, - unlike other network transports that can handle all Ethernet - frames. 
In general, slirp allows the UML the same IP connectivity - to the outside world that the host user is permitted, and unlike - other transports, SLiRP works without the need of root level - privileges, setuid binaries, or SLIP devices on the host. This - also means not every type of connection is possible, but most - situations can be accommodated with carefully crafted slirp - commands that can be passed along as part of the network device's - setup string. The effect of this transport on the UML is similar - that of a host behind a firewall that masquerades all network - connections passing through it (but is less secure). - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - - Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" + For more information, including explanations of the networking + and sample configurations, see + <file:Documentation/virt/uml/user_mode_linux_howto_v2.rst>. endmenu @@ -367,3 +179,11 @@ config UML_PCI_OVER_VIRTIO_DEVICE_ID There's no official device ID assigned (yet), set the one you wish to use for experimentation here. The default of -1 is not valid and will cause the driver to fail at probe. + +config UML_PCI_OVER_VFIO + bool "Enable VFIO-based PCI passthrough" + select UML_PCI + help + This driver provides support for VFIO-based PCI passthrough. + Currently, only MSI-X capable devices are supported, and it + is assumed that drivers will use MSI-X. diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 0a5820343ad3..6bf8cbf71d3c 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -6,12 +6,7 @@ # pcap is broken in 2.5 because kbuild doesn't allow pcap.a to be linked # in to pcap.o -slip-objs := slip_kern.o slip_user.o -slirp-objs := slirp_kern.o slirp_user.o -daemon-objs := daemon_kern.o daemon_user.o vector-objs := vector_kern.o vector_user.o vector_transports.o -umcast-objs := umcast_kern.o umcast_user.o -net-objs := net_kern.o net_user.o mconsole-objs := mconsole_kern.o mconsole_user.o hostaudio-objs := hostaudio_kern.o ubd-objs := ubd_kern.o ubd_user.o @@ -19,13 +14,7 @@ port-objs := port_kern.o port_user.o harddog-objs := harddog_kern.o harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o rtc-objs := rtc_kern.o rtc_user.o - -LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) - -targets := vde_kern.o vde_user.o - -$(obj)/vde.o: $(obj)/vde_kern.o $(obj)/vde_user.o - $(LD) -r -dp -o $@ $^ $(ld_flags) +vfio_uml-objs := vfio_kern.o vfio_user.o #XXX: The call below does not work because the flags are added before the # object name, so nothing from the library gets linked. @@ -38,13 +27,7 @@ obj-y := stdio_console.o fd.o chan_kern.o chan_user.o line.o obj-$(CONFIG_SSL) += ssl.o obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o -obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o -obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o -obj-$(CONFIG_UML_NET_DAEMON) += daemon.o obj-$(CONFIG_UML_NET_VECTOR) += vector.o -obj-$(CONFIG_UML_NET_VDE) += vde.o -obj-$(CONFIG_UML_NET_MCAST) += umcast.o -obj-$(CONFIG_UML_NET) += net.o obj-$(CONFIG_MCONSOLE) += mconsole.o obj-$(CONFIG_MMAPPER) += mmapper_kern.o obj-$(CONFIG_BLK_DEV_UBD) += ubd.o @@ -62,9 +45,10 @@ obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o obj-$(CONFIG_UML_RTC) += rtc.o obj-$(CONFIG_UML_PCI) += virt-pci.o obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o +obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o # pcap_user.o must be added explicitly. 
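The Kconfig changes above leave UML_NET_VECTOR as the only in-tree UML network backend: it no longer depends on the removed UML_NET switch, and its help text now points at Documentation/virt/uml/user_mode_linux_howto_v2.rst for the vector device command-line syntax. A new UML_PCI_OVER_VFIO option (selecting UML_PCI, with the matching vfio_uml-objs Makefile rule above) adds VFIO-based PCI passthrough, limited to MSI-X capable devices per its help text. A minimal config fragment enabling both, using only symbols introduced or kept by these hunks:

        CONFIG_UML_NET_VECTOR=y
        CONFIG_UML_PCI_OVER_VFIO=y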
-USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o +USER_OBJS := fd.o null.o pty.o tty.o xterm.o vector_user.o CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"' diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index e78a99816c86..26442db7d608 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -212,7 +212,7 @@ int enable_chan(struct line *line) * be permanently disabled. This is discovered in IRQ context, but * the freeing of the IRQ must be done later. */ -static DEFINE_SPINLOCK(irqs_to_free_lock); +static DEFINE_RAW_SPINLOCK(irqs_to_free_lock); static LIST_HEAD(irqs_to_free); void free_irqs(void) @@ -222,9 +222,9 @@ void free_irqs(void) struct list_head *ele; unsigned long flags; - spin_lock_irqsave(&irqs_to_free_lock, flags); + raw_spin_lock_irqsave(&irqs_to_free_lock, flags); list_splice_init(&irqs_to_free, &list); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); + raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags); list_for_each(ele, &list) { chan = list_entry(ele, struct chan, free_list); @@ -246,9 +246,9 @@ static void close_one_chan(struct chan *chan, int delay_free_irq) return; if (delay_free_irq) { - spin_lock_irqsave(&irqs_to_free_lock, flags); + raw_spin_lock_irqsave(&irqs_to_free_lock, flags); list_add(&chan->free_list, &irqs_to_free); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); + raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags); } else { if (chan->input && chan->enabled) um_free_irq(chan->line->read_irq, chan); diff --git a/arch/um/drivers/daemon.h b/arch/um/drivers/daemon.h deleted file mode 100644 index 1509cc7eb907..000000000000 --- a/arch/um/drivers/daemon.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __DAEMON_H__ -#define __DAEMON_H__ - -#include <net_user.h> - -#define SWITCH_VERSION 3 - -struct daemon_data { - char *sock_type; - char *ctl_sock; - void *ctl_addr; - void *data_addr; - void *local_addr; - int fd; - int control; - void *dev; -}; - -extern const struct net_user_info daemon_user_info; - -extern int daemon_user_write(int fd, void *buf, int len, - struct daemon_data *pri); - -#endif diff --git a/arch/um/drivers/daemon_kern.c b/arch/um/drivers/daemon_kern.c deleted file mode 100644 index afde1e82c056..000000000000 --- a/arch/um/drivers/daemon_kern.c +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include "daemon.h" - -struct daemon_init { - char *sock_type; - char *ctl_sock; -}; - -static void daemon_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct daemon_data *dpri; - struct daemon_init *init = data; - - pri = netdev_priv(dev); - dpri = (struct daemon_data *) pri->user; - dpri->sock_type = init->sock_type; - dpri->ctl_sock = init->ctl_sock; - dpri->fd = -1; - dpri->control = -1; - dpri->dev = dev; - /* We will free this pointer. If it contains crap we're burned. 
*/ - dpri->ctl_addr = NULL; - dpri->data_addr = NULL; - dpri->local_addr = NULL; - - printk("daemon backend (uml_switch version %d) - %s:%s", - SWITCH_VERSION, dpri->sock_type, dpri->ctl_sock); - printk("\n"); -} - -static int daemon_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int daemon_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return daemon_user_write(fd, skb->data, skb->len, - (struct daemon_data *) &lp->user); -} - -static const struct net_kern_info daemon_kern_info = { - .init = daemon_init, - .protocol = eth_protocol, - .read = daemon_read, - .write = daemon_write, -}; - -static int daemon_setup(char *str, char **mac_out, void *data) -{ - struct daemon_init *init = data; - char *remain; - - *init = ((struct daemon_init) - { .sock_type = "unix", - .ctl_sock = CONFIG_UML_NET_DAEMON_DEFAULT_SOCK }); - - remain = split_if_spec(str, mac_out, &init->sock_type, &init->ctl_sock, - NULL); - if (remain != NULL) - printk(KERN_WARNING "daemon_setup : Ignoring data socket " - "specification\n"); - - return 1; -} - -static struct transport daemon_transport = { - .list = LIST_HEAD_INIT(daemon_transport.list), - .name = "daemon", - .setup = daemon_setup, - .user = &daemon_user_info, - .kern = &daemon_kern_info, - .private_size = sizeof(struct daemon_data), - .setup_size = sizeof(struct daemon_init), -}; - -static int register_daemon(void) -{ - register_transport(&daemon_transport); - return 0; -} - -late_initcall(register_daemon); diff --git a/arch/um/drivers/daemon_user.c b/arch/um/drivers/daemon_user.c deleted file mode 100644 index 785baedc3555..000000000000 --- a/arch/um/drivers/daemon_user.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. 
- */ - -#include <stdint.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/time.h> -#include <sys/un.h> -#include "daemon.h" -#include <net_user.h> -#include <os.h> -#include <um_malloc.h> - -enum request_type { REQ_NEW_CONTROL }; - -#define SWITCH_MAGIC 0xfeedface - -struct request_v3 { - uint32_t magic; - uint32_t version; - enum request_type type; - struct sockaddr_un sock; -}; - -static struct sockaddr_un *new_addr(void *name, int len) -{ - struct sockaddr_un *sun; - - sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); - if (sun == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un " - "failed\n"); - return NULL; - } - sun->sun_family = AF_UNIX; - memcpy(sun->sun_path, name, len); - return sun; -} - -static int connect_to_switch(struct daemon_data *pri) -{ - struct sockaddr_un *ctl_addr = pri->ctl_addr; - struct sockaddr_un *local_addr = pri->local_addr; - struct sockaddr_un *sun; - struct request_v3 req; - int fd, n, err; - - pri->control = socket(AF_UNIX, SOCK_STREAM, 0); - if (pri->control < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : control socket failed, " - "errno = %d\n", -err); - return err; - } - - if (connect(pri->control, (struct sockaddr *) ctl_addr, - sizeof(*ctl_addr)) < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : control connect failed, " - "errno = %d\n", -err); - goto out; - } - - fd = socket(AF_UNIX, SOCK_DGRAM, 0); - if (fd < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : data socket failed, " - "errno = %d\n", -err); - goto out; - } - if (bind(fd, (struct sockaddr *) local_addr, sizeof(*local_addr)) < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : data bind failed, " - "errno = %d\n", -err); - goto out_close; - } - - sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); - if (sun == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un " - "failed\n"); - err = -ENOMEM; - goto out_close; - } - - req.magic = SWITCH_MAGIC; - req.version = SWITCH_VERSION; - req.type = REQ_NEW_CONTROL; - req.sock = *local_addr; - n = write(pri->control, &req, sizeof(req)); - if (n != sizeof(req)) { - printk(UM_KERN_ERR "daemon_open : control setup request " - "failed, err = %d\n", -errno); - err = -ENOTCONN; - goto out_free; - } - - n = read(pri->control, sun, sizeof(*sun)); - if (n != sizeof(*sun)) { - printk(UM_KERN_ERR "daemon_open : read of data socket failed, " - "err = %d\n", -errno); - err = -ENOTCONN; - goto out_free; - } - - pri->data_addr = sun; - return fd; - - out_free: - kfree(sun); - out_close: - close(fd); - out: - close(pri->control); - return err; -} - -static int daemon_user_init(void *data, void *dev) -{ - struct daemon_data *pri = data; - struct timeval tv; - struct { - char zero; - int pid; - int usecs; - } name; - - if (!strcmp(pri->sock_type, "unix")) - pri->ctl_addr = new_addr(pri->ctl_sock, - strlen(pri->ctl_sock) + 1); - name.zero = 0; - name.pid = os_getpid(); - gettimeofday(&tv, NULL); - name.usecs = tv.tv_usec; - pri->local_addr = new_addr(&name, sizeof(name)); - pri->dev = dev; - pri->fd = connect_to_switch(pri); - if (pri->fd < 0) { - kfree(pri->local_addr); - pri->local_addr = NULL; - return pri->fd; - } - - return 0; -} - -static int daemon_open(void *data) -{ - struct daemon_data *pri = data; - return pri->fd; -} - -static void daemon_remove(void *data) -{ - struct daemon_data *pri = data; - - close(pri->fd); - pri->fd = -1; - close(pri->control); - pri->control = -1; - - 
kfree(pri->data_addr); - pri->data_addr = NULL; - kfree(pri->ctl_addr); - pri->ctl_addr = NULL; - kfree(pri->local_addr); - pri->local_addr = NULL; -} - -int daemon_user_write(int fd, void *buf, int len, struct daemon_data *pri) -{ - struct sockaddr_un *data_addr = pri->data_addr; - - return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)); -} - -const struct net_user_info daemon_user_info = { - .init = daemon_user_init, - .open = daemon_open, - .close = NULL, - .remove = daemon_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c deleted file mode 100644 index d5a9c5aabaec..000000000000 --- a/arch/um/drivers/net_kern.c +++ /dev/null @@ -1,889 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include <linux/memblock.h> -#include <linux/etherdevice.h> -#include <linux/ethtool.h> -#include <linux/inetdevice.h> -#include <linux/init.h> -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/platform_device.h> -#include <linux/rtnetlink.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <init.h> -#include <irq_kern.h> -#include <irq_user.h> -#include "mconsole_kern.h" -#include <net_kern.h> -#include <net_user.h> - -#define DRIVER_NAME "uml-netdev" - -static DEFINE_SPINLOCK(opened_lock); -static LIST_HEAD(opened); - -/* - * The drop_skb is used when we can't allocate an skb. The - * packet is read into drop_skb in order to get the data off the - * connection to the host. - * It is reallocated whenever a maximum packet size is seen which is - * larger than any seen before. update_drop_skb is called from - * eth_configure when a new interface is added. - */ -static DEFINE_SPINLOCK(drop_lock); -static struct sk_buff *drop_skb; -static int drop_max; - -static int update_drop_skb(int max) -{ - struct sk_buff *new; - unsigned long flags; - int err = 0; - - spin_lock_irqsave(&drop_lock, flags); - - if (max <= drop_max) - goto out; - - err = -ENOMEM; - new = dev_alloc_skb(max); - if (new == NULL) - goto out; - - skb_put(new, max); - - kfree_skb(drop_skb); - drop_skb = new; - drop_max = max; - err = 0; -out: - spin_unlock_irqrestore(&drop_lock, flags); - - return err; -} - -static int uml_net_rx(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - int pkt_len; - struct sk_buff *skb; - - /* If we can't allocate memory, try again next round. */ - skb = dev_alloc_skb(lp->max_packet); - if (skb == NULL) { - drop_skb->dev = dev; - /* Read a packet into drop_skb and don't do anything with it. 
*/ - (*lp->read)(lp->fd, drop_skb, lp); - dev->stats.rx_dropped++; - return 0; - } - - skb->dev = dev; - skb_put(skb, lp->max_packet); - skb_reset_mac_header(skb); - pkt_len = (*lp->read)(lp->fd, skb, lp); - - if (pkt_len > 0) { - skb_trim(skb, pkt_len); - skb->protocol = (*lp->protocol)(skb); - - dev->stats.rx_bytes += skb->len; - dev->stats.rx_packets++; - netif_rx(skb); - return pkt_len; - } - - kfree_skb(skb); - return pkt_len; -} - -static void uml_dev_close(struct work_struct *work) -{ - struct uml_net_private *lp = - container_of(work, struct uml_net_private, work); - dev_close(lp->dev); -} - -static irqreturn_t uml_net_interrupt(int irq, void *dev_id) -{ - struct net_device *dev = dev_id; - struct uml_net_private *lp = netdev_priv(dev); - int err; - - if (!netif_running(dev)) - return IRQ_NONE; - - spin_lock(&lp->lock); - while ((err = uml_net_rx(dev)) > 0) ; - if (err < 0) { - printk(KERN_ERR - "Device '%s' read returned %d, shutting it down\n", - dev->name, err); - /* dev_close can't be called in interrupt context, and takes - * again lp->lock. - * And dev_close() can be safely called multiple times on the - * same device, since it tests for (dev->flags & IFF_UP). So - * there's no harm in delaying the device shutdown. - * Furthermore, the workqueue will not re-enqueue an already - * enqueued work item. */ - schedule_work(&lp->work); - goto out; - } -out: - spin_unlock(&lp->lock); - return IRQ_HANDLED; -} - -static int uml_net_open(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - int err; - - if (lp->fd >= 0) { - err = -ENXIO; - goto out; - } - - lp->fd = (*lp->open)(&lp->user); - if (lp->fd < 0) { - err = lp->fd; - goto out; - } - - err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt, - IRQF_SHARED, dev->name, dev); - if (err < 0) { - printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err); - err = -ENETUNREACH; - goto out_close; - } - - netif_start_queue(dev); - - /* clear buffer - it can happen that the host side of the interface - * is full when we get here. In this case, new data is never queued, - * SIGIOs never arrive, and the net never works. 
- */ - while ((err = uml_net_rx(dev)) > 0) ; - - spin_lock(&opened_lock); - list_add(&lp->list, &opened); - spin_unlock(&opened_lock); - - return 0; -out_close: - if (lp->close != NULL) (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; -out: - return err; -} - -static int uml_net_close(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - - netif_stop_queue(dev); - - um_free_irq(dev->irq, dev); - if (lp->close != NULL) - (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; - - spin_lock(&opened_lock); - list_del(&lp->list); - spin_unlock(&opened_lock); - - return 0; -} - -static netdev_tx_t uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - unsigned long flags; - int len; - - netif_stop_queue(dev); - - spin_lock_irqsave(&lp->lock, flags); - - len = (*lp->write)(lp->fd, skb, lp); - skb_tx_timestamp(skb); - - if (len == skb->len) { - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - netif_trans_update(dev); - netif_start_queue(dev); - - /* this is normally done in the interrupt when tx finishes */ - netif_wake_queue(dev); - } - else if (len == 0) { - netif_start_queue(dev); - dev->stats.tx_dropped++; - } - else { - netif_start_queue(dev); - printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len); - } - - spin_unlock_irqrestore(&lp->lock, flags); - - dev_consume_skb_any(skb); - - return NETDEV_TX_OK; -} - -static void uml_net_set_multicast_list(struct net_device *dev) -{ - return; -} - -static void uml_net_tx_timeout(struct net_device *dev, unsigned int txqueue) -{ - netif_trans_update(dev); - netif_wake_queue(dev); -} - -#ifdef CONFIG_NET_POLL_CONTROLLER -static void uml_net_poll_controller(struct net_device *dev) -{ - disable_irq(dev->irq); - uml_net_interrupt(dev->irq, dev); - enable_irq(dev->irq); -} -#endif - -static void uml_net_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) -{ - strscpy(info->driver, DRIVER_NAME); -} - -static const struct ethtool_ops uml_net_ethtool_ops = { - .get_drvinfo = uml_net_get_drvinfo, - .get_link = ethtool_op_get_link, - .get_ts_info = ethtool_op_get_ts_info, -}; - -void uml_net_setup_etheraddr(struct net_device *dev, char *str) -{ - u8 addr[ETH_ALEN]; - char *end; - int i; - - if (str == NULL) - goto random; - - for (i = 0; i < 6; i++) { - addr[i] = simple_strtoul(str, &end, 16); - if ((end == str) || - ((*end != ':') && (*end != ',') && (*end != '\0'))) { - printk(KERN_ERR - "setup_etheraddr: failed to parse '%s' " - "as an ethernet address\n", str); - goto random; - } - str = end + 1; - } - if (is_multicast_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign a multicast ethernet address to a " - "device disallowed\n"); - goto random; - } - if (!is_valid_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign an invalid ethernet address to a " - "device disallowed\n"); - goto random; - } - if (!is_local_ether_addr(addr)) { - printk(KERN_WARNING - "Warning: Assigning a globally valid ethernet " - "address to a device\n"); - printk(KERN_WARNING "You should set the 2nd rightmost bit in " - "the first byte of the MAC,\n"); - printk(KERN_WARNING "i.e. 
%02x:%02x:%02x:%02x:%02x:%02x\n", - addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], - addr[5]); - } - eth_hw_addr_set(dev, addr); - return; - -random: - printk(KERN_INFO - "Choosing a random ethernet address for device %s\n", dev->name); - eth_hw_addr_random(dev); -} - -static DEFINE_SPINLOCK(devices_lock); -static LIST_HEAD(devices); - -static struct platform_driver uml_net_driver = { - .driver = { - .name = DRIVER_NAME, - }, -}; - -static void net_device_release(struct device *dev) -{ - struct uml_net *device = container_of(dev, struct uml_net, pdev.dev); - struct net_device *netdev = device->dev; - struct uml_net_private *lp = netdev_priv(netdev); - - if (lp->remove != NULL) - (*lp->remove)(&lp->user); - list_del(&device->list); - kfree(device); - free_netdev(netdev); -} - -static const struct net_device_ops uml_netdev_ops = { - .ndo_open = uml_net_open, - .ndo_stop = uml_net_close, - .ndo_start_xmit = uml_net_start_xmit, - .ndo_set_rx_mode = uml_net_set_multicast_list, - .ndo_tx_timeout = uml_net_tx_timeout, - .ndo_set_mac_address = eth_mac_addr, - .ndo_validate_addr = eth_validate_addr, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = uml_net_poll_controller, -#endif -}; - -/* - * Ensures that platform_driver_register is called only once by - * eth_configure. Will be set in an initcall. - */ -static int driver_registered; - -static void eth_configure(int n, void *init, char *mac, - struct transport *transport, gfp_t gfp_mask) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - int err, size; - - size = transport->private_size + sizeof(struct uml_net_private); - - device = kzalloc(sizeof(*device), gfp_mask); - if (device == NULL) { - printk(KERN_ERR "eth_configure failed to allocate struct " - "uml_net\n"); - return; - } - - dev = alloc_etherdev(size); - if (dev == NULL) { - printk(KERN_ERR "eth_configure: failed to allocate struct " - "net_device for eth%d\n", n); - goto out_free_device; - } - - INIT_LIST_HEAD(&device->list); - device->index = n; - - /* If this name ends up conflicting with an existing registered - * netdevice, that is OK, register_netdev{,ice}() will notice this - * and fail. - */ - snprintf(dev->name, sizeof(dev->name), "eth%d", n); - - uml_net_setup_etheraddr(dev, mac); - - printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr); - - lp = netdev_priv(dev); - /* This points to the transport private data. It's still clear, but we - * must memset it to 0 *now*. Let's help the drivers. */ - memset(lp, 0, size); - INIT_WORK(&lp->work, uml_dev_close); - - /* sysfs register */ - if (!driver_registered) { - platform_driver_register(¨_net_driver); - driver_registered = 1; - } - device->pdev.id = n; - device->pdev.name = DRIVER_NAME; - device->pdev.dev.release = net_device_release; - dev_set_drvdata(&device->pdev.dev, device); - if (platform_device_register(&device->pdev)) - goto out_free_netdev; - SET_NETDEV_DEV(dev,&device->pdev.dev); - - device->dev = dev; - - /* - * These just fill in a data structure, so there's no failure - * to be worried about. 
- */ - (*transport->kern->init)(dev, init); - - *lp = ((struct uml_net_private) - { .list = LIST_HEAD_INIT(lp->list), - .dev = dev, - .fd = -1, - .mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0}, - .max_packet = transport->user->max_packet, - .protocol = transport->kern->protocol, - .open = transport->user->open, - .close = transport->user->close, - .remove = transport->user->remove, - .read = transport->kern->read, - .write = transport->kern->write, - .add_address = transport->user->add_address, - .delete_address = transport->user->delete_address }); - - spin_lock_init(&lp->lock); - memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac)); - - if ((transport->user->init != NULL) && - ((*transport->user->init)(&lp->user, dev) != 0)) - goto out_unregister; - - dev->mtu = transport->user->mtu; - dev->netdev_ops = ¨_netdev_ops; - dev->ethtool_ops = ¨_net_ethtool_ops; - dev->watchdog_timeo = (HZ >> 1); - dev->irq = UM_ETH_IRQ; - - err = update_drop_skb(lp->max_packet); - if (err) - goto out_undo_user_init; - - rtnl_lock(); - err = register_netdevice(dev); - rtnl_unlock(); - if (err) - goto out_undo_user_init; - - spin_lock(&devices_lock); - list_add(&device->list, &devices); - spin_unlock(&devices_lock); - - return; - -out_undo_user_init: - if (transport->user->remove != NULL) - (*transport->user->remove)(&lp->user); -out_unregister: - platform_device_unregister(&device->pdev); - return; /* platform_device_unregister frees dev and device */ -out_free_netdev: - free_netdev(dev); -out_free_device: - kfree(device); -} - -static struct uml_net *find_device(int n) -{ - struct uml_net *device; - struct list_head *ele; - - spin_lock(&devices_lock); - list_for_each(ele, &devices) { - device = list_entry(ele, struct uml_net, list); - if (device->index == n) - goto out; - } - device = NULL; - out: - spin_unlock(&devices_lock); - return device; -} - -static int eth_parse(char *str, int *index_out, char **str_out, - char **error_out) -{ - char *end; - int n, err = -EINVAL; - - n = simple_strtoul(str, &end, 0); - if (end == str) { - *error_out = "Bad device number"; - return err; - } - - str = end; - if (*str != '=') { - *error_out = "Expected '=' after device number"; - return err; - } - - str++; - if (find_device(n)) { - *error_out = "Device already configured"; - return err; - } - - *index_out = n; - *str_out = str; - return 0; -} - -struct eth_init { - struct list_head list; - char *init; - int index; -}; - -static DEFINE_SPINLOCK(transports_lock); -static LIST_HEAD(transports); - -/* Filled in during early boot */ -static LIST_HEAD(eth_cmd_line); - -static int check_transport(struct transport *transport, char *eth, int n, - void **init_out, char **mac_out, gfp_t gfp_mask) -{ - int len; - - len = strlen(transport->name); - if (strncmp(eth, transport->name, len)) - return 0; - - eth += len; - if (*eth == ',') - eth++; - else if (*eth != '\0') - return 0; - - *init_out = kmalloc(transport->setup_size, gfp_mask); - if (*init_out == NULL) - return 1; - - if (!transport->setup(eth, mac_out, *init_out)) { - kfree(*init_out); - *init_out = NULL; - } - return 1; -} - -void register_transport(struct transport *new) -{ - struct list_head *ele, *next; - struct eth_init *eth; - void *init; - char *mac = NULL; - int match; - - spin_lock(&transports_lock); - BUG_ON(!list_empty(&new->list)); - list_add(&new->list, &transports); - spin_unlock(&transports_lock); - - list_for_each_safe(ele, next, ð_cmd_line) { - eth = list_entry(ele, struct eth_init, list); - match = check_transport(new, eth->init, eth->index, &init, - &mac, 
GFP_KERNEL); - if (!match) - continue; - else if (init != NULL) { - eth_configure(eth->index, init, mac, new, GFP_KERNEL); - kfree(init); - } - list_del(ð->list); - } -} - -static int eth_setup_common(char *str, int index) -{ - struct list_head *ele; - struct transport *transport; - void *init; - char *mac = NULL; - int found = 0; - - spin_lock(&transports_lock); - list_for_each(ele, &transports) { - transport = list_entry(ele, struct transport, list); - if (!check_transport(transport, str, index, &init, - &mac, GFP_ATOMIC)) - continue; - if (init != NULL) { - eth_configure(index, init, mac, transport, GFP_ATOMIC); - kfree(init); - } - found = 1; - break; - } - - spin_unlock(&transports_lock); - return found; -} - -static int __init eth_setup(char *str) -{ - struct eth_init *new; - char *error; - int n, err; - - err = eth_parse(str, &n, &str, &error); - if (err) { - printk(KERN_ERR "eth_setup - Couldn't parse '%s' : %s\n", - str, error); - return 1; - } - - new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); - - INIT_LIST_HEAD(&new->list); - new->index = n; - new->init = str; - - list_add_tail(&new->list, ð_cmd_line); - return 1; -} - -__setup("eth", eth_setup); -__uml_help(eth_setup, -"eth[0-9]+=<transport>,<options>\n" -" Configure a network device.\n\n" -); - -static int net_config(char *str, char **error_out) -{ - int n, err; - - err = eth_parse(str, &n, &str, error_out); - if (err) - return err; - - /* This string is broken up and the pieces used by the underlying - * driver. So, it is freed only if eth_setup_common fails. - */ - str = kstrdup(str, GFP_KERNEL); - if (str == NULL) { - *error_out = "net_config failed to strdup string"; - return -ENOMEM; - } - err = !eth_setup_common(str, n); - if (err) - kfree(str); - return err; -} - -static int net_id(char **str, int *start_out, int *end_out) -{ - char *end; - int n; - - n = simple_strtoul(*str, &end, 0); - if ((*end != '\0') || (end == *str)) - return -1; - - *start_out = n; - *end_out = n; - *str = end; - return n; -} - -static int net_remove(int n, char **error_out) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - - device = find_device(n); - if (device == NULL) - return -ENODEV; - - dev = device->dev; - lp = netdev_priv(dev); - if (lp->fd > 0) - return -EBUSY; - unregister_netdev(dev); - platform_device_unregister(&device->pdev); - - return 0; -} - -static struct mc_device net_mc = { - .list = LIST_HEAD_INIT(net_mc.list), - .name = "eth", - .config = net_config, - .get_config = NULL, - .id = net_id, - .remove = net_remove, -}; - -#ifdef CONFIG_INET -static int uml_inetaddr_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct in_ifaddr *ifa = ptr; - struct net_device *dev = ifa->ifa_dev->dev; - struct uml_net_private *lp; - void (*proc)(unsigned char *, unsigned char *, void *); - unsigned char addr_buf[4], netmask_buf[4]; - - if (dev->netdev_ops->ndo_open != uml_net_open) - return NOTIFY_DONE; - - lp = netdev_priv(dev); - - proc = NULL; - switch (event) { - case NETDEV_UP: - proc = lp->add_address; - break; - case NETDEV_DOWN: - proc = lp->delete_address; - break; - } - if (proc != NULL) { - memcpy(addr_buf, &ifa->ifa_address, sizeof(addr_buf)); - memcpy(netmask_buf, &ifa->ifa_mask, sizeof(netmask_buf)); - (*proc)(addr_buf, netmask_buf, &lp->user); - } - return NOTIFY_DONE; -} - -/* uml_net_init shouldn't be called twice on two CPUs at the same time */ -static struct notifier_block uml_inetaddr_notifier = { - .notifier_call = uml_inetaddr_event, -}; - 
-static void inet_register(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - struct in_device *ip; - struct in_ifaddr *in; - - register_inetaddr_notifier(¨_inetaddr_notifier); - - /* Devices may have been opened already, so the uml_inetaddr_notifier - * didn't get a chance to run for them. This fakes it so that - * addresses which have already been set up get handled properly. - */ - spin_lock(&opened_lock); - list_for_each(ele, &opened) { - lp = list_entry(ele, struct uml_net_private, list); - ip = lp->dev->ip_ptr; - if (ip == NULL) - continue; - in = ip->ifa_list; - while (in != NULL) { - uml_inetaddr_event(NULL, NETDEV_UP, in); - in = in->ifa_next; - } - } - spin_unlock(&opened_lock); -} -#else -static inline void inet_register(void) -{ -} -#endif - -static int uml_net_init(void) -{ - mconsole_register_dev(&net_mc); - inet_register(); - return 0; -} - -__initcall(uml_net_init); - -static void close_devices(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - - spin_lock(&opened_lock); - list_for_each(ele, &opened) { - lp = list_entry(ele, struct uml_net_private, list); - um_free_irq(lp->dev->irq, lp->dev); - if ((lp->close != NULL) && (lp->fd >= 0)) - (*lp->close)(lp->fd, &lp->user); - if (lp->remove != NULL) - (*lp->remove)(&lp->user); - } - spin_unlock(&opened_lock); -} - -__uml_exitcall(close_devices); - -void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *, - void *), - void *arg) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - unsigned char address[4], netmask[4]; - - if (ip == NULL) return; - in = ip->ifa_list; - while (in != NULL) { - memcpy(address, &in->ifa_address, sizeof(address)); - memcpy(netmask, &in->ifa_mask, sizeof(netmask)); - (*cb)(address, netmask, arg); - in = in->ifa_next; - } -} - -int dev_netmask(void *d, void *m) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - __be32 *mask_out = m; - - if (ip == NULL) - return 1; - - in = ip->ifa_list; - if (in == NULL) - return 1; - - *mask_out = in->ifa_mask; - return 0; -} - -void *get_output_buffer(int *len_out) -{ - void *ret; - - ret = (void *) __get_free_pages(GFP_KERNEL, 0); - if (ret) *len_out = PAGE_SIZE; - else *len_out = 0; - return ret; -} - -void free_output_buffer(void *buffer) -{ - free_pages((unsigned long) buffer, 0); -} - -int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, - char **gate_addr) -{ - char *remain; - - remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL); - if (remain != NULL) { - printk(KERN_ERR "tap_setup_common - Extra garbage on " - "specification : '%s'\n", remain); - return 1; - } - - return 0; -} - -unsigned short eth_protocol(struct sk_buff *skb) -{ - return eth_type_trans(skb, skb->dev); -} diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c deleted file mode 100644 index 4c9576452ab0..000000000000 --- a/arch/um/drivers/net_user.c +++ /dev/null @@ -1,271 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <stdio.h> -#include <unistd.h> -#include <stdarg.h> -#include <errno.h> -#include <stddef.h> -#include <string.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include <net_user.h> -#include <os.h> -#include <um_malloc.h> - -int tap_open_common(void *dev, char *gate_addr) -{ - int tap_addr[4]; - - if (gate_addr == NULL) - return 0; - if (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], - &tap_addr[1], 
&tap_addr[2], &tap_addr[3]) != 4) { - printk(UM_KERN_ERR "Invalid tap IP address - '%s'\n", - gate_addr); - return -EINVAL; - } - return 0; -} - -void tap_check_ips(char *gate_addr, unsigned char *eth_addr) -{ - int tap_addr[4]; - - if ((gate_addr != NULL) && - (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], - &tap_addr[1], &tap_addr[2], &tap_addr[3]) == 4) && - (eth_addr[0] == tap_addr[0]) && - (eth_addr[1] == tap_addr[1]) && - (eth_addr[2] == tap_addr[2]) && - (eth_addr[3] == tap_addr[3])) { - printk(UM_KERN_ERR "The tap IP address and the UML eth IP " - "address must be different\n"); - } -} - -/* Do reliable error handling as this fails frequently enough. */ -void read_output(int fd, char *output, int len) -{ - int remain, ret, expected; - char c; - char *str; - - if (output == NULL) { - output = &c; - len = sizeof(c); - } - - *output = '\0'; - ret = read(fd, &remain, sizeof(remain)); - - if (ret != sizeof(remain)) { - if (ret < 0) - ret = -errno; - expected = sizeof(remain); - str = "length"; - goto err; - } - - while (remain != 0) { - expected = (remain < len) ? remain : len; - ret = read(fd, output, expected); - if (ret != expected) { - if (ret < 0) - ret = -errno; - str = "data"; - goto err; - } - remain -= ret; - } - - return; - -err: - if (ret < 0) - printk(UM_KERN_ERR "read_output - read of %s failed, " - "errno = %d\n", str, -ret); - else - printk(UM_KERN_ERR "read_output - read of %s failed, read only " - "%d of %d bytes\n", str, ret, expected); -} - -int net_read(int fd, void *buf, int len) -{ - int n; - - n = read(fd, buf, len); - - if ((n < 0) && (errno == EAGAIN)) - return 0; - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_recvfrom(int fd, void *buf, int len) -{ - int n; - - CATCH_EINTR(n = recvfrom(fd, buf, len, 0, NULL, NULL)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_write(int fd, void *buf, int len) -{ - int n; - - n = write(fd, buf, len); - - if ((n < 0) && (errno == EAGAIN)) - return 0; - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_send(int fd, void *buf, int len) -{ - int n; - - CATCH_EINTR(n = send(fd, buf, len, 0)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_sendto(int fd, void *buf, int len, void *to, int sock_len) -{ - int n; - - CATCH_EINTR(n = sendto(fd, buf, len, 0, (struct sockaddr *) to, - sock_len)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -struct change_pre_exec_data { - int close_me; - int stdout_fd; -}; - -static void change_pre_exec(void *arg) -{ - struct change_pre_exec_data *data = arg; - - close(data->close_me); - dup2(data->stdout_fd, 1); -} - -static int change_tramp(char **argv, char *output, int output_len) -{ - int pid, fds[2], err; - struct change_pre_exec_data pe_data; - - err = os_pipe(fds, 1, 0); - if (err < 0) { - printk(UM_KERN_ERR "change_tramp - pipe failed, err = %d\n", - -err); - return err; - } - pe_data.close_me = fds[0]; - pe_data.stdout_fd = fds[1]; - pid = run_helper(change_pre_exec, &pe_data, argv); - - if (pid > 0) /* Avoid hang as we won't get data in failure case. 
*/ - read_output(fds[0], output, output_len); - - close(fds[0]); - close(fds[1]); - - if (pid > 0) - helper_wait(pid); - return pid; -} - -static void change(char *dev, char *what, unsigned char *addr, - unsigned char *netmask) -{ - char addr_buf[sizeof("255.255.255.255\0")]; - char netmask_buf[sizeof("255.255.255.255\0")]; - char version[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version, what, dev, addr_buf, - netmask_buf, NULL }; - char *output; - int output_len, pid; - - sprintf(version, "%d", UML_NET_VERSION); - sprintf(addr_buf, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]); - sprintf(netmask_buf, "%d.%d.%d.%d", netmask[0], netmask[1], - netmask[2], netmask[3]); - - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "change : failed to allocate output " - "buffer\n"); - - pid = change_tramp(argv, output, output_len); - if (pid < 0) { - kfree(output); - return; - } - - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -void open_addr(unsigned char *addr, unsigned char *netmask, void *arg) -{ - change(arg, "add", addr, netmask); -} - -void close_addr(unsigned char *addr, unsigned char *netmask, void *arg) -{ - change(arg, "del", addr, netmask); -} - -char *split_if_spec(char *str, ...) -{ - char **arg, *end, *ret = NULL; - va_list ap; - - va_start(ap, str); - while ((arg = va_arg(ap, char **)) != NULL) { - if (*str == '\0') - goto out; - end = strchr(str, ','); - if (end != str) - *arg = str; - if (end == NULL) - goto out; - *end++ = '\0'; - str = end; - } - ret = str; -out: - va_end(ap); - return ret; -} diff --git a/arch/um/drivers/slip.h b/arch/um/drivers/slip.h deleted file mode 100644 index 0f3b7ca99465..000000000000 --- a/arch/um/drivers/slip.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SLIP_H -#define __UM_SLIP_H - -#include "slip_common.h" - -struct slip_data { - void *dev; - char name[sizeof("slnnnnn\0")]; - char *addr; - char *gate_addr; - int slave; - struct slip_proto slip; -}; - -extern const struct net_user_info slip_user_info; - -extern int slip_user_read(int fd, void *buf, int len, struct slip_data *pri); -extern int slip_user_write(int fd, void *buf, int len, struct slip_data *pri); - -#endif diff --git a/arch/um/drivers/slip_common.c b/arch/um/drivers/slip_common.c deleted file mode 100644 index 20fe4f42743d..000000000000 --- a/arch/um/drivers/slip_common.c +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <string.h> -#include "slip_common.h" -#include <net_user.h> - -int slip_proto_read(int fd, void *buf, int len, struct slip_proto *slip) -{ - int i, n, size, start; - - if(slip->more > 0){ - i = 0; - while(i < slip->more){ - size = slip_unesc(slip->ibuf[i++], slip->ibuf, - &slip->pos, &slip->esc); - if(size){ - memcpy(buf, slip->ibuf, size); - memmove(slip->ibuf, &slip->ibuf[i], - slip->more - i); - slip->more = slip->more - i; - return size; - } - } - slip->more = 0; - } - - n = net_read(fd, &slip->ibuf[slip->pos], - sizeof(slip->ibuf) - slip->pos); - if(n <= 0) - return n; - - start = slip->pos; - for(i = 0; i < n; i++){ - size = slip_unesc(slip->ibuf[start + i], slip->ibuf,&slip->pos, - &slip->esc); - if(size){ - memcpy(buf, slip->ibuf, size); - memmove(slip->ibuf, &slip->ibuf[start+i+1], - n - (i + 1)); - slip->more = n - (i + 1); - return size; - } - } - return 0; -} - -int slip_proto_write(int fd, void *buf, int len, struct slip_proto *slip) -{ - int actual, n; - - actual = 
slip_esc(buf, slip->obuf, len); - n = net_write(fd, slip->obuf, actual); - if(n < 0) - return n; - else return len; -} diff --git a/arch/um/drivers/slip_common.h b/arch/um/drivers/slip_common.h deleted file mode 100644 index d3798b5caf7f..000000000000 --- a/arch/um/drivers/slip_common.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SLIP_COMMON_H -#define __UM_SLIP_COMMON_H - -#define BUF_SIZE 1500 - /* two bytes each for a (pathological) max packet of escaped chars + * - * terminating END char + initial END char */ -#define ENC_BUF_SIZE (2 * BUF_SIZE + 2) - -/* SLIP protocol characters. */ -#define SLIP_END 0300 /* indicates end of frame */ -#define SLIP_ESC 0333 /* indicates byte stuffing */ -#define SLIP_ESC_END 0334 /* ESC ESC_END means END 'data' */ -#define SLIP_ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ - -static inline int slip_unesc(unsigned char c, unsigned char *buf, int *pos, - int *esc) -{ - int ret; - - switch(c){ - case SLIP_END: - *esc = 0; - ret=*pos; - *pos=0; - return(ret); - case SLIP_ESC: - *esc = 1; - return(0); - case SLIP_ESC_ESC: - if(*esc){ - *esc = 0; - c = SLIP_ESC; - } - break; - case SLIP_ESC_END: - if(*esc){ - *esc = 0; - c = SLIP_END; - } - break; - } - buf[(*pos)++] = c; - return(0); -} - -static inline int slip_esc(unsigned char *s, unsigned char *d, int len) -{ - unsigned char *ptr = d; - unsigned char c; - - /* - * Send an initial END character to flush out any - * data that may have accumulated in the receiver - * due to line noise. - */ - - *ptr++ = SLIP_END; - - /* - * For each byte in the packet, send the appropriate - * character sequence, according to the SLIP protocol. - */ - - while (len-- > 0) { - switch(c = *s++) { - case SLIP_END: - *ptr++ = SLIP_ESC; - *ptr++ = SLIP_ESC_END; - break; - case SLIP_ESC: - *ptr++ = SLIP_ESC; - *ptr++ = SLIP_ESC_ESC; - break; - default: - *ptr++ = c; - break; - } - } - *ptr++ = SLIP_END; - return (ptr - d); -} - -struct slip_proto { - unsigned char ibuf[ENC_BUF_SIZE]; - unsigned char obuf[ENC_BUF_SIZE]; - int more; /* more data: do not read fd until ibuf has been drained */ - int pos; - int esc; -}; - -static inline void slip_proto_init(struct slip_proto * slip) -{ - memset(slip->ibuf, 0, sizeof(slip->ibuf)); - memset(slip->obuf, 0, sizeof(slip->obuf)); - slip->more = 0; - slip->pos = 0; - slip->esc = 0; -} - -extern int slip_proto_read(int fd, void *buf, int len, - struct slip_proto *slip); -extern int slip_proto_write(int fd, void *buf, int len, - struct slip_proto *slip); - -#endif diff --git a/arch/um/drivers/slip_kern.c b/arch/um/drivers/slip_kern.c deleted file mode 100644 index c58ccdcc16d6..000000000000 --- a/arch/um/drivers/slip_kern.c +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include "slip.h" - -struct slip_init { - char *gate_addr; -}; - -static void slip_init(struct net_device *dev, void *data) -{ - struct uml_net_private *private; - struct slip_data *spri; - struct slip_init *init = data; - - private = netdev_priv(dev); - spri = (struct slip_data *) private->user; - - memset(spri->name, 0, sizeof(spri->name)); - spri->addr = NULL; - spri->gate_addr = init->gate_addr; - spri->slave = -1; - spri->dev = dev; - - slip_proto_init(&spri->slip); - - dev->hard_header_len = 0; - dev->header_ops = NULL; - dev->addr_len = 0; - dev->type = ARPHRD_SLIP; - 
dev->tx_queue_len = 256; - dev->flags = IFF_NOARP; - printk("SLIP backend - SLIP IP = %s\n", spri->gate_addr); -} - -static unsigned short slip_protocol(struct sk_buff *skbuff) -{ - return htons(ETH_P_IP); -} - -static int slip_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slip_user_read(fd, skb_mac_header(skb), skb->dev->mtu, - (struct slip_data *) &lp->user); -} - -static int slip_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slip_user_write(fd, skb->data, skb->len, - (struct slip_data *) &lp->user); -} - -static const struct net_kern_info slip_kern_info = { - .init = slip_init, - .protocol = slip_protocol, - .read = slip_read, - .write = slip_write, -}; - -static int slip_setup(char *str, char **mac_out, void *data) -{ - struct slip_init *init = data; - - *init = ((struct slip_init) { .gate_addr = NULL }); - - if (str[0] != '\0') - init->gate_addr = str; - return 1; -} - -static struct transport slip_transport = { - .list = LIST_HEAD_INIT(slip_transport.list), - .name = "slip", - .setup = slip_setup, - .user = &slip_user_info, - .kern = &slip_kern_info, - .private_size = sizeof(struct slip_data), - .setup_size = sizeof(struct slip_init), -}; - -static int register_slip(void) -{ - register_transport(&slip_transport); - return 0; -} - -late_initcall(register_slip); diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c deleted file mode 100644 index 7334019c9e60..000000000000 --- a/arch/um/drivers/slip_user.c +++ /dev/null @@ -1,252 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <string.h> -#include <termios.h> -#include <sys/wait.h> -#include <net_user.h> -#include <os.h> -#include "slip.h" -#include <um_malloc.h> - -static int slip_user_init(void *data, void *dev) -{ - struct slip_data *pri = data; - - pri->dev = dev; - return 0; -} - -static int set_up_tty(int fd) -{ - int i; - struct termios tios; - - if (tcgetattr(fd, &tios) < 0) { - printk(UM_KERN_ERR "could not get initial terminal " - "attributes\n"); - return -1; - } - - tios.c_cflag = CS8 | CREAD | HUPCL | CLOCAL; - tios.c_iflag = IGNBRK | IGNPAR; - tios.c_oflag = 0; - tios.c_lflag = 0; - for (i = 0; i < NCCS; i++) - tios.c_cc[i] = 0; - tios.c_cc[VMIN] = 1; - tios.c_cc[VTIME] = 0; - - cfsetospeed(&tios, B38400); - cfsetispeed(&tios, B38400); - - if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) { - printk(UM_KERN_ERR "failed to set terminal attributes\n"); - return -1; - } - return 0; -} - -struct slip_pre_exec_data { - int stdin_fd; - int stdout_fd; - int close_me; -}; - -static void slip_pre_exec(void *arg) -{ - struct slip_pre_exec_data *data = arg; - - if (data->stdin_fd >= 0) - dup2(data->stdin_fd, 0); - dup2(data->stdout_fd, 1); - if (data->close_me >= 0) - close(data->close_me); -} - -static int slip_tramp(char **argv, int fd) -{ - struct slip_pre_exec_data pe_data; - char *output; - int pid, fds[2], err, output_len; - - err = os_pipe(fds, 1, 0); - if (err < 0) { - printk(UM_KERN_ERR "slip_tramp : pipe failed, err = %d\n", - -err); - goto out; - } - - err = 0; - pe_data.stdin_fd = fd; - pe_data.stdout_fd = fds[1]; - pe_data.close_me = fds[0]; - err = run_helper(slip_pre_exec, &pe_data, argv); - if (err < 0) - goto out_close; - pid = err; - - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - if (output == NULL) { - printk(UM_KERN_ERR 
"slip_tramp : failed to allocate output " - "buffer\n"); - os_kill_process(pid, 1); - err = -ENOMEM; - goto out_close; - } - - close(fds[1]); - read_output(fds[0], output, output_len); - printk("%s", output); - - err = helper_wait(pid); - close(fds[0]); - - kfree(output); - return err; - -out_close: - close(fds[0]); - close(fds[1]); -out: - return err; -} - -static int slip_open(void *data) -{ - struct slip_data *pri = data; - char version_buf[sizeof("nnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *argv[] = { "uml_net", version_buf, "slip", "up", gate_buf, - NULL }; - int sfd, mfd, err; - - err = get_pty(); - if (err < 0) { - printk(UM_KERN_ERR "slip-open : Failed to open pty, err = %d\n", - -err); - goto out; - } - mfd = err; - - err = open(ptsname(mfd), O_RDWR, 0); - if (err < 0) { - printk(UM_KERN_ERR "Couldn't open tty for slip line, " - "err = %d\n", -err); - goto out_close; - } - sfd = err; - - err = set_up_tty(sfd); - if (err) - goto out_close2; - - pri->slave = sfd; - pri->slip.pos = 0; - pri->slip.esc = 0; - if (pri->gate_addr != NULL) { - sprintf(version_buf, "%d", UML_NET_VERSION); - strcpy(gate_buf, pri->gate_addr); - - err = slip_tramp(argv, sfd); - - if (err < 0) { - printk(UM_KERN_ERR "slip_tramp failed - err = %d\n", - -err); - goto out_close2; - } - err = os_get_ifname(pri->slave, pri->name); - if (err < 0) { - printk(UM_KERN_ERR "get_ifname failed, err = %d\n", - -err); - goto out_close2; - } - iter_addresses(pri->dev, open_addr, pri->name); - } - else { - err = os_set_slip(sfd); - if (err < 0) { - printk(UM_KERN_ERR "Failed to set slip discipline " - "encapsulation - err = %d\n", -err); - goto out_close2; - } - } - return mfd; -out_close2: - close(sfd); -out_close: - close(mfd); -out: - return err; -} - -static void slip_close(int fd, void *data) -{ - struct slip_data *pri = data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "slip", "down", pri->name, - NULL }; - int err; - - if (pri->gate_addr != NULL) - iter_addresses(pri->dev, close_addr, pri->name); - - sprintf(version_buf, "%d", UML_NET_VERSION); - - err = slip_tramp(argv, pri->slave); - - if (err != 0) - printk(UM_KERN_ERR "slip_tramp failed - errno = %d\n", -err); - close(fd); - close(pri->slave); - pri->slave = -1; -} - -int slip_user_read(int fd, void *buf, int len, struct slip_data *pri) -{ - return slip_proto_read(fd, buf, len, &pri->slip); -} - -int slip_user_write(int fd, void *buf, int len, struct slip_data *pri) -{ - return slip_proto_write(fd, buf, len, &pri->slip); -} - -static void slip_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct slip_data *pri = data; - - if (pri->slave < 0) - return; - open_addr(addr, netmask, pri->name); -} - -static void slip_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct slip_data *pri = data; - - if (pri->slave < 0) - return; - close_addr(addr, netmask, pri->name); -} - -const struct net_user_info slip_user_info = { - .init = slip_user_init, - .open = slip_open, - .close = slip_close, - .remove = NULL, - .add_address = slip_add_addr, - .delete_address = slip_del_addr, - .mtu = BUF_SIZE, - .max_packet = BUF_SIZE, -}; diff --git a/arch/um/drivers/slirp.h b/arch/um/drivers/slirp.h deleted file mode 100644 index 4aef2b88249a..000000000000 --- a/arch/um/drivers/slirp.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SLIRP_H -#define __UM_SLIRP_H - -#include "slip_common.h" - -#define SLIRP_MAX_ARGS 100 -/* - * XXX this next 
definition is here because I don't understand why this - * initializer doesn't work in slirp_kern.c: - * - * argv : { init->argv[ 0 ... SLIRP_MAX_ARGS-1 ] }, - * - * or why I can't typecast like this: - * - * argv : (char* [SLIRP_MAX_ARGS])(init->argv), - */ -struct arg_list_dummy_wrapper { char *argv[SLIRP_MAX_ARGS]; }; - -struct slirp_data { - void *dev; - struct arg_list_dummy_wrapper argw; - int pid; - int slave; - struct slip_proto slip; -}; - -extern const struct net_user_info slirp_user_info; - -extern int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri); -extern int slirp_user_write(int fd, void *buf, int len, - struct slirp_data *pri); - -#endif diff --git a/arch/um/drivers/slirp_kern.c b/arch/um/drivers/slirp_kern.c deleted file mode 100644 index 0a6151ee9572..000000000000 --- a/arch/um/drivers/slirp_kern.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/netdevice.h> -#include <linux/string.h> -#include <net_kern.h> -#include <net_user.h> -#include "slirp.h" - -struct slirp_init { - struct arg_list_dummy_wrapper argw; /* XXX should be simpler... */ -}; - -static void slirp_init(struct net_device *dev, void *data) -{ - struct uml_net_private *private; - struct slirp_data *spri; - struct slirp_init *init = data; - int i; - - private = netdev_priv(dev); - spri = (struct slirp_data *) private->user; - - spri->argw = init->argw; - spri->pid = -1; - spri->slave = -1; - spri->dev = dev; - - slip_proto_init(&spri->slip); - - dev->hard_header_len = 0; - dev->header_ops = NULL; - dev->addr_len = 0; - dev->type = ARPHRD_SLIP; - dev->tx_queue_len = 256; - dev->flags = IFF_NOARP; - printk("SLIRP backend - command line:"); - for (i = 0; spri->argw.argv[i] != NULL; i++) - printk(" '%s'",spri->argw.argv[i]); - printk("\n"); -} - -static unsigned short slirp_protocol(struct sk_buff *skbuff) -{ - return htons(ETH_P_IP); -} - -static int slirp_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slirp_user_read(fd, skb_mac_header(skb), skb->dev->mtu, - (struct slirp_data *) &lp->user); -} - -static int slirp_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slirp_user_write(fd, skb->data, skb->len, - (struct slirp_data *) &lp->user); -} - -const struct net_kern_info slirp_kern_info = { - .init = slirp_init, - .protocol = slirp_protocol, - .read = slirp_read, - .write = slirp_write, -}; - -static int slirp_setup(char *str, char **mac_out, void *data) -{ - struct slirp_init *init = data; - int i=0; - - *init = ((struct slirp_init) { .argw = { { "slirp", NULL } } }); - - str = split_if_spec(str, mac_out, NULL); - - if (str == NULL) /* no command line given after MAC addr */ - return 1; - - do { - if (i >= SLIRP_MAX_ARGS - 1) { - printk(KERN_WARNING "slirp_setup: truncating slirp " - "arguments\n"); - break; - } - init->argw.argv[i++] = str; - while(*str && *str!=',') { - if (*str == '_') - *str=' '; - str++; - } - if (*str != ',') - break; - *str++ = '\0'; - } while (1); - - init->argw.argv[i] = NULL; - return 1; -} - -static struct transport slirp_transport = { - .list = LIST_HEAD_INIT(slirp_transport.list), - .name = "slirp", - .setup = slirp_setup, - .user = &slirp_user_info, - .kern = &slirp_kern_info, - .private_size = sizeof(struct slirp_data), - .setup_size = sizeof(struct slirp_init), -}; - -static int register_slirp(void) -{ - register_transport(&slirp_transport); 
- return 0; -} - -late_initcall(register_slirp); diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c deleted file mode 100644 index 97228aa080cb..000000000000 --- a/arch/um/drivers/slirp_user.c +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <sys/wait.h> -#include <net_user.h> -#include <os.h> -#include "slirp.h" - -static int slirp_user_init(void *data, void *dev) -{ - struct slirp_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct slirp_pre_exec_data { - int stdin_fd; - int stdout_fd; -}; - -static void slirp_pre_exec(void *arg) -{ - struct slirp_pre_exec_data *data = arg; - - if (data->stdin_fd != -1) - dup2(data->stdin_fd, 0); - if (data->stdout_fd != -1) - dup2(data->stdout_fd, 1); -} - -static int slirp_tramp(char **argv, int fd) -{ - struct slirp_pre_exec_data pe_data; - int pid; - - pe_data.stdin_fd = fd; - pe_data.stdout_fd = fd; - pid = run_helper(slirp_pre_exec, &pe_data, argv); - - return pid; -} - -static int slirp_open(void *data) -{ - struct slirp_data *pri = data; - int fds[2], err; - - err = os_pipe(fds, 1, 1); - if (err) - return err; - - err = slirp_tramp(pri->argw.argv, fds[1]); - if (err < 0) { - printk(UM_KERN_ERR "slirp_tramp failed - errno = %d\n", -err); - goto out; - } - - pri->slave = fds[1]; - pri->slip.pos = 0; - pri->slip.esc = 0; - pri->pid = err; - - return fds[0]; -out: - close(fds[0]); - close(fds[1]); - return err; -} - -static void slirp_close(int fd, void *data) -{ - struct slirp_data *pri = data; - int err; - - close(fd); - close(pri->slave); - - pri->slave = -1; - - if (pri->pid<1) { - printk(UM_KERN_ERR "slirp_close: no child process to shut " - "down\n"); - return; - } - -#if 0 - if (kill(pri->pid, SIGHUP)<0) { - printk(UM_KERN_ERR "slirp_close: sending hangup to %d failed " - "(%d)\n", pri->pid, errno); - } -#endif - err = helper_wait(pri->pid); - if (err < 0) - return; - - pri->pid = -1; -} - -int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri) -{ - return slip_proto_read(fd, buf, len, &pri->slip); -} - -int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri) -{ - return slip_proto_write(fd, buf, len, &pri->slip); -} - -const struct net_user_info slirp_user_info = { - .init = slirp_user_init, - .open = slirp_open, - .close = slirp_close, - .remove = NULL, - .add_address = NULL, - .delete_address = NULL, - .mtu = BUF_SIZE, - .max_packet = BUF_SIZE, -}; diff --git a/arch/um/drivers/umcast.h b/arch/um/drivers/umcast.h deleted file mode 100644 index fe39bee1e3bd..000000000000 --- a/arch/um/drivers/umcast.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __DRIVERS_UMCAST_H -#define __DRIVERS_UMCAST_H - -#include <net_user.h> - -struct umcast_data { - char *addr; - unsigned short lport; - unsigned short rport; - void *listen_addr; - void *remote_addr; - int ttl; - int unicast; - void *dev; -}; - -extern const struct net_user_info umcast_user_info; - -extern int umcast_user_write(int fd, void *buf, int len, - struct umcast_data *pri); - -#endif diff --git a/arch/um/drivers/umcast_kern.c b/arch/um/drivers/umcast_kern.c deleted file mode 100644 index 595a54f2b9c6..000000000000 --- a/arch/um/drivers/umcast_kern.c +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * user-mode-linux 
networking multicast transport - * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org> - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * - * based on the existing uml-networking code, which is - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include "umcast.h" -#include <net_kern.h> - -struct umcast_init { - char *addr; - int lport; - int rport; - int ttl; - bool unicast; -}; - -static void umcast_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct umcast_data *dpri; - struct umcast_init *init = data; - - pri = netdev_priv(dev); - dpri = (struct umcast_data *) pri->user; - dpri->addr = init->addr; - dpri->lport = init->lport; - dpri->rport = init->rport; - dpri->unicast = init->unicast; - dpri->ttl = init->ttl; - dpri->dev = dev; - - if (dpri->unicast) { - printk(KERN_INFO "ucast backend address: %s:%u listen port: " - "%u\n", dpri->addr, dpri->rport, dpri->lport); - } else { - printk(KERN_INFO "mcast backend multicast address: %s:%u, " - "TTL:%u\n", dpri->addr, dpri->lport, dpri->ttl); - } -} - -static int umcast_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int umcast_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return umcast_user_write(fd, skb->data, skb->len, - (struct umcast_data *) &lp->user); -} - -static const struct net_kern_info umcast_kern_info = { - .init = umcast_init, - .protocol = eth_protocol, - .read = umcast_read, - .write = umcast_write, -}; - -static int mcast_setup(char *str, char **mac_out, void *data) -{ - struct umcast_init *init = data; - char *port_str = NULL, *ttl_str = NULL, *remain; - char *last; - - *init = ((struct umcast_init) - { .addr = "239.192.168.1", - .lport = 1102, - .ttl = 1 }); - - remain = split_if_spec(str, mac_out, &init->addr, &port_str, &ttl_str, - NULL); - if (remain != NULL) { - printk(KERN_ERR "mcast_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (port_str != NULL) { - init->lport = simple_strtoul(port_str, &last, 10); - if ((*last != '\0') || (last == port_str)) { - printk(KERN_ERR "mcast_setup - Bad port : '%s'\n", - port_str); - return 0; - } - } - - if (ttl_str != NULL) { - init->ttl = simple_strtoul(ttl_str, &last, 10); - if ((*last != '\0') || (last == ttl_str)) { - printk(KERN_ERR "mcast_setup - Bad ttl : '%s'\n", - ttl_str); - return 0; - } - } - - init->unicast = false; - init->rport = init->lport; - - printk(KERN_INFO "Configured mcast device: %s:%u-%u\n", init->addr, - init->lport, init->ttl); - - return 1; -} - -static int ucast_setup(char *str, char **mac_out, void *data) -{ - struct umcast_init *init = data; - char *lport_str = NULL, *rport_str = NULL, *remain; - char *last; - - *init = ((struct umcast_init) - { .addr = "", - .lport = 1102, - .rport = 1102 }); - - remain = split_if_spec(str, mac_out, &init->addr, - &lport_str, &rport_str, NULL); - if (remain != NULL) { - printk(KERN_ERR "ucast_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (lport_str != NULL) { - init->lport = simple_strtoul(lport_str, &last, 10); - if ((*last != '\0') || (last == lport_str)) { - printk(KERN_ERR "ucast_setup - Bad listen port : " - "'%s'\n", lport_str); - return 0; - } - 
} - - if (rport_str != NULL) { - init->rport = simple_strtoul(rport_str, &last, 10); - if ((*last != '\0') || (last == rport_str)) { - printk(KERN_ERR "ucast_setup - Bad remote port : " - "'%s'\n", rport_str); - return 0; - } - } - - init->unicast = true; - - printk(KERN_INFO "Configured ucast device: :%u -> %s:%u\n", - init->lport, init->addr, init->rport); - - return 1; -} - -static struct transport mcast_transport = { - .list = LIST_HEAD_INIT(mcast_transport.list), - .name = "mcast", - .setup = mcast_setup, - .user = &umcast_user_info, - .kern = &umcast_kern_info, - .private_size = sizeof(struct umcast_data), - .setup_size = sizeof(struct umcast_init), -}; - -static struct transport ucast_transport = { - .list = LIST_HEAD_INIT(ucast_transport.list), - .name = "ucast", - .setup = ucast_setup, - .user = &umcast_user_info, - .kern = &umcast_kern_info, - .private_size = sizeof(struct umcast_data), - .setup_size = sizeof(struct umcast_init), -}; - -static int register_umcast(void) -{ - register_transport(&mcast_transport); - register_transport(&ucast_transport); - return 0; -} - -late_initcall(register_umcast); diff --git a/arch/um/drivers/umcast_user.c b/arch/um/drivers/umcast_user.c deleted file mode 100644 index b50b13cff04e..000000000000 --- a/arch/um/drivers/umcast_user.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * user-mode-linux networking multicast transport - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org> - * - * based on the existing uml-networking code, which is - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. 
- * - * - */ - -#include <unistd.h> -#include <errno.h> -#include <netinet/in.h> -#include "umcast.h" -#include <net_user.h> -#include <um_malloc.h> - -static struct sockaddr_in *new_addr(char *addr, unsigned short port) -{ - struct sockaddr_in *sin; - - sin = uml_kmalloc(sizeof(struct sockaddr_in), UM_GFP_KERNEL); - if (sin == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_in " - "failed\n"); - return NULL; - } - sin->sin_family = AF_INET; - if (addr) - sin->sin_addr.s_addr = in_aton(addr); - else - sin->sin_addr.s_addr = INADDR_ANY; - sin->sin_port = htons(port); - return sin; -} - -static int umcast_user_init(void *data, void *dev) -{ - struct umcast_data *pri = data; - - pri->remote_addr = new_addr(pri->addr, pri->rport); - if (pri->unicast) - pri->listen_addr = new_addr(NULL, pri->lport); - else - pri->listen_addr = pri->remote_addr; - pri->dev = dev; - return 0; -} - -static void umcast_remove(void *data) -{ - struct umcast_data *pri = data; - - kfree(pri->listen_addr); - if (pri->unicast) - kfree(pri->remote_addr); - pri->listen_addr = pri->remote_addr = NULL; -} - -static int umcast_open(void *data) -{ - struct umcast_data *pri = data; - struct sockaddr_in *lsin = pri->listen_addr; - struct sockaddr_in *rsin = pri->remote_addr; - struct ip_mreq mreq; - int fd, yes = 1, err = -EINVAL; - - - if ((!pri->unicast && lsin->sin_addr.s_addr == 0) || - (rsin->sin_addr.s_addr == 0) || - (lsin->sin_port == 0) || (rsin->sin_port == 0)) - goto out; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - - if (fd < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open : data socket failed, " - "errno = %d\n", errno); - goto out; - } - - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: SO_REUSEADDR failed, " - "errno = %d\n", errno); - goto out_close; - } - - if (!pri->unicast) { - /* set ttl according to config */ - if (setsockopt(fd, SOL_IP, IP_MULTICAST_TTL, &pri->ttl, - sizeof(pri->ttl)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_TTL " - "failed, error = %d\n", errno); - goto out_close; - } - - /* set LOOP, so data does get fed back to local sockets */ - if (setsockopt(fd, SOL_IP, IP_MULTICAST_LOOP, - &yes, sizeof(yes)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_LOOP " - "failed, error = %d\n", errno); - goto out_close; - } - } - - /* bind socket to the address */ - if (bind(fd, (struct sockaddr *) lsin, sizeof(*lsin)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open : data bind failed, " - "errno = %d\n", errno); - goto out_close; - } - - if (!pri->unicast) { - /* subscribe to the multicast group */ - mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr; - mreq.imr_interface.s_addr = 0; - if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP, - &mreq, sizeof(mreq)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_ADD_MEMBERSHIP " - "failed, error = %d\n", errno); - printk(UM_KERN_ERR "There appears not to be a " - "multicast-capable network interface on the " - "host.\n"); - printk(UM_KERN_ERR "eth0 should be configured in order " - "to use the multicast transport.\n"); - goto out_close; - } - } - - return fd; - - out_close: - close(fd); - out: - return err; -} - -static void umcast_close(int fd, void *data) -{ - struct umcast_data *pri = data; - - if (!pri->unicast) { - struct ip_mreq mreq; - struct sockaddr_in *lsin = pri->listen_addr; - - mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr; - mreq.imr_interface.s_addr = 0; - if (setsockopt(fd, SOL_IP, 
IP_DROP_MEMBERSHIP, - &mreq, sizeof(mreq)) < 0) { - printk(UM_KERN_ERR "umcast_close: IP_DROP_MEMBERSHIP " - "failed, error = %d\n", errno); - } - } - - close(fd); -} - -int umcast_user_write(int fd, void *buf, int len, struct umcast_data *pri) -{ - struct sockaddr_in *data_addr = pri->remote_addr; - - return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)); -} - -const struct net_user_info umcast_user_info = { - .init = umcast_user_init, - .open = umcast_open, - .close = umcast_close, - .remove = umcast_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/vde.h b/arch/um/drivers/vde.h deleted file mode 100644 index cab0379e6142..000000000000 --- a/arch/um/drivers/vde.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - */ - -#ifndef __UM_VDE_H__ -#define __UM_VDE_H__ - -struct vde_data { - char *vde_switch; - char *descr; - void *args; - void *conn; - void *dev; -}; - -struct vde_init { - char *vde_switch; - char *descr; - int port; - char *group; - int mode; -}; - -extern const struct net_user_info vde_user_info; - -extern void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init); - -extern int vde_user_read(void *conn, void *buf, int len); -extern int vde_user_write(void *conn, void *buf, int len); - -#endif diff --git a/arch/um/drivers/vde_kern.c b/arch/um/drivers/vde_kern.c deleted file mode 100644 index bc6f22cbfb35..000000000000 --- a/arch/um/drivers/vde_kern.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - * - * Transport usage: - * ethN=vde,<vde_switch>,<mac addr>,<port>,<group>,<mode>,<description> - * - */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include <net_kern.h> -#include <net_user.h> -#include "vde.h" - -static void vde_init(struct net_device *dev, void *data) -{ - struct vde_init *init = data; - struct uml_net_private *pri; - struct vde_data *vpri; - - pri = netdev_priv(dev); - vpri = (struct vde_data *) pri->user; - - vpri->vde_switch = init->vde_switch; - vpri->descr = init->descr ? init->descr : "UML vde_transport"; - vpri->args = NULL; - vpri->conn = NULL; - vpri->dev = dev; - - printk("vde backend - %s, ", vpri->vde_switch ? 
- vpri->vde_switch : "(default socket)"); - - vde_init_libstuff(vpri, init); - - printk("\n"); -} - -static int vde_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - struct vde_data *pri = (struct vde_data *) &lp->user; - - if (pri->conn != NULL) - return vde_user_read(pri->conn, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); - - printk(KERN_ERR "vde_read - we have no VDECONN to read from"); - return -EBADF; -} - -static int vde_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - struct vde_data *pri = (struct vde_data *) &lp->user; - - if (pri->conn != NULL) - return vde_user_write((void *)pri->conn, skb->data, - skb->len); - - printk(KERN_ERR "vde_write - we have no VDECONN to write to"); - return -EBADF; -} - -static const struct net_kern_info vde_kern_info = { - .init = vde_init, - .protocol = eth_protocol, - .read = vde_read, - .write = vde_write, -}; - -static int vde_setup(char *str, char **mac_out, void *data) -{ - struct vde_init *init = data; - char *remain, *port_str = NULL, *mode_str = NULL, *last; - - *init = ((struct vde_init) - { .vde_switch = NULL, - .descr = NULL, - .port = 0, - .group = NULL, - .mode = 0 }); - - remain = split_if_spec(str, &init->vde_switch, mac_out, &port_str, - &init->group, &mode_str, &init->descr, NULL); - - if (remain != NULL) - printk(KERN_WARNING "vde_setup - Ignoring extra data :" - "'%s'\n", remain); - - if (port_str != NULL) { - init->port = simple_strtoul(port_str, &last, 10); - if ((*last != '\0') || (last == port_str)) { - printk(KERN_ERR "vde_setup - Bad port : '%s'\n", - port_str); - return 0; - } - } - - if (mode_str != NULL) { - init->mode = simple_strtoul(mode_str, &last, 8); - if ((*last != '\0') || (last == mode_str)) { - printk(KERN_ERR "vde_setup - Bad mode : '%s'\n", - mode_str); - return 0; - } - } - - printk(KERN_INFO "Configured vde device: %s\n", init->vde_switch ? - init->vde_switch : "(default socket)"); - - return 1; -} - -static struct transport vde_transport = { - .list = LIST_HEAD_INIT(vde_transport.list), - .name = "vde", - .setup = vde_setup, - .user = &vde_user_info, - .kern = &vde_kern_info, - .private_size = sizeof(struct vde_data), - .setup_size = sizeof(struct vde_init), -}; - -static int register_vde(void) -{ - register_transport(&vde_transport); - return 0; -} - -late_initcall(register_vde); diff --git a/arch/um/drivers/vde_user.c b/arch/um/drivers/vde_user.c deleted file mode 100644 index bc7dc4e1e486..000000000000 --- a/arch/um/drivers/vde_user.c +++ /dev/null @@ -1,125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). 
- */ - -#include <stddef.h> -#include <errno.h> -#include <libvdeplug.h> -#include <net_user.h> -#include <um_malloc.h> -#include "vde.h" - -static int vde_user_init(void *data, void *dev) -{ - struct vde_data *pri = data; - VDECONN *conn = NULL; - int err = -EINVAL; - - pri->dev = dev; - - conn = vde_open(pri->vde_switch, pri->descr, pri->args); - - if (conn == NULL) { - err = -errno; - printk(UM_KERN_ERR "vde_user_init: vde_open failed, " - "errno = %d\n", errno); - return err; - } - - printk(UM_KERN_INFO "vde backend - connection opened\n"); - - pri->conn = conn; - - return 0; -} - -static int vde_user_open(void *data) -{ - struct vde_data *pri = data; - - if (pri->conn != NULL) - return vde_datafd(pri->conn); - - printk(UM_KERN_WARNING "vde_open - we have no VDECONN to open"); - return -EINVAL; -} - -static void vde_remove(void *data) -{ - struct vde_data *pri = data; - - if (pri->conn != NULL) { - printk(UM_KERN_INFO "vde backend - closing connection\n"); - vde_close(pri->conn); - pri->conn = NULL; - kfree(pri->args); - pri->args = NULL; - return; - } - - printk(UM_KERN_WARNING "vde_remove - we have no VDECONN to remove"); -} - -const struct net_user_info vde_user_info = { - .init = vde_user_init, - .open = vde_user_open, - .close = NULL, - .remove = vde_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; - -void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init) -{ - struct vde_open_args *args; - - vpri->args = uml_kmalloc(sizeof(struct vde_open_args), UM_GFP_KERNEL); - if (vpri->args == NULL) { - printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args " - "allocation failed"); - return; - } - - args = vpri->args; - - args->port = init->port; - args->group = init->group; - args->mode = init->mode ? init->mode : 0700; - - args->port ? printk("port %d", args->port) : - printk("undefined port"); -} - -int vde_user_read(void *conn, void *buf, int len) -{ - VDECONN *vconn = conn; - int rv; - - if (vconn == NULL) - return 0; - - rv = vde_recv(vconn, buf, len, 0); - if (rv < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (rv == 0) - return -ENOTCONN; - - return rv; -} - -int vde_user_write(void *conn, void *buf, int len) -{ - VDECONN *vconn = conn; - - if (vconn == NULL) - return 0; - - return vde_send(vconn, buf, len, 0); -} - diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index b97bb52dd562..5226d2c52e6a 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -8,6 +8,8 @@ * Copyright (C) 2001 by various other people who didn't put their name here. 
*/ +#define pr_fmt(fmt) "uml-vector: " fmt + #include <linux/memblock.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> @@ -27,7 +29,6 @@ #include <init.h> #include <irq_kern.h> #include <irq_user.h> -#include <net_kern.h> #include <os.h> #include "mconsole_kern.h" #include "vector_user.h" @@ -1539,7 +1540,41 @@ static void vector_timer_expire(struct timer_list *t) napi_schedule(&vp->napi); } +static void vector_setup_etheraddr(struct net_device *dev, char *str) +{ + u8 addr[ETH_ALEN]; + + if (str == NULL) + goto random; + + if (!mac_pton(str, addr)) { + netdev_err(dev, + "Failed to parse '%s' as an ethernet address\n", str); + goto random; + } + if (is_multicast_ether_addr(addr)) { + netdev_err(dev, + "Attempt to assign a multicast ethernet address to a device disallowed\n"); + goto random; + } + if (!is_valid_ether_addr(addr)) { + netdev_err(dev, + "Attempt to assign an invalid ethernet address to a device disallowed\n"); + goto random; + } + if (!is_local_ether_addr(addr)) { + netdev_warn(dev, "Warning: Assigning a globally valid ethernet address to a device\n"); + netdev_warn(dev, "You should set the 2nd rightmost bit in the first byte of the MAC,\n"); + netdev_warn(dev, "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", + addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], addr[5]); + } + eth_hw_addr_set(dev, addr); + return; +random: + netdev_info(dev, "Choosing a random ethernet address\n"); + eth_hw_addr_random(dev); +} static void vector_eth_configure( int n, @@ -1553,14 +1588,12 @@ static void vector_eth_configure( device = kzalloc(sizeof(*device), GFP_KERNEL); if (device == NULL) { - printk(KERN_ERR "eth_configure failed to allocate struct " - "vector_device\n"); + pr_err("Failed to allocate struct vector_device for vec%d\n", n); return; } dev = alloc_etherdev(sizeof(struct vector_private)); if (dev == NULL) { - printk(KERN_ERR "eth_configure: failed to allocate struct " - "net_device for vec%d\n", n); + pr_err("Failed to allocate struct net_device for vec%d\n", n); goto out_free_device; } @@ -1574,7 +1607,7 @@ static void vector_eth_configure( * and fail. 
 */ snprintf(dev->name, sizeof(dev->name), "vec%d", n); - uml_net_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac")); + vector_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac")); vp = netdev_priv(dev); /* sysfs register */ @@ -1690,8 +1723,7 @@ static int __init vector_setup(char *str) err = vector_parse(str, &n, &str, &error); if (err) { - printk(KERN_ERR "vector_setup - Couldn't parse '%s' : %s\n", - str, error); + pr_err("Couldn't parse '%s': %s\n", str, error); return 1; } new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c new file mode 100644 index 000000000000..b51fc9888ae1 --- /dev/null +++ b/arch/um/drivers/vfio_kern.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + */ + +#define pr_fmt(fmt) "vfio-uml: " fmt + +#include <linux/module.h> +#include <linux/logic_iomem.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/string.h> +#include <linux/unaligned.h> +#include <irq_kern.h> +#include <init.h> +#include <os.h> + +#include "virt-pci.h" +#include "vfio_user.h" + +#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev) + +struct uml_vfio_intr_ctx { + struct uml_vfio_device *dev; + int irq; +}; + +struct uml_vfio_device { + const char *name; + int group; + + struct um_pci_device pdev; + struct uml_vfio_user_device udev; + struct uml_vfio_intr_ctx *intr_ctx; + + int msix_cap; + int msix_bar; + int msix_offset; + int msix_size; + u32 *msix_data; + + struct list_head list; +}; + +struct uml_vfio_group { + int id; + int fd; + int users; + struct list_head list; +}; + +static struct { + int fd; + int users; +} uml_vfio_container = { .fd = -1 }; +static DEFINE_MUTEX(uml_vfio_container_mtx); + +static LIST_HEAD(uml_vfio_groups); +static DEFINE_MUTEX(uml_vfio_groups_mtx); + +static LIST_HEAD(uml_vfio_devices); + +static int uml_vfio_set_container(int group_fd) +{ + int err; + + guard(mutex)(&uml_vfio_container_mtx); + + err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd); + if (err) + return err; + + uml_vfio_container.users++; + if (uml_vfio_container.users > 1) + return 0; + + err = uml_vfio_user_setup_iommu(uml_vfio_container.fd); + if (err) { + uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd); + uml_vfio_container.users--; + } + return err; +} + +static void uml_vfio_unset_container(int group_fd) +{ + guard(mutex)(&uml_vfio_container_mtx); + + uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd); + uml_vfio_container.users--; +} + +static int uml_vfio_open_group(int group_id) +{ + struct uml_vfio_group *group; + int err; + + guard(mutex)(&uml_vfio_groups_mtx); + + list_for_each_entry(group, &uml_vfio_groups, list) { + if (group->id == group_id) { + group->users++; + return group->fd; + } + } + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return -ENOMEM; + + group->fd = uml_vfio_user_open_group(group_id); + if (group->fd < 0) { + err = group->fd; + goto free_group; + } + + err = uml_vfio_set_container(group->fd); + if (err) + goto close_group; + + group->id = group_id; + group->users = 1; + + list_add(&group->list, &uml_vfio_groups); + + return group->fd; + +close_group: + os_close_file(group->fd); +free_group: + kfree(group); + return err; +} + +static int uml_vfio_release_group(int group_fd) +{ + struct uml_vfio_group *group; + + guard(mutex)(&uml_vfio_groups_mtx); + + list_for_each_entry(group, &uml_vfio_groups, list) { + if (group->fd 
== group_fd) { + group->users--; + if (group->users == 0) { + uml_vfio_unset_container(group_fd); + os_close_file(group_fd); + list_del(&group->list); + kfree(group); + } + return 0; + } + } + + return -ENOENT; +} + +static irqreturn_t uml_vfio_interrupt(int unused, void *opaque) +{ + struct uml_vfio_intr_ctx *ctx = opaque; + struct uml_vfio_device *dev = ctx->dev; + int index = ctx - dev->intr_ctx; + int irqfd = dev->udev.irqfd[index]; + int irq = dev->msix_data[index]; + uint64_t v; + int r; + + do { + r = os_read_file(irqfd, &v, sizeof(v)); + if (r == sizeof(v)) + generic_handle_irq(irq); + } while (r == sizeof(v) || r == -EINTR); + WARN(r != -EAGAIN, "read returned %d\n", r); + + return IRQ_HANDLED; +} + +static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index) +{ + struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index]; + int err, irqfd; + + if (ctx->irq >= 0) + return 0; + + irqfd = uml_vfio_user_activate_irq(&dev->udev, index); + if (irqfd < 0) + return irqfd; + + ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ, + uml_vfio_interrupt, 0, + "vfio-uml", ctx); + if (ctx->irq < 0) { + err = ctx->irq; + goto deactivate; + } + + err = add_sigio_fd(irqfd); + if (err) + goto free_irq; + + return 0; + +free_irq: + um_free_irq(ctx->irq, ctx); + ctx->irq = -1; +deactivate: + uml_vfio_user_deactivate_irq(&dev->udev, index); + return err; +} + +static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index) +{ + struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index]; + + if (ctx->irq >= 0) { + ignore_sigio_fd(dev->udev.irqfd[index]); + um_free_irq(ctx->irq, ctx); + uml_vfio_user_deactivate_irq(&dev->udev, index); + ctx->irq = -1; + } + return 0; +} + +static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + /* + * Here, we handle only the operations we care about, + * ignoring the rest. + */ + if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) { + switch (val & ~PCI_MSIX_FLAGS_QSIZE) { + case PCI_MSIX_FLAGS_ENABLE: + case 0: + return uml_vfio_user_update_irqs(&dev->udev); + } + } + return 0; +} + +static int uml_vfio_update_msix_table(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + int index; + + /* + * Here, we handle only the operations we care about, + * ignoring the rest. + */ + offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA; + + if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0) + return 0; + + index = offset / PCI_MSIX_ENTRY_SIZE; + if (index >= dev->udev.irq_count) + return -EINVAL; + + dev->msix_data[index] = val; + + return val ? 
uml_vfio_activate_irq(dev, index) : + uml_vfio_deactivate_irq(dev, index); +} + +static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev, + unsigned int offset, int size) +{ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + return __uml_vfio_cfgspace_read(dev, offset, size); +} + +static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + u8 data[8]; + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size)); +} + +static void uml_vfio_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF && + offset + size > dev->msix_cap) + WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val)); + + __uml_vfio_cfgspace_write(dev, offset, size, val); +} + +static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar, + void *buffer, unsigned int offset, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + memset(buffer, 0xff, size); + uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size); +} + +static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + u8 data[8]; + + uml_vfio_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar, + unsigned int offset, const void *buffer, + int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size); +} + +static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + u8 data[8]; + + if (bar == dev->msix_bar && offset + size > dev->msix_offset && + offset < dev->msix_offset + dev->msix_size) + WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val)); + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + uml_vfio_bar_copy_to(pdev, bar, offset, data, size); +} + +static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + int i; + + for (i = 0; i < size; i++) + 
 uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1); +} + +static const struct um_pci_ops uml_vfio_um_pci_ops = { + .cfgspace_read = uml_vfio_cfgspace_read, + .cfgspace_write = uml_vfio_cfgspace_write, + .bar_read = uml_vfio_bar_read, + .bar_write = uml_vfio_bar_write, + .bar_copy_from = uml_vfio_bar_copy_from, + .bar_copy_to = uml_vfio_bar_copy_to, + .bar_set = uml_vfio_bar_set, +}; + +static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap) +{ + u8 id, pos; + u16 ent; + int ttl = 48; /* PCI_FIND_CAP_TTL */ + + pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos)); + + while (pos && ttl--) { + ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent)); + + id = ent & 0xff; + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = ent >> 8; + } + + return 0; +} + +static int uml_vfio_read_msix_table(struct uml_vfio_device *dev) +{ + unsigned int off; + u16 flags; + u32 tbl; + + off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX); + if (!off) + return -ENOTSUPP; + + dev->msix_cap = off; + + tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl)); + flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags)); + + dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR; + dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET; + dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE; + + dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL); + if (!dev->msix_data) + return -ENOMEM; + + return 0; +} + +static void uml_vfio_open_device(struct uml_vfio_device *dev) +{ + struct uml_vfio_intr_ctx *ctx; + int err, group_id, i; + + group_id = uml_vfio_user_get_group_id(dev->name); + if (group_id < 0) { + pr_err("Failed to get group id (%s), error %d\n", + dev->name, group_id); + goto free_dev; + } + + dev->group = uml_vfio_open_group(group_id); + if (dev->group < 0) { + pr_err("Failed to open group %d (%s), error %d\n", + group_id, dev->name, dev->group); + goto free_dev; + } + + err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name); + if (err) { + pr_err("Failed to setup device (%s), error %d\n", + dev->name, err); + goto release_group; + } + + err = uml_vfio_read_msix_table(dev); + if (err) { + pr_err("Failed to read MSI-X table (%s), error %d\n", + dev->name, err); + goto teardown_udev; + } + + dev->intr_ctx = kmalloc_array(dev->udev.irq_count, + sizeof(struct uml_vfio_intr_ctx), + GFP_KERNEL); + if (!dev->intr_ctx) { + pr_err("Failed to allocate interrupt context (%s)\n", + dev->name); + goto free_msix; + } + + for (i = 0; i < dev->udev.irq_count; i++) { + ctx = &dev->intr_ctx[i]; + ctx->dev = dev; + ctx->irq = -1; + } + + dev->pdev.ops = &uml_vfio_um_pci_ops; + + err = um_pci_device_register(&dev->pdev); + if (err) { + pr_err("Failed to register UM PCI device (%s), error %d\n", + dev->name, err); + goto free_intr_ctx; + } + + return; + +free_intr_ctx: + kfree(dev->intr_ctx); +free_msix: + kfree(dev->msix_data); +teardown_udev: + uml_vfio_user_teardown_device(&dev->udev); +release_group: + uml_vfio_release_group(dev->group); +free_dev: + list_del(&dev->list); + kfree(dev->name); + kfree(dev); +} + +static void uml_vfio_release_device(struct uml_vfio_device *dev) +{ + int i; + + for (i = 0; i < dev->udev.irq_count; i++) + uml_vfio_deactivate_irq(dev, i); + uml_vfio_user_update_irqs(&dev->udev); + + um_pci_device_unregister(&dev->pdev); + kfree(dev->intr_ctx); + kfree(dev->msix_data); + uml_vfio_user_teardown_device(&dev->udev); + uml_vfio_release_group(dev->group); + list_del(&dev->list); + kfree(dev->name); + 
 kfree(dev); +} + +static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp) +{ + struct uml_vfio_device *dev; + int fd; + + if (uml_vfio_container.fd < 0) { + fd = uml_vfio_user_open_container(); + if (fd < 0) + return fd; + uml_vfio_container.fd = fd; + } + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->name = kstrdup(device, GFP_KERNEL); + if (!dev->name) { + kfree(dev); + return -ENOMEM; + } + + list_add_tail(&dev->list, &uml_vfio_devices); + return 0; +} + +static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp) +{ + return 0; +} + +static const struct kernel_param_ops uml_vfio_cmdline_param_ops = { + .set = uml_vfio_cmdline_set, + .get = uml_vfio_cmdline_get, +}; + +device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400); +__uml_help(uml_vfio_cmdline_param_ops, +"vfio_uml.device=<domain:bus:slot.function>\n" +"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n" +"    capable devices are supported, and it is assumed that drivers will\n" +"    use MSI-X. This parameter can be specified multiple times to pass\n" +"    through multiple PCI devices to UML.\n\n" +); + +static int __init uml_vfio_init(void) +{ + struct uml_vfio_device *dev, *n; + + sigio_broken(); + + /* If the opening fails, the device will be released. */ + list_for_each_entry_safe(dev, n, &uml_vfio_devices, list) + uml_vfio_open_device(dev); + + return 0; +} +late_initcall(uml_vfio_init); + +static void __exit uml_vfio_exit(void) +{ + struct uml_vfio_device *dev, *n; + + list_for_each_entry_safe(dev, n, &uml_vfio_devices, list) + uml_vfio_release_device(dev); + + if (uml_vfio_container.fd >= 0) + os_close_file(uml_vfio_container.fd); +} +module_exit(uml_vfio_exit); diff --git a/arch/um/drivers/vfio_user.c b/arch/um/drivers/vfio_user.c new file mode 100644 index 000000000000..6a45d8e14582 --- /dev/null +++ b/arch/um/drivers/vfio_user.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie <tiwei.btw@antgroup.com> + */ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <sys/eventfd.h> +#include <linux/limits.h> +#include <linux/vfio.h> +#include <linux/pci_regs.h> +#include <as-layout.h> +#include <um_malloc.h> + +#include "vfio_user.h" + +int uml_vfio_user_open_container(void) +{ + int r, fd; + + fd = open("/dev/vfio/vfio", O_RDWR); + if (fd < 0) + return -errno; + + r = ioctl(fd, VFIO_GET_API_VERSION); + if (r != VFIO_API_VERSION) { + r = r < 0 ? -errno : -EINVAL; + goto error; + } + + r = ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); + if (r <= 0) { + r = r < 0 ? -errno : -EINVAL; + goto error; + } + + return fd; + +error: + close(fd); + return r; +} + +int uml_vfio_user_setup_iommu(int container) +{ + /* + * This is a bit tricky. See the big comment in + * vhost_user_set_mem_table() in virtio_uml.c. 
+ */ + unsigned long reserved = uml_reserved - uml_physmem; + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = uml_reserved, + .iova = reserved, + .size = physmem_size - reserved, + }; + + if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0) + return -errno; + + if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map) < 0) + return -errno; + + return 0; +} + +int uml_vfio_user_get_group_id(const char *device) +{ + char *path, *buf, *end; + const char *name; + int r; + + path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); + if (!path) + return -ENOMEM; + + sprintf(path, "/sys/bus/pci/devices/%s/iommu_group", device); + + buf = uml_kmalloc(PATH_MAX + 1, UM_GFP_KERNEL); + if (!buf) { + r = -ENOMEM; + goto free_path; + } + + r = readlink(path, buf, PATH_MAX); + if (r < 0) { + r = -errno; + goto free_buf; + } + buf[r] = '\0'; + + name = basename(buf); + + r = strtoul(name, &end, 10); + if (*end != '\0' || end == name) { + r = -EINVAL; + goto free_buf; + } + +free_buf: + kfree(buf); +free_path: + kfree(path); + return r; +} + +int uml_vfio_user_open_group(int group_id) +{ + char *path; + int fd; + + path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); + if (!path) + return -ENOMEM; + + sprintf(path, "/dev/vfio/%d", group_id); + + fd = open(path, O_RDWR); + if (fd < 0) { + fd = -errno; + goto out; + } + +out: + kfree(path); + return fd; +} + +int uml_vfio_user_set_container(int container, int group) +{ + if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) < 0) + return -errno; + return 0; +} + +int uml_vfio_user_unset_container(int container, int group) +{ + if (ioctl(group, VFIO_GROUP_UNSET_CONTAINER, &container) < 0) + return -errno; + return 0; +} + +static int vfio_set_irqs(int device, int start, int count, int *irqfd) +{ + struct vfio_irq_set *irq_set; + int argsz = sizeof(*irq_set) + sizeof(*irqfd) * count; + int err = 0; + + irq_set = uml_kmalloc(argsz, UM_GFP_KERNEL); + if (!irq_set) + return -ENOMEM; + + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = start; + irq_set->count = count; + memcpy(irq_set->data, irqfd, sizeof(*irqfd) * count); + + if (ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set) < 0) { + err = -errno; + goto out; + } + +out: + kfree(irq_set); + return err; +} + +int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, + int group, const char *device) +{ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + int err, i; + + dev->device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, device); + if (dev->device < 0) + return -errno; + + if (ioctl(dev->device, VFIO_DEVICE_GET_INFO, &device_info) < 0) { + err = -errno; + goto close_device; + } + + dev->num_regions = device_info.num_regions; + if (dev->num_regions > VFIO_PCI_CONFIG_REGION_INDEX + 1) + dev->num_regions = VFIO_PCI_CONFIG_REGION_INDEX + 1; + + dev->region = uml_kmalloc(sizeof(*dev->region) * dev->num_regions, + UM_GFP_KERNEL); + if (!dev->region) { + err = -ENOMEM; + goto close_device; + } + + for (i = 0; i < dev->num_regions; i++) { + struct vfio_region_info region = { + .argsz = sizeof(region), + .index = i, + }; + if (ioctl(dev->device, VFIO_DEVICE_GET_REGION_INFO, &region) < 0) { + err = -errno; + goto free_region; + } + dev->region[i].size = region.size; + dev->region[i].offset = region.offset; + } + + /* Only MSI-X is supported 
currently. */ + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; + if (ioctl(dev->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) { + err = -errno; + goto free_region; + } + + dev->irq_count = irq_info.count; + + dev->irqfd = uml_kmalloc(sizeof(int) * dev->irq_count, UM_GFP_KERNEL); + if (!dev->irqfd) { + err = -ENOMEM; + goto free_region; + } + + memset(dev->irqfd, -1, sizeof(int) * dev->irq_count); + + err = vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd); + if (err) + goto free_irqfd; + + return 0; + +free_irqfd: + kfree(dev->irqfd); +free_region: + kfree(dev->region); +close_device: + close(dev->device); + return err; +} + +void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev) +{ + kfree(dev->irqfd); + kfree(dev->region); + close(dev->device); +} + +int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index) +{ + int irqfd; + + irqfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (irqfd < 0) + return -errno; + + dev->irqfd[index] = irqfd; + return irqfd; +} + +void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index) +{ + close(dev->irqfd[index]); + dev->irqfd[index] = -1; +} + +int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev) +{ + return vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd); +} + +static int vfio_region_read(struct uml_vfio_user_device *dev, unsigned int index, + uint64_t offset, void *buf, uint64_t size) +{ + if (index >= dev->num_regions || offset + size > dev->region[index].size) + return -EINVAL; + + if (pread(dev->device, buf, size, dev->region[index].offset + offset) < 0) + return -errno; + + return 0; +} + +static int vfio_region_write(struct uml_vfio_user_device *dev, unsigned int index, + uint64_t offset, const void *buf, uint64_t size) +{ + if (index >= dev->num_regions || offset + size > dev->region[index].size) + return -EINVAL; + + if (pwrite(dev->device, buf, size, dev->region[index].offset + offset) < 0) + return -errno; + + return 0; +} + +int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, + unsigned int offset, void *buf, int size) +{ + return vfio_region_read(dev, VFIO_PCI_CONFIG_REGION_INDEX, + offset, buf, size); +} + +int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, + unsigned int offset, const void *buf, int size) +{ + return vfio_region_write(dev, VFIO_PCI_CONFIG_REGION_INDEX, + offset, buf, size); +} + +int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, void *buf, int size) +{ + return vfio_region_read(dev, bar, offset, buf, size); +} + +int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, const void *buf, int size) +{ + return vfio_region_write(dev, bar, offset, buf, size); +} diff --git a/arch/um/drivers/vfio_user.h b/arch/um/drivers/vfio_user.h new file mode 100644 index 000000000000..75535e05059b --- /dev/null +++ b/arch/um/drivers/vfio_user.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VFIO_USER_H +#define __UM_VFIO_USER_H + +struct uml_vfio_user_device { + int device; + + struct { + uint64_t size; + uint64_t offset; + } *region; + int num_regions; + + int32_t *irqfd; + int irq_count; +}; + +int uml_vfio_user_open_container(void); +int uml_vfio_user_setup_iommu(int container); + +int uml_vfio_user_get_group_id(const char *device); +int uml_vfio_user_open_group(int group_id); +int uml_vfio_user_set_container(int container, int group); +int uml_vfio_user_unset_container(int container, int group); + +int 
uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, + int group, const char *device); +void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev); + +int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index); +void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index); +int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev); + +int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, + unsigned int offset, void *buf, int size); +int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, + unsigned int offset, const void *buf, int size); + +int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, void *buf, int size); +int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, const void *buf, int size); + +#endif /* __UM_VFIO_USER_H */ diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index b83b5a765d4e..0fe207ca4b72 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -538,11 +538,6 @@ void um_pci_platform_device_unregister(struct um_pci_device *dev) static int __init um_pci_init(void) { - struct irq_domain_info inner_domain_info = { - .size = MAX_MSI_VECTORS, - .hwirq_max = MAX_MSI_VECTORS, - .ops = &um_pci_inner_domain_ops, - }; int err, i; WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource, @@ -564,10 +559,10 @@ static int __init um_pci_init(void) goto free; } - inner_domain_info.fwnode = um_pci_fwnode; - um_pci_inner_domain = irq_domain_instantiate(&inner_domain_info); - if (IS_ERR(um_pci_inner_domain)) { - err = PTR_ERR(um_pci_inner_domain); + um_pci_inner_domain = irq_domain_create_linear(um_pci_fwnode, MAX_MSI_VECTORS, + &um_pci_inner_domain_ops, NULL); + if (!um_pci_inner_domain) { + err = -ENOMEM; goto free; } @@ -602,7 +597,7 @@ static int __init um_pci_init(void) return 0; free: - if (!IS_ERR_OR_NULL(um_pci_inner_domain)) + if (um_pci_inner_domain) irq_domain_remove(um_pci_inner_domain); if (um_pci_fwnode) irq_domain_free_fwnode(um_pci_fwnode); diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index e4316c7981e8..d05918e422f9 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -81,7 +81,7 @@ __uml_setup("xterm=", xterm_setup, " '<switch> command arg1 arg2 ...'.\n" " The default values are 'xterm=" CONFIG_XTERM_CHAN_DEFAULT_EMULATOR ",-T,-e'.\n" -" Values for gnome-terminal are 'xterm=gnome-terminal,-t,-x'.\n\n" +" Values for gnome-terminal are 'xterm=gnome-terminal,-t,--'.\n\n" ); static int xterm_open(int input, int output, int primary, void *d, @@ -97,12 +97,9 @@ static int xterm_open(int input, int output, int primary, void *d, if (access(argv[4], X_OK) < 0) argv[4] = "port-helper"; - /* - * Check that DISPLAY is set, this doesn't guarantee the xterm - * will work but w/o it we can be pretty sure it won't. - */ - if (getenv("DISPLAY") == NULL) { - printk(UM_KERN_ERR "xterm_open: $DISPLAY not set.\n"); + /* Ensure we are running on Xorg or Wayland. 
*/ + if (!getenv("DISPLAY") && !getenv("WAYLAND_DISPLAY")) { + printk(UM_KERN_ERR "xterm_open : neither $DISPLAY nor $WAYLAND_DISPLAY is set.\n"); return -ENODEV; } diff --git a/arch/um/include/asm/asm-prototypes.h b/arch/um/include/asm/asm-prototypes.h index 5898a26daa0d..408b31d59127 100644 --- a/arch/um/include/asm/asm-prototypes.h +++ b/arch/um/include/asm/asm-prototypes.h @@ -1 +1,6 @@ #include <asm-generic/asm-prototypes.h> +#include <asm/checksum.h> + +#ifdef CONFIG_UML_X86 +extern void cmpxchg8b_emu(void); +#endif diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index 749dfe8512e8..36dbedd1af48 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -13,17 +13,18 @@ #define TELNETD_IRQ 8 #define XTERM_IRQ 9 #define RANDOM_IRQ 10 +#define SIGCHLD_IRQ 11 #ifdef CONFIG_UML_NET_VECTOR -#define VECTOR_BASE_IRQ (RANDOM_IRQ + 1) +#define VECTOR_BASE_IRQ (SIGCHLD_IRQ + 1) #define VECTOR_IRQ_SPACE 8 #define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) #else -#define UM_FIRST_DYN_IRQ (RANDOM_IRQ + 1) +#define UM_FIRST_DYN_IRQ (SIGCHLD_IRQ + 1) #endif diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index a3eaca41ff61..4d0e4239f3cc 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -6,11 +6,14 @@ #ifndef __ARCH_UM_MMU_H #define __ARCH_UM_MMU_H +#include "linux/types.h" #include <mm_id.h> typedef struct mm_context { struct mm_id id; + struct list_head list; + /* Address range in need of a TLB sync */ unsigned long sync_tlb_range_from; unsigned long sync_tlb_range_to; diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 73f3a4792ed8..8ca66a1918c3 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -14,3 +14,7 @@ DEFINE(UM_THREAD_SIZE, THREAD_SIZE); DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); + +DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES); + +DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE); diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 88835b52ae2b..746abc24a5d5 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -17,6 +17,8 @@ enum um_irq_type { struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, void *mc); +extern void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); extern void deactivate_fd(int fd, int irqnum); diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h deleted file mode 100644 index 67b2e9a1f2e5..000000000000 --- a/arch/um/include/shared/net_kern.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __UM_NET_KERN_H -#define __UM_NET_KERN_H - -#include <linux/netdevice.h> -#include <linux/platform_device.h> -#include <linux/skbuff.h> -#include <linux/socket.h> -#include <linux/list.h> -#include <linux/workqueue.h> - -struct uml_net { - struct list_head list; - struct net_device *dev; - struct platform_device pdev; - int index; -}; - -struct uml_net_private { - struct list_head list; - spinlock_t lock; - struct net_device *dev; - struct timer_list tl; - - struct work_struct work; - int fd; - unsigned char mac[ETH_ALEN]; - int max_packet; - unsigned short 
(*protocol)(struct sk_buff *); - int (*open)(void *); - void (*close)(int, void *); - void (*remove)(void *); - int (*read)(int, struct sk_buff *skb, struct uml_net_private *); - int (*write)(int, struct sk_buff *skb, struct uml_net_private *); - - void (*add_address)(unsigned char *, unsigned char *, void *); - void (*delete_address)(unsigned char *, unsigned char *, void *); - char user[]; -}; - -struct net_kern_info { - void (*init)(struct net_device *, void *); - unsigned short (*protocol)(struct sk_buff *); - int (*read)(int, struct sk_buff *skb, struct uml_net_private *); - int (*write)(int, struct sk_buff *skb, struct uml_net_private *); -}; - -struct transport { - struct list_head list; - const char *name; - int (* const setup)(char *, char **, void *); - const struct net_user_info *user; - const struct net_kern_info *kern; - const int private_size; - const int setup_size; -}; - -extern int tap_setup_common(char *str, char *type, char **dev_name, - char **mac_out, char **gate_addr); -extern void register_transport(struct transport *new); -extern unsigned short eth_protocol(struct sk_buff *skb); -extern void uml_net_setup_etheraddr(struct net_device *dev, char *str); - - -#endif diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h deleted file mode 100644 index ba92a4d93531..000000000000 --- a/arch/um/include/shared/net_user.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __UM_NET_USER_H__ -#define __UM_NET_USER_H__ - -#define ETH_ADDR_LEN (6) -#define ETH_HEADER_ETHERTAP (16) -#define ETH_HEADER_OTHER (26) /* 14 for ethernet + VLAN + MPLS for crazy people */ -#define ETH_MAX_PACKET (1500) - -#define UML_NET_VERSION (4) - -struct net_user_info { - int (*init)(void *, void *); - int (*open)(void *); - void (*close)(int, void *); - void (*remove)(void *); - void (*add_address)(unsigned char *, unsigned char *, void *); - void (*delete_address)(unsigned char *, unsigned char *, void *); - int max_packet; - int mtu; -}; - -extern void iter_addresses(void *d, void (*cb)(unsigned char *, - unsigned char *, void *), - void *arg); - -extern void *get_output_buffer(int *len_out); -extern void free_output_buffer(void *buffer); - -extern int tap_open_common(void *dev, char *gate_addr); -extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr); - -extern void read_output(int fd, char *output_out, int len); - -extern int net_read(int fd, void *buf, int len); -extern int net_recvfrom(int fd, void *buf, int len); -extern int net_write(int fd, void *buf, int len); -extern int net_send(int fd, void *buf, int len); -extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len); - -extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg); -extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg); - -extern char *split_if_spec(char *str, ...); - -extern int dev_netmask(void *d, void *m); - -#endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 152a60080d5b..b35cc8ce333b 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -143,7 +143,6 @@ extern int os_access(const char *file, int mode); extern int os_set_exec_close(int fd); extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg); extern int os_get_ifname(int fd, char *namebuf); -extern int os_set_slip(int fd); extern int os_mode_fd(int fd, int mode); extern int 
os_seek_file(int fd, unsigned long long offset); @@ -198,6 +197,7 @@ extern int create_mem_file(unsigned long long len); extern void report_enomem(void); /* process.c */ +pid_t os_reap_child(void); extern void os_alarm_process(int pid); extern void os_kill_process(int pid, int reap_child); extern void os_kill_ptraced_process(int pid, int reap_child); @@ -286,7 +286,7 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); -extern int start_userspace(unsigned long stub_stack); +extern int start_userspace(struct mm_id *mm_id); extern void userspace(struct uml_pt_regs *regs); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 140388c282f6..89df9a55fbea 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -6,12 +6,21 @@ #ifndef __MM_ID_H #define __MM_ID_H +#define STUB_MAX_FDS 4 + struct mm_id { int pid; unsigned long stack; int syscall_data_len; + + /* Only used with SECCOMP mode */ + int sock; + int syscall_fd_num; + int syscall_fd_map[STUB_MAX_FDS]; }; void __switch_mm(struct mm_id *mm_idp); +void notify_mm_kill(int pid); + #endif diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index 85c50122ab98..7d1de4cab551 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -8,6 +8,7 @@ #include <sysdep/ptrace.h> +extern int using_seccomp; extern int userspace_pid[]; extern void new_thread_handler(void); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 81a4cace032c..c261a77a32f6 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -11,8 +11,15 @@ #include <linux/compiler_types.h> #include <as-layout.h> #include <sysdep/tls.h> +#include <sysdep/stub-data.h> +#include <mm_id.h> + +#define FUTEX_IN_CHILD 0 +#define FUTEX_IN_KERN 1 struct stub_init_data { + int seccomp; + unsigned long stub_start; int stub_code_fd; @@ -20,7 +27,8 @@ struct stub_init_data { int stub_data_fd; unsigned long stub_data_offset; - unsigned long segv_handler; + unsigned long signal_handler; + unsigned long signal_restorer; }; #define STUB_NEXT_SYSCALL(s) \ @@ -52,6 +60,16 @@ struct stub_data { /* 128 leaves enough room for additional fields in the struct */ struct stub_syscall syscall_data[(UM_KERN_PAGE_SIZE - 128) / sizeof(struct stub_syscall)] __aligned(16); + /* data shared with signal handler (only used in seccomp mode) */ + short restart_wait; + unsigned int futex; + int signal; + unsigned short si_offset; + unsigned short mctx_offset; + + /* seccomp architecture specific state restore */ + struct stub_data_arch arch_data; + /* Stack for our signal handlers and for calling into . 
*/ unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE); }; diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 4df1cd0d2017..4669db2aa9be 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -25,7 +25,6 @@ obj-$(CONFIG_GPROF) += gprof_syms.o obj-$(CONFIG_OF) += dtb.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o USER_OBJS := config.o diff --git a/arch/um/kernel/ioport.c b/arch/um/kernel/ioport.c deleted file mode 100644 index 7220615b3beb..000000000000 --- a/arch/um/kernel/ioport.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2021 Intel Corporation - * Author: Johannes Berg <johannes@sipsolutions.net> - */ -#include <asm/iomap.h> -#include <asm-generic/pci_iomap.h> - -void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port, - unsigned int nr) -{ - return NULL; -} diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index abe8f30a521c..0dfaf96bb7da 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -690,3 +690,9 @@ void __init init_IRQ(void) /* Initialize EPOLL Loop */ os_setup_epoll(); } + +void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) +{ + do_IRQ(SIGCHLD_IRQ, regs); +} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 0eb5a1d3ba70..849fafa4b54f 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -8,6 +8,7 @@ #include <linux/sched/signal.h> #include <linux/slab.h> +#include <shared/irq_kern.h> #include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/mmu_context.h> @@ -19,6 +20,9 @@ /* Ensure the stub_data struct covers the allocated area */ static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); +spinlock_t mm_list_lock; +struct list_head mm_list; + int init_new_context(struct task_struct *task, struct mm_struct *mm) { struct mm_id *new_id = &mm->context.id; @@ -31,14 +35,14 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) new_id->stack = stack; - block_signals_trace(); - new_id->pid = start_userspace(stack); - unblock_signals_trace(); + scoped_guard(spinlock_irqsave, &mm_list_lock) { + /* Insert into list, used for lookups when the child dies */ + list_add(&mm->context.list, &mm_list); + } - if (new_id->pid < 0) { - ret = new_id->pid; + ret = start_userspace(new_id); + if (ret < 0) goto out_free; - } /* Ensure the new MM is clean and nothing unwanted is mapped */ unmap(new_id, 0, STUB_START); @@ -60,13 +64,82 @@ void destroy_context(struct mm_struct *mm) * zero, resulting in a kill(0), which will result in the * whole UML suddenly dying. Also, cover negative and * 1 cases, since they shouldn't happen either. + * + * Negative cases happen if the child died unexpectedly. 
*/ - if (mmu->id.pid < 2) { + if (mmu->id.pid >= 0 && mmu->id.pid < 2) { printk(KERN_ERR "corrupt mm_context - pid = %d\n", mmu->id.pid); return; } - os_kill_ptraced_process(mmu->id.pid, 1); + + if (mmu->id.pid > 0) { + os_kill_ptraced_process(mmu->id.pid, 1); + mmu->id.pid = -1; + } + + if (using_seccomp && mmu->id.sock) + os_close_file(mmu->id.sock); free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); + + guard(spinlock_irqsave)(&mm_list_lock); + + list_del(&mm->context.list); +} + +static irqreturn_t mm_sigchld_irq(int irq, void* dev) +{ + struct mm_context *mm_context; + pid_t pid; + + guard(spinlock)(&mm_list_lock); + + while ((pid = os_reap_child()) > 0) { + /* + * A child died, check if we have an MM with the PID. This is + * only relevant in SECCOMP mode (as ptrace will fail anyway). + * + * See wait_stub_done_seccomp for more details. + */ + list_for_each_entry(mm_context, &mm_list, list) { + if (mm_context->id.pid == pid) { + struct stub_data *stub_data; + printk("Unexpectedly lost MM child! Affected tasks will segfault."); + + /* Marks the MM as dead */ + mm_context->id.pid = -1; + + /* + * NOTE: If SMP is implemented, a futex_wake + * needs to be added here. + */ + stub_data = (void *)mm_context->id.stack; + stub_data->futex = FUTEX_IN_KERN; + + /* + * NOTE: Currently executing syscalls by + * affected tasks may finish normally. + */ + break; + } + } + } + + return IRQ_HANDLED; +} + +static int __init init_child_tracking(void) +{ + int err; + + spin_lock_init(&mm_list_lock); + INIT_LIST_HEAD(&mm_list); + + err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL); + if (err < 0) + panic("Failed to register SIGCHLD IRQ: %d", err); + + return 0; } +early_initcall(init_child_tracking) diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c index 796fc266d3bb..67cab46a602c 100644 --- a/arch/um/kernel/skas/stub.c +++ b/arch/um/kernel/skas/stub.c @@ -5,21 +5,54 @@ #include <sysdep/stub.h> -static __always_inline int syscall_handler(struct stub_data *d) +#include <linux/futex.h> +#include <sys/socket.h> +#include <errno.h> + +/* + * Known security issues + * + * Userspace can jump to this address to execute *any* syscall that is + * permitted by the stub. As we will return afterwards, it can do + * whatever it likes, including: + * - Tricking the kernel into handing out the memory FD + * - Using this memory FD to read/write all physical memory + * - Running in parallel to the kernel processing a syscall + * (possibly creating data races?) + * - Blocking e.g. SIGALRM to avoid time based scheduling + * + * To avoid this, the permitted location for each syscall needs to be + * checked for in the SECCOMP filter (which is reasonably simple). Also, + * more care will need to go into considerations how the code might be + * tricked by using a prepared stack (or even modifying the stack from + * another thread in case SMP support is added). + * + * As for the SIGALRM, the best counter measure will be to check in the + * kernel that the process is reporting back the SIGALRM in a timely + * fashion. 
+ */ +static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS]) { + struct stub_data *d = get_stub_data(); int i; unsigned long res; + int fd; for (i = 0; i < d->syscall_data_len; i++) { struct stub_syscall *sc = &d->syscall_data[i]; switch (sc->syscall) { case STUB_SYSCALL_MMAP: + if (fd_map) + fd = fd_map[sc->mem.fd]; + else + fd = sc->mem.fd; + res = stub_syscall6(STUB_MMAP_NR, sc->mem.addr, sc->mem.length, sc->mem.prot, MAP_SHARED | MAP_FIXED, - sc->mem.fd, sc->mem.offset); + fd, sc->mem.offset); if (res != sc->mem.addr) { d->err = res; d->syscall_data_len = i; @@ -51,9 +84,98 @@ static __always_inline int syscall_handler(struct stub_data *d) void __section(".__syscall_stub") stub_syscall_handler(void) { + syscall_handler(NULL); + + trap_myself(); +} + +void __section(".__syscall_stub") +stub_signal_interrupt(int sig, siginfo_t *info, void *p) +{ struct stub_data *d = get_stub_data(); + char rcv_data; + union { + char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)]; + struct cmsghdr align; + } ctrl = {}; + struct iovec iov = { + .iov_base = &rcv_data, + .iov_len = 1, + }; + struct msghdr msghdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &ctrl, + .msg_controllen = sizeof(ctrl), + }; + ucontext_t *uc = p; + struct cmsghdr *fd_msg; + int *fd_map; + int num_fds; + long res; - syscall_handler(d); + d->signal = sig; + d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0]; + d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0]; - trap_myself(); +restart_wait: + d->futex = FUTEX_IN_KERN; + do { + res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAKE, 1); + } while (res == -EINTR); + + do { + res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAIT, FUTEX_IN_KERN, 0); + } while (res == -EINTR || d->futex == FUTEX_IN_KERN); + + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + if (d->syscall_data_len) { + /* Read passed FDs (if any) */ + do { + res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0); + } while (res == -EINTR); + + /* We should never have a receive error (other than -EAGAIN) */ + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + /* Receive the FDs */ + num_fds = 0; + fd_msg = msghdr.msg_control; + fd_map = (void *)&CMSG_DATA(fd_msg); + if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr)) + num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + /* Try running queued syscalls. */ + res = syscall_handler(fd_map); + + while (num_fds) + stub_syscall2(__NR_close, fd_map[--num_fds], 0); + } else { + res = 0; + } + + if (res < 0 || d->restart_wait) { + /* Report SIGSYS if we restart. */ + d->signal = SIGSYS; + d->restart_wait = 0; + + goto restart_wait; + } + + /* Restore arch dependent state that is not part of the mcontext */ + stub_seccomp_restore_state(&d->arch_data); + + /* Return so that the host modified mcontext is restored. 
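The stub and the UML kernel hand control back and forth through the futex word in the shared stub_data page (FUTEX_IN_CHILD/FUTEX_IN_KERN): the side that is done flips the word and issues FUTEX_WAKE, while the other side sleeps in FUTEX_WAIT until the value changes. The standalone sketch below is not part of the patch; it only illustrates that handshake between a parent and a forked child over a shared anonymous mapping, and all names in it are illustrative.

/*
 * Minimal futex handshake sketch (illustrative only, not from the patch).
 * IN_CHILD/IN_KERN mirror the FUTEX_IN_CHILD/FUTEX_IN_KERN values used in
 * stub_data; error handling is omitted for brevity.
 */
#include <linux/futex.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#define IN_CHILD 0
#define IN_KERN  1

static long futex(atomic_int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
	/* shared word, same idea as stub_data->futex */
	atomic_int *word = mmap(NULL, sizeof(*word), PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	*word = IN_CHILD;

	if (fork() == 0) {
		/* "stub" side: finish work, hand control back, wake waiter */
		atomic_store(word, IN_KERN);
		futex(word, FUTEX_WAKE, 1);
		_exit(0);
	}

	/* "kernel" side: sleep until the child flips the word */
	while (atomic_load(word) != IN_KERN)
		futex(word, FUTEX_WAIT, IN_CHILD);

	wait(NULL);
	printf("handshake complete\n");
	return 0;
}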
*/ +} + +void __section(".__syscall_stub") +stub_signal_restorer(void) +{ + /* We must not have anything on the stack when doing rt_sigreturn */ + stub_syscall0(__NR_rt_sigreturn); } diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c index 23c99b285e82..cbafaa684e66 100644 --- a/arch/um/kernel/skas/stub_exe.c +++ b/arch/um/kernel/skas/stub_exe.c @@ -1,8 +1,12 @@ #include <sys/ptrace.h> #include <sys/prctl.h> +#include <sys/fcntl.h> #include <asm/unistd.h> #include <sysdep/stub.h> #include <stub-data.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <generated/asm-offsets.h> void _start(void); @@ -25,8 +29,6 @@ noinline static void real_init(void) } sa = { /* Need to set SA_RESTORER (but the handler never returns) */ .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, - /* no need to mask any signals */ - .sa_mask = 0, }; /* set a nice name */ @@ -35,13 +37,20 @@ noinline static void real_init(void) /* Make sure this process dies if the kernel dies */ stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + /* Needed in SECCOMP mode (and safe to do anyway) */ + stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + /* read information from STDIN and close it */ res = stub_syscall3(__NR_read, 0, (unsigned long)&init_data, sizeof(init_data)); if (res != sizeof(init_data)) stub_syscall1(__NR_exit, 10); - stub_syscall1(__NR_close, 0); + /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */ + if (!init_data.seccomp) + stub_syscall1(__NR_close, 0); + else + stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK); /* map stub code + data */ res = stub_syscall6(STUB_MMAP_NR, @@ -59,22 +68,148 @@ noinline static void real_init(void) if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) stub_syscall1(__NR_exit, 12); + /* In SECCOMP mode, we only need the signalling FD from now on */ + if (init_data.seccomp) { + res = stub_syscall3(__NR_close_range, 1, ~0U, 0); + if (res != 0) + stub_syscall1(__NR_exit, 13); + } + /* setup signal stack inside stub data */ stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); - /* register SIGSEGV handler */ - sa.sa_handler_ = (void *) init_data.segv_handler; - res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, - sizeof(sa.sa_mask)); - if (res != 0) - stub_syscall1(__NR_exit, 13); + /* register signal handlers */ + sa.sa_handler_ = (void *) init_data.signal_handler; + sa.sa_restorer = (void *) init_data.signal_restorer; + if (!init_data.seccomp) { + /* In ptrace mode, the SIGSEGV handler never returns */ + sa.sa_mask = 0; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 14); + } else { + /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ + sa.sa_mask = ~0ULL; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 15); + + res = stub_syscall4(__NR_rt_sigaction, SIGSYS, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 16); + + res = stub_syscall4(__NR_rt_sigaction, SIGALRM, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 17); + + res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 18); + + res = stub_syscall4(__NR_rt_sigaction, SIGILL, + (unsigned long)&sa, 0, 
sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 19); + + res = stub_syscall4(__NR_rt_sigaction, SIGFPE, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 20); + } + + /* + * If in seccomp mode, install the SECCOMP filter and trigger a syscall. + * Otherwise set PTRACE_TRACEME and do a SIGSTOP. + */ + if (init_data.seccomp) { + struct sock_filter filter[] = { +#if __BITS_PER_LONG > 32 + /* [0] Load upper 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer) + 4)), + + /* [1] Jump forward 3 instructions if the upper address is not identical */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3), +#endif + /* [2] Load lower 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer))), + + /* [3] Mask out lower bits */ + BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000), + + /* [4] Jump to [6] if the lower bits are not on the expected page */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0), + + /* [5] Trap call, allow */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + + /* [6,7] Check architecture */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, arch)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, + UM_SECCOMP_ARCH_NATIVE, 1, 0), + + /* [8] Kill (for architecture check) */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [9] Load syscall number */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + + /* [10-16] Check against permitted syscalls */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, + 7, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg, + 6, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close, + 5, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, + 4, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, + 3, 0), +#ifdef __i386__ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area, + 2, 0), +#else + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl, + 2, 0), +#endif + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, + 1, 0), + + /* [17] Not one of the permitted syscalls */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [18] Permitted call for the stub */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = sizeof(filter) / sizeof(filter[0]), + .filter = filter, + }; + + if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, + (unsigned long)&prog) != 0) + stub_syscall1(__NR_exit, 21); - stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + /* Fall through, the exit syscall will cause SIGSYS */ + } else { + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + } - stub_syscall1(__NR_exit, 14); + stub_syscall1(__NR_exit, 30); __builtin_unreachable(); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 1394568c0210..ae0fa2173778 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -856,11 +856,16 @@ static struct clock_event_device timer_clockevent = { static irqreturn_t um_timer(int irq, void *dev) { - if (get_current()->mm != NULL) - { - /* userspace - relay signal, results in correct userspace timers */ + /* + * Interrupt the (possibly) running userspace process, technically this + * 
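The filter above sorts syscalls into three outcomes: syscalls issued from outside the stub page become SIGSYS via SECCOMP_RET_TRAP (so the stub's signal handler can report them to the UML kernel), syscalls issued from the stub page are allowed only if they match the short whitelist, and everything else kills the process. The standalone sketch below is not part of the patch and omits the instruction-pointer and architecture checks; it only demonstrates the SECCOMP_RET_TRAP path, trapping a single syscall and recovering its number from siginfo in a SIGSYS handler.

/*
 * Minimal SECCOMP_RET_TRAP sketch (illustrative only, not from the patch).
 * getpid() is trapped and reported via SIGSYS; all other syscalls are
 * allowed. Error handling is omitted for brevity.
 */
#define _GNU_SOURCE
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <signal.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static void sigsys_handler(int sig, siginfo_t *si, void *uc)
{
	/* printf is not async-signal-safe; acceptable for a demo */
	printf("trapped syscall %d\n", si->si_syscall);
}

int main(void)
{
	struct sock_filter filter[] = {
		/* load the syscall number from seccomp_data */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* getpid -> trap (SIGSYS), anything else -> allow */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};
	struct sigaction sa = {
		.sa_sigaction = sigsys_handler,
		.sa_flags = SA_SIGINFO,
	};

	sigaction(SIGSYS, &sa, NULL);

	/* required so an unprivileged process may install a filter */
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog);

	syscall(__NR_getpid);	/* delivers SIGSYS instead of running */
	return 0;
}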
should only happen if userspace is currently executing. + * With infinite CPU time-travel, we can only get here when userspace + * is not executing. Do not notify there and avoid spurious scheduling. + */ + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL && + get_current()->mm) os_alarm_process(get_current()->mm->context.id.pid); - } (*timer_clockevent.event_handler)(&timer_clockevent); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ef2272e92a43..5b80a3a89c20 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -16,7 +16,122 @@ #include <kern_util.h> #include <os.h> #include <skas.h> -#include <arch.h> + +/* + * NOTE: UML does not have exception tables. As such, this is almost a copy + * of the code in mm/memory.c, only adjusting the logic to simply check whether + * we are coming from the kernel instead of doing an additional lookup in the + * exception table. + * We can do this simplification because we never get here if the exception was + * fixable. + */ +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (!is_user) + return false; + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + mmap_read_unlock(mm); + if (!is_user) + return false; + + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalend to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +static struct vm_area_struct * +um_lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, bool is_user) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. 
+ */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by @@ -44,21 +159,10 @@ int handle_page_fault(unsigned long address, unsigned long ip, if (is_user) flags |= FAULT_FLAG_USER; retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto out; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - vma = expand_stack(mm, address); + vma = um_lock_mm_and_find_vma(mm, address, is_user); if (!vma) goto out_nosemaphore; -good_area: *code_out = SEGV_ACCERR; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 049dfa5bc9c6..fae836713487 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -8,7 +8,7 @@ KCOV_INSTRUMENT := n obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ - umid.o user_syms.o util.o drivers/ skas/ + umid.o user_syms.o util.o skas/ CFLAGS_signal.o += -Wframe-larger-than=4096 diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile deleted file mode 100644 index cf2d75bb1884..000000000000 --- a/arch/um/os-Linux/drivers/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) -# - -ethertap-objs := ethertap_kern.o ethertap_user.o -tuntap-objs := tuntap_kern.o tuntap_user.o - -obj-y = -obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o -obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o - -include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/drivers/etap.h b/arch/um/os-Linux/drivers/etap.h deleted file mode 100644 index a475259f90e1..000000000000 --- a/arch/um/os-Linux/drivers/etap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __DRIVERS_ETAP_H -#define __DRIVERS_ETAP_H - -#include <net_user.h> - -struct ethertap_data { - char *dev_name; - char *gate_addr; - int data_fd; - int control_fd; - void *dev; -}; - -extern const struct net_user_info ethertap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c deleted file mode 100644 index 5e5ee40680ce..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_kern.c +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. 
- */ - -#include <linux/init.h> -#include <linux/netdevice.h> -#include "etap.h" -#include <net_kern.h> - -struct ethertap_init { - char *dev_name; - char *gate_addr; -}; - -static void etap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct ethertap_data *epri; - struct ethertap_init *init = data; - - pri = netdev_priv(dev); - epri = (struct ethertap_data *) pri->user; - epri->dev_name = init->dev_name; - epri->gate_addr = init->gate_addr; - epri->data_fd = -1; - epri->control_fd = -1; - epri->dev = dev; - - printk(KERN_INFO "ethertap backend - %s", epri->dev_name); - if (epri->gate_addr != NULL) - printk(KERN_CONT ", IP = %s", epri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int etap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - int len; - - len = net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + 2 + ETH_HEADER_ETHERTAP); - if (len <= 0) - return(len); - - skb_pull(skb, 2); - len -= 2; - return len; -} - -static int etap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - skb_push(skb, 2); - return net_send(fd, skb->data, skb->len); -} - -const struct net_kern_info ethertap_kern_info = { - .init = etap_init, - .protocol = eth_protocol, - .read = etap_read, - .write = etap_write, -}; - -static int ethertap_setup(char *str, char **mac_out, void *data) -{ - struct ethertap_init *init = data; - - *init = ((struct ethertap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "ethertap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - if (init->dev_name == NULL) { - printk(KERN_ERR "ethertap_setup : Missing tap device name\n"); - return 0; - } - - return 1; -} - -static struct transport ethertap_transport = { - .list = LIST_HEAD_INIT(ethertap_transport.list), - .name = "ethertap", - .setup = ethertap_setup, - .user = ðertap_user_info, - .kern = ðertap_kern_info, - .private_size = sizeof(struct ethertap_data), - .setup_size = sizeof(struct ethertap_init), -}; - -static int register_ethertap(void) -{ - register_transport(ðertap_transport); - return 0; -} - -late_initcall(register_ethertap); diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c deleted file mode 100644 index bdf215c0eca7..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_user.c +++ /dev/null @@ -1,248 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. 
- */ - -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include "etap.h" -#include <os.h> -#include <net_user.h> -#include <um_malloc.h> - -#define MAX_PACKET ETH_MAX_PACKET - -static int etap_user_init(void *data, void *dev) -{ - struct ethertap_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct addr_change { - enum { ADD_ADDR, DEL_ADDR } what; - unsigned char addr[4]; - unsigned char netmask[4]; -}; - -static void etap_change(int op, unsigned char *addr, unsigned char *netmask, - int fd) -{ - struct addr_change change; - char *output; - int n; - - change.what = op; - memcpy(change.addr, addr, sizeof(change.addr)); - memcpy(change.netmask, netmask, sizeof(change.netmask)); - CATCH_EINTR(n = write(fd, &change, sizeof(change))); - if (n != sizeof(change)) { - printk(UM_KERN_ERR "etap_change - request failed, err = %d\n", - errno); - return; - } - - output = uml_kmalloc(UM_KERN_PAGE_SIZE, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "etap_change : Failed to allocate output " - "buffer\n"); - read_output(fd, output, UM_KERN_PAGE_SIZE); - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -static void etap_open_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(ADD_ADDR, addr, netmask, *((int *) arg)); -} - -static void etap_close_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(DEL_ADDR, addr, netmask, *((int *) arg)); -} - -struct etap_pre_exec_data { - int control_remote; - int control_me; - int data_me; -}; - -static void etap_pre_exec(void *arg) -{ - struct etap_pre_exec_data *data = arg; - - dup2(data->control_remote, 1); - close(data->data_me); - close(data->control_me); -} - -static int etap_tramp(char *dev, char *gate, int control_me, - int control_remote, int data_me, int data_remote) -{ - struct etap_pre_exec_data pe_data; - int pid, err, n; - char version_buf[sizeof("nnnnn\0")]; - char data_fd_buf[sizeof("nnnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *setup_args[] = { "uml_net", version_buf, "ethertap", dev, - data_fd_buf, gate_buf, NULL }; - char *nosetup_args[] = { "uml_net", version_buf, "ethertap", - dev, data_fd_buf, NULL }; - char **args, c; - - sprintf(data_fd_buf, "%d", data_remote); - sprintf(version_buf, "%d", UML_NET_VERSION); - if (gate != NULL) { - strscpy(gate_buf, gate); - args = setup_args; - } - else args = nosetup_args; - - err = 0; - pe_data.control_remote = control_remote; - pe_data.control_me = control_me; - pe_data.data_me = data_me; - pid = run_helper(etap_pre_exec, &pe_data, args); - - if (pid < 0) - err = pid; - close(data_remote); - close(control_remote); - CATCH_EINTR(n = read(control_me, &c, sizeof(c))); - if (n != sizeof(c)) { - err = -errno; - printk(UM_KERN_ERR "etap_tramp : read of status failed, " - "err = %d\n", -err); - return err; - } - if (c != 1) { - printk(UM_KERN_ERR "etap_tramp : uml_net failed\n"); - err = helper_wait(pid); - } - return err; -} - -static int etap_open(void *data) -{ - struct ethertap_data *pri = data; - char *output; - int data_fds[2], control_fds[2], err, output_len; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err) - return err; - - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, data_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - data socketpair failed - " - "err = %d\n", errno); - return err; - } - - err = socketpair(AF_UNIX, SOCK_STREAM, 0, control_fds); - if (err) { - err = -errno; - 
printk(UM_KERN_ERR "etap_open - control socketpair failed - " - "err = %d\n", errno); - goto out_close_data; - } - - err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0], - control_fds[1], data_fds[0], data_fds[1]); - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - read_output(control_fds[0], output, output_len); - - if (output == NULL) - printk(UM_KERN_ERR "etap_open : failed to allocate output " - "buffer\n"); - else { - printk("%s", output); - kfree(output); - } - - if (err < 0) { - printk(UM_KERN_ERR "etap_tramp failed - err = %d\n", -err); - goto out_close_control; - } - - pri->data_fd = data_fds[0]; - pri->control_fd = control_fds[0]; - iter_addresses(pri->dev, etap_open_addr, &pri->control_fd); - return data_fds[0]; - -out_close_control: - close(control_fds[0]); - close(control_fds[1]); -out_close_data: - close(data_fds[0]); - close(data_fds[1]); - return err; -} - -static void etap_close(int fd, void *data) -{ - struct ethertap_data *pri = data; - - iter_addresses(pri->dev, etap_close_addr, &pri->control_fd); - close(fd); - - if (shutdown(pri->data_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown data socket failed, " - "errno = %d\n", errno); - - if (shutdown(pri->control_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown control socket " - "failed, errno = %d\n", errno); - - close(pri->data_fd); - pri->data_fd = -1; - close(pri->control_fd); - pri->control_fd = -1; -} - -static void etap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if (pri->control_fd == -1) - return; - etap_open_addr(addr, netmask, &pri->control_fd); -} - -static void etap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - if (pri->control_fd == -1) - return; - - etap_close_addr(addr, netmask, &pri->control_fd); -} - -const struct net_user_info ethertap_user_info = { - .init = etap_user_init, - .open = etap_open, - .close = etap_close, - .remove = NULL, - .add_address = etap_add_addr, - .delete_address = etap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_ETHERTAP, -}; diff --git a/arch/um/os-Linux/drivers/tuntap.h b/arch/um/os-Linux/drivers/tuntap.h deleted file mode 100644 index e364e42abfc5..000000000000 --- a/arch/um/os-Linux/drivers/tuntap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __UM_TUNTAP_H -#define __UM_TUNTAP_H - -#include <net_user.h> - -struct tuntap_data { - char *dev_name; - int fixed_config; - char *gate_addr; - int fd; - void *dev; -}; - -extern const struct net_user_info tuntap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c deleted file mode 100644 index ff022d9cf0dd..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_kern.c +++ /dev/null @@ -1,86 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <linux/netdevice.h> -#include <linux/init.h> -#include <linux/skbuff.h> -#include <asm/errno.h> -#include <net_kern.h> -#include "tuntap.h" - -struct tuntap_init { - char *dev_name; - char *gate_addr; -}; - -static void tuntap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct tuntap_data *tpri; - struct tuntap_init *init = 
data; - - pri = netdev_priv(dev); - tpri = (struct tuntap_data *) pri->user; - tpri->dev_name = init->dev_name; - tpri->fixed_config = (init->dev_name != NULL); - tpri->gate_addr = init->gate_addr; - tpri->fd = -1; - tpri->dev = dev; - - printk(KERN_INFO "TUN/TAP backend - "); - if (tpri->gate_addr != NULL) - printk(KERN_CONT "IP = %s", tpri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int tuntap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int tuntap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_write(fd, skb->data, skb->len); -} - -const struct net_kern_info tuntap_kern_info = { - .init = tuntap_init, - .protocol = eth_protocol, - .read = tuntap_read, - .write = tuntap_write, -}; - -static int tuntap_setup(char *str, char **mac_out, void *data) -{ - struct tuntap_init *init = data; - - *init = ((struct tuntap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "tuntap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - - return 1; -} - -static struct transport tuntap_transport = { - .list = LIST_HEAD_INIT(tuntap_transport.list), - .name = "tuntap", - .setup = tuntap_setup, - .user = &tuntap_user_info, - .kern = &tuntap_kern_info, - .private_size = sizeof(struct tuntap_data), - .setup_size = sizeof(struct tuntap_init), -}; - -static int register_tuntap(void) -{ - register_transport(&tuntap_transport); - return 0; -} - -late_initcall(register_tuntap); diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c deleted file mode 100644 index 91f0e27ca3a6..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ /dev/null @@ -1,215 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <string.h> -#include <linux/if_tun.h> -#include <net/if.h> -#include <sys/ioctl.h> -#include <sys/socket.h> -#include <sys/wait.h> -#include <sys/uio.h> -#include <kern_util.h> -#include <os.h> -#include "tuntap.h" - -static int tuntap_user_init(void *data, void *dev) -{ - struct tuntap_data *pri = data; - - pri->dev = dev; - return 0; -} - -static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if ((pri->fd == -1) || pri->fixed_config) - return; - open_addr(addr, netmask, pri->dev_name); -} - -static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - if ((pri->fd == -1) || pri->fixed_config) - return; - close_addr(addr, netmask, pri->dev_name); -} - -struct tuntap_pre_exec_data { - int stdout_fd; - int close_me; -}; - -static void tuntap_pre_exec(void *arg) -{ - struct tuntap_pre_exec_data *data = arg; - - dup2(data->stdout_fd, 1); - close(data->close_me); -} - -static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, - char *buffer, int buffer_len, int *used_out) -{ - struct tuntap_pre_exec_data data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate, - NULL }; - char buf[CMSG_SPACE(sizeof(*fd_out))]; - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int pid, n, err; - - sprintf(version_buf, "%d", UML_NET_VERSION); - - data.stdout_fd = remote; - data.close_me = me; - - pid = 
run_helper(tuntap_pre_exec, &data, argv); - - if (pid < 0) - return pid; - - close(remote); - - msg.msg_name = NULL; - msg.msg_namelen = 0; - if (buffer != NULL) { - iov = ((struct iovec) { buffer, buffer_len }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - } - else { - msg.msg_iov = NULL; - msg.msg_iovlen = 0; - } - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; - n = recvmsg(me, &msg, 0); - *used_out = n; - if (n < 0) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open_tramp : recvmsg failed - " - "errno = %d\n", errno); - return err; - } - helper_wait(pid); - - cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "message\n"); - return -EINVAL; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "descriptor\n"); - return -EINVAL; - } - *fd_out = ((int *) CMSG_DATA(cmsg))[0]; - os_set_exec_close(*fd_out); - return 0; -} - -static int tuntap_open(void *data) -{ - struct ifreq ifr; - struct tuntap_data *pri = data; - char *output, *buffer; - int err, fds[2], len, used; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err < 0) - return err; - - if (pri->fixed_config) { - pri->fd = os_open_file("/dev/net/tun", - of_cloexec(of_rdwr(OPENFLAGS())), 0); - if (pri->fd < 0) { - printk(UM_KERN_ERR "Failed to open /dev/net/tun, " - "err = %d\n", -pri->fd); - return pri->fd; - } - memset(&ifr, 0, sizeof(ifr)); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - strscpy(ifr.ifr_name, pri->dev_name); - if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) { - err = -errno; - printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n", - errno); - close(pri->fd); - return err; - } - } - else { - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open : socketpair failed - " - "errno = %d\n", errno); - return err; - } - - buffer = get_output_buffer(&len); - if (buffer != NULL) - len--; - used = 0; - - err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0], - fds[1], buffer, len, &used); - - output = buffer; - if (err < 0) { - printk("%s", output); - free_output_buffer(buffer); - printk(UM_KERN_ERR "tuntap_open_tramp failed - " - "err = %d\n", -err); - return err; - } - - pri->dev_name = uml_strdup(buffer); - output += IFNAMSIZ; - printk("%s", output); - free_output_buffer(buffer); - - close(fds[0]); - iter_addresses(pri->dev, open_addr, pri->dev_name); - } - - return pri->fd; -} - -static void tuntap_close(int fd, void *data) -{ - struct tuntap_data *pri = data; - - if (!pri->fixed_config) - iter_addresses(pri->dev, close_addr, pri->dev_name); - close(fd); - pri->fd = -1; -} - -const struct net_user_info tuntap_user_info = { - .init = tuntap_user_init, - .open = tuntap_open, - .close = tuntap_close, - .remove = NULL, - .add_address = tuntap_add_addr, - .delete_address = tuntap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index a0d01c68ce3e..617886d1fb1e 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -106,21 +106,6 @@ int os_get_ifname(int fd, char* namebuf) return 0; } -int os_set_slip(int fd) -{ - int disc, sencap; - - disc = N_SLIP; - if (ioctl(fd, TIOCSETD, &disc) < 0) - return -errno; - - sencap = 0; - if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0) - return -errno; - - return 0; -} - int os_mode_fd(int fd, int mode) { int err; diff --git 
a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index 317fca190c2b..5d8d3b0817a9 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -2,6 +2,9 @@ #ifndef __UM_OS_LINUX_INTERNAL_H #define __UM_OS_LINUX_INTERNAL_H +#include <mm_id.h> +#include <stub-data.h> + /* * elf_aux.c */ @@ -16,5 +19,5 @@ void check_tmpexec(void); * skas/process.c */ void wait_stub_done(int pid); - +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys); #endif /* __UM_OS_LINUX_INTERNAL_H */ diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 184566edeee9..00b49e90d05f 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -18,17 +18,29 @@ #include <init.h> #include <longjmp.h> #include <os.h> +#include <skas/skas.h> void os_alarm_process(int pid) { + if (pid <= 0) + return; + kill(pid, SIGALRM); } void os_kill_process(int pid, int reap_child) { + if (pid <= 0) + return; + + /* Block signals until child is reaped */ + block_signals(); + kill(pid, SIGKILL); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); + + unblock_signals(); } /* Kill off a ptraced child by all means available. kill it normally first, @@ -38,11 +50,27 @@ void os_kill_process(int pid, int reap_child) void os_kill_ptraced_process(int pid, int reap_child) { + if (pid <= 0) + return; + + /* Block signals until child is reaped */ + block_signals(); + kill(pid, SIGKILL); ptrace(PTRACE_KILL, pid); ptrace(PTRACE_CONT, pid); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); + + unblock_signals(); +} + +pid_t os_reap_child(void) +{ + int status; + + /* Try to reap a child */ + return waitpid(-1, &status, WNOHANG); } /* Don't use the glibc version, which caches the result in TLS. It misses some @@ -151,6 +179,9 @@ void init_new_thread_signals(void) set_handler(SIGBUS); signal(SIGHUP, SIG_IGN); set_handler(SIGIO); + /* We (currently) only use the child reaper IRQ in seccomp mode */ + if (using_seccomp) + set_handler(SIGCHLD); signal(SIGWINCH, SIG_IGN); } diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index d7ca148807b2..bfba2cbc9478 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -14,8 +14,8 @@ /* This is set once at boot time and not changed thereafter */ -static unsigned long exec_regs[MAX_REG_NR]; -static unsigned long *exec_fp_regs; +unsigned long exec_regs[MAX_REG_NR]; +unsigned long *exec_fp_regs; int init_pid_registers(int pid) { diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index a05a6ecee756..6de145f8fe3d 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -12,6 +12,7 @@ #include <signal.h> #include <string.h> #include <sys/epoll.h> +#include <asm/unistd.h> #include <kern_util.h> #include <init.h> #include <os.h> @@ -46,7 +47,7 @@ static void *write_sigio_thread(void *unused) __func__, errno); } - CATCH_EINTR(r = tgkill(pid, pid, SIGIO)); + CATCH_EINTR(r = syscall(__NR_tgkill, pid, pid, SIGIO)); if (r < 0) printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n", __func__, errno); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index e71e5b4878d1..11f07f498270 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -29,6 +29,7 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = [SIGBUS] = relay_signal, [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, + [SIGCHLD] = sigchld_handler, }; static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) @@ -44,7 +45,7 
@@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) } /* enable signals if sig isn't IRQ signal */ - if ((sig != SIGIO) && (sig != SIGWINCH)) + if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD)) unblock_signals_trace(); (*sig_info[sig])(sig, si, &r, mc); @@ -64,6 +65,9 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) #define SIGALRM_BIT 1 #define SIGALRM_MASK (1 << SIGALRM_BIT) +#define SIGCHLD_BIT 2 +#define SIGCHLD_MASK (1 << SIGCHLD_BIT) + int signals_enabled; #if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) static int signals_blocked, signals_blocked_pending; @@ -102,6 +106,11 @@ static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) return; } + if (!enabled && (sig == SIGCHLD)) { + signals_pending |= SIGCHLD_MASK; + return; + } + block_signals_trace(); sig_handler_common(sig, si, mc); @@ -181,6 +190,8 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, + /* SIGCHLD is only actually registered in seccomp mode. */ + [SIGCHLD] = sig_handler, [SIGALRM] = timer_alarm_handler, [SIGUSR1] = sigusr1_handler, @@ -309,6 +320,12 @@ void unblock_signals(void) if (save_pending & SIGIO_MASK) sig_handler_common(SIGIO, NULL, NULL); + if (save_pending & SIGCHLD_MASK) { + struct uml_pt_regs regs = {}; + + sigchld_handler(SIGCHLD, NULL, ®s, NULL); + } + /* Do not reenter the handler */ if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK))) diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index d7f1814b0e5a..8b9921ac3ef8 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -43,6 +43,16 @@ void syscall_stub_dump_error(struct mm_id *mm_idp) print_hex_dump(UM_KERN_ERR, " syscall data: ", 0, 16, 4, sc, sizeof(*sc), 0); + + if (using_seccomp) { + printk(UM_KERN_ERR "%s: FD map num: %d", __func__, + mm_idp->syscall_fd_num); + print_hex_dump(UM_KERN_ERR, + " FD map: ", 0, 16, + sizeof(mm_idp->syscall_fd_map[0]), + mm_idp->syscall_fd_map, + sizeof(mm_idp->syscall_fd_map), 0); + } } static inline unsigned long *check_init_stack(struct mm_id * mm_idp, @@ -80,27 +90,32 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) int n, i; int err, pid = mm_idp->pid; - n = ptrace_setregs(pid, syscall_regs); - if (n < 0) { - printk(UM_KERN_ERR "Registers - \n"); - for (i = 0; i < MAX_REG_NR; i++) - printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); - panic("%s : PTRACE_SETREGS failed, errno = %d\n", - __func__, -n); - } - /* Inform process how much we have filled in. */ proc_data->syscall_data_len = mm_idp->syscall_data_len; - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("Failed to continue stub, pid = %d, errno = %d\n", pid, - errno); - - wait_stub_done(pid); + if (using_seccomp) { + proc_data->restart_wait = 1; + wait_stub_done_seccomp(mm_idp, 0, 1); + } else { + n = ptrace_setregs(pid, syscall_regs); + if (n < 0) { + printk(UM_KERN_ERR "Registers -\n"); + for (i = 0; i < MAX_REG_NR; i++) + printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); + panic("%s : PTRACE_SETREGS failed, errno = %d\n", + __func__, -n); + } + + err = ptrace(PTRACE_CONT, pid, 0, 0); + if (err) + panic("Failed to continue stub, pid = %d, errno = %d\n", + pid, errno); + + wait_stub_done(pid); + } /* - * proc_data->err will be non-zero if there was an (unexpected) error. + * proc_data->err will be negative if there was an (unexpected) error. 
* In that case, syscall_data_len points to the last executed syscall, * otherwise it will be zero (but we do not need to rely on that). */ @@ -113,6 +128,9 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) mm_idp->syscall_data_len = 0; } + if (using_seccomp) + mm_idp->syscall_fd_num = 0; + return mm_idp->syscall_data_len; } @@ -175,6 +193,44 @@ static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, return NULL; } +static int get_stub_fd(struct mm_id *mm_idp, int fd) +{ + int i; + + /* Find an FD slot (or flush and use first) */ + if (!using_seccomp) + return fd; + + /* Already crashed, value does not matter */ + if (mm_idp->syscall_data_len < 0) + return 0; + + /* Find existing FD in map if we can allocate another syscall */ + if (mm_idp->syscall_data_len < + ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) { + for (i = 0; i < mm_idp->syscall_fd_num; i++) { + if (mm_idp->syscall_fd_map[i] == fd) + return i; + } + + if (mm_idp->syscall_fd_num < STUB_MAX_FDS) { + i = mm_idp->syscall_fd_num; + mm_idp->syscall_fd_map[i] = fd; + + mm_idp->syscall_fd_num++; + + return i; + } + } + + /* FD map full or no syscall space available, continue after flush */ + do_syscall_stub(mm_idp); + mm_idp->syscall_fd_map[0] = fd; + mm_idp->syscall_fd_num = 1; + + return 0; +} + int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, int phys_fd, unsigned long long offset) { @@ -182,12 +238,21 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, /* Compress with previous syscall if that is possible */ sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt); - if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd && + if (sc && sc->mem.prot == prot && sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { - sc->mem.length += len; - return 0; + int prev_fd = sc->mem.fd; + + if (using_seccomp) + prev_fd = mm_idp->syscall_fd_map[sc->mem.fd]; + + if (phys_fd == prev_fd) { + sc->mem.length += len; + return 0; + } } + phys_fd = get_stub_fd(mm_idp, phys_fd); + sc = syscall_stub_alloc(mm_idp); sc->syscall = STUB_SYSCALL_MMAP; sc->mem.addr = virt; diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index ae2aea062f06..e42ffac23e3c 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -15,6 +16,7 @@ #include <sys/mman.h> #include <sys/wait.h> #include <sys/stat.h> +#include <sys/socket.h> #include <asm/unistd.h> #include <as-layout.h> #include <init.h> @@ -25,8 +27,11 @@ #include <registers.h> #include <skas.h> #include <sysdep/stub.h> +#include <sysdep/mcontext.h> +#include <linux/futex.h> #include <linux/threads.h> #include <timetravel.h> +#include <asm-generic/rwonce.h> #include "../internal.h" int is_skas_winch(int pid, int fd, void *data) @@ -142,6 +147,105 @@ bad_wait: fatal_sigsegv(); } +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) +{ + struct stub_data *data = (void *)mm_idp->stack; + int ret; + + do { + const char byte = 0; + struct iovec iov = { + .iov_base = (void *)&byte, + .iov_len = sizeof(byte), + }; + union { + char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))]; + struct cmsghdr align; + } ctrl; + struct msghdr msgh = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + if 
(!running) { + if (mm_idp->syscall_fd_num) { + unsigned int fds_size = + sizeof(int) * mm_idp->syscall_fd_num; + struct cmsghdr *cmsg; + + msgh.msg_control = ctrl.data; + msgh.msg_controllen = CMSG_SPACE(fds_size); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fds_size); + memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map, + fds_size); + + CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock, + &msgh, 0)); + } + + data->signal = 0; + data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); + } + + do { + /* + * We need to check whether the child is still alive + * before and after the FUTEX_WAIT call. Before, in + * case it just died but we still updated data->futex + * to FUTEX_IN_CHILD. And after, in case it died while + * we were waiting (and SIGCHLD woke us up, see the + * IRQ handler in mmu.c). + * + * Either way, if PID is negative, then we have no + * choice but to kill the task. + */ + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + ret = syscall(__NR_futex, &data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, + NULL, NULL, 0); + if (ret < 0 && errno != EINTR && errno != EAGAIN) { + printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (data->futex == FUTEX_IN_CHILD); + + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + running = 0; + + /* We may receive a SIGALRM before SIGSYS, iterate again. */ + } while (wait_sigsys && data->signal == SIGALRM); + + if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) { + printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__); + goto out_kill; + } + + if (wait_sigsys && data->signal != SIGSYS) { + printk(UM_KERN_ERR "%s : expected SIGSYS but got %d", + __func__, data->signal); + goto out_kill; + } + + return; + +out_kill: + printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n", + __func__, mm_idp->pid, errno); + /* This is not true inside start_userspace */ + if (current_mm_id() == mm_idp) + fatal_sigsegv(); +} + extern unsigned long current_stub_stack(void); static void get_skas_faultinfo(int pid, struct faultinfo *fi) @@ -163,12 +267,6 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi) memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); } -static void handle_segv(int pid, struct uml_pt_regs *regs) -{ - get_skas_faultinfo(pid, ®s->faultinfo); - segv(regs->faultinfo, 0, 1, NULL, NULL); -} - static void handle_trap(int pid, struct uml_pt_regs *regs) { if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) @@ -181,29 +279,48 @@ extern char __syscall_stub_start[]; static int stub_exe_fd; +struct tramp_data { + struct stub_data *stub_data; + /* 0 is inherited, 1 is the kernel side */ + int sockpair[2]; +}; + #ifndef CLOSE_RANGE_CLOEXEC #define CLOSE_RANGE_CLOEXEC (1U << 2) #endif -static int userspace_tramp(void *stack) +static int userspace_tramp(void *data) { + struct tramp_data *tramp_data = data; char *const argv[] = { "uml-userspace", NULL }; - int pipe_fds[2]; unsigned long long offset; struct stub_init_data init_data = { + .seccomp = using_seccomp, .stub_start = STUB_START, - .segv_handler = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) __syscall_stub_start, }; struct iomem_region *iomem; int ret; + if (using_seccomp) { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_signal_interrupt - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = 
STUB_CODE + + (unsigned long) stub_signal_restorer - + (unsigned long) __syscall_stub_start; + } else { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = 0; + } + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); init_data.stub_code_offset = MMAP_OFFSET(offset); - init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); + init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data), + &offset); init_data.stub_data_offset = MMAP_OFFSET(offset); /* @@ -214,20 +331,21 @@ static int userspace_tramp(void *stack) syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); fcntl(init_data.stub_data_fd, F_SETFD, 0); - for (iomem = iomem_regions; iomem; iomem = iomem->next) - fcntl(iomem->fd, F_SETFD, 0); - /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ - if (pipe(pipe_fds)) - exit(2); + /* In SECCOMP mode, these FDs are passed when needed */ + if (!using_seccomp) { + for (iomem = iomem_regions; iomem; iomem = iomem->next) + fcntl(iomem->fd, F_SETFD, 0); + } - if (dup2(pipe_fds[0], 0) < 0) + /* dup2 signaling FD/socket to STDIN */ + if (dup2(tramp_data->sockpair[0], 0) < 0) exit(3); - close(pipe_fds[0]); + close(tramp_data->sockpair[0]); /* Write init_data and close write side */ - ret = write(pipe_fds[1], &init_data, sizeof(init_data)); - close(pipe_fds[1]); + ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data)); + close(tramp_data->sockpair[1]); if (ret != sizeof(init_data)) exit(4); @@ -315,11 +433,12 @@ static int __init init_stub_exe_fd(void) } __initcall(init_stub_exe_fd); +int using_seccomp; int userspace_pid[NR_CPUS]; /** * start_userspace() - prepare a new userspace process - * @stub_stack: pointer to the stub stack. + * @mm_id: The corresponding struct mm_id * * Setups a new temporary stack page that is used while userspace_tramp() runs * Clones the kernel process into a new userspace process, with FDs only. @@ -328,11 +447,15 @@ int userspace_pid[NR_CPUS]; * when negative: an error number. * FIXME: can PIDs become negative?! 
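In SECCOMP mode the socketpair set up in start_userspace() (and dup2'ed to the stub's stdin by userspace_tramp()) is the only channel left open between the UML kernel and the stub: wait_stub_done_seccomp() passes any file descriptors required by queued mmap stub calls as SCM_RIGHTS ancillary data, and stub_signal_interrupt() picks them up with recvmsg(). The standalone sketch below is not part of the patch; it shows that SCM_RIGHTS FD-passing mechanism in isolation, and the helper names and the file used are purely illustrative.

/*
 * Minimal SCM_RIGHTS sketch (illustrative only, not from the patch): pass
 * an open file descriptor from a parent to a forked child over a
 * socketpair. Error handling is omitted for brevity.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <unistd.h>

static void send_fd(int sock, int fd)
{
	char byte = 0;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	memset(&u, 0, sizeof(u));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
	sendmsg(sock, &msg, 0);
}

static int recv_fd(int sock)
{
	char byte;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	union { char buf[CMSG_SPACE(sizeof(int))]; struct cmsghdr align; } u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	int fd;

	recvmsg(sock, &msg, 0);
	memcpy(&fd, CMSG_DATA(CMSG_FIRSTHDR(&msg)), sizeof(int));
	return fd;
}

int main(void)
{
	int sv[2];

	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);

	if (fork() == 0) {		/* receiving ("stub") side */
		char buf[64] = {0};
		int fd = recv_fd(sv[0]);

		read(fd, buf, sizeof(buf) - 1);
		printf("child read: %s\n", buf);
		_exit(0);
	}

	/* any readable file works; /etc/hostname is just an example */
	send_fd(sv[1], open("/etc/hostname", O_RDONLY));
	wait(NULL);
	return 0;
}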
*/ -int start_userspace(unsigned long stub_stack) +int start_userspace(struct mm_id *mm_id) { + struct stub_data *proc_data = (void *)mm_id->stack; + struct tramp_data tramp_data = { + .stub_data = proc_data, + }; void *stack; unsigned long sp; - int pid, status, n, err; + int status, n, err; /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, @@ -348,40 +471,55 @@ int start_userspace(unsigned long stub_stack) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; - /* clone into new userspace process */ - pid = clone(userspace_tramp, (void *) sp, + /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) { + err = -errno; + printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n", + __func__, errno); + return err; + } + + if (using_seccomp) + proc_data->futex = FUTEX_IN_CHILD; + + mm_id->pid = clone(userspace_tramp, (void *) sp, CLONE_VFORK | CLONE_VM | SIGCHLD, - (void *)stub_stack); - if (pid < 0) { + (void *)&tramp_data); + if (mm_id->pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", __func__, errno); - return err; + goto out_close; } - do { - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); - if (n < 0) { + if (using_seccomp) { + wait_stub_done_seccomp(mm_id, 1, 1); + } else { + do { + CATCH_EINTR(n = waitpid(mm_id->pid, &status, + WUNTRACED | __WALL)); + if (n < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); + + if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { + err = -EINVAL; + printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", + __func__, status); + goto out_kill; + } + + if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL, + (void *) PTRACE_O_TRACESYSGOOD) < 0) { err = -errno; - printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", __func__, errno); goto out_kill; } - } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); - - if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { - err = -EINVAL; - printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", - __func__, status); - goto out_kill; - } - - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, - (void *) PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", - __func__, errno); - goto out_kill; } if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { @@ -391,10 +529,22 @@ int start_userspace(unsigned long stub_stack) goto out_kill; } - return pid; + close(tramp_data.sockpair[0]); + if (using_seccomp) + mm_id->sock = tramp_data.sockpair[1]; + else + close(tramp_data.sockpair[1]); + + return 0; + +out_kill: + os_kill_ptraced_process(mm_id->pid, 1); +out_close: + close(tramp_data.sockpair[0]); + close(tramp_data.sockpair[1]); + + mm_id->pid = -1; - out_kill: - os_kill_ptraced_process(pid, 1); return err; } @@ -404,7 +554,9 @@ extern unsigned long tt_extra_sched_jiffies; void userspace(struct uml_pt_regs *regs) { int err, status, op, pid = userspace_pid[0]; - siginfo_t si; + siginfo_t si_ptrace; + siginfo_t *si; + int sig; /* Handle any immediate reschedules or signals */ interrupt_end(); @@ -437,103 +589,177 @@ void userspace(struct uml_pt_regs *regs) current_mm_sync(); - /* Flush out any pending syscalls */ - err = 
syscall_stub_flush(current_mm_id()); - if (err) { - if (err == -ENOMEM) - report_enomem(); + if (using_seccomp) { + struct mm_id *mm_id = current_mm_id(); + struct stub_data *proc_data = (void *) mm_id->stack; + int ret; - printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", - __func__, -err); - fatal_sigsegv(); - } + ret = set_stub_state(regs, proc_data, singlestepping()); + if (ret) { + printk(UM_KERN_ERR "%s - failed to set regs: %d", + __func__, ret); + fatal_sigsegv(); + } - /* - * This can legitimately fail if the process loads a - * bogus value into a segment register. It will - * segfault and PTRACE_GETREGS will read that value - * out of the process. However, PTRACE_SETREGS will - * fail. In this case, there is nothing to do but - * just kill the process. - */ - if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* Must have been reset by the syscall caller */ + if (proc_data->restart_wait != 0) + panic("Programming error: Flag to only run syscalls in child was not cleared!"); - if (put_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* Mark pending syscalls for flushing */ + proc_data->syscall_data_len = mm_id->syscall_data_len; - if (singlestepping()) - op = PTRACE_SYSEMU_SINGLESTEP; - else - op = PTRACE_SYSEMU; + wait_stub_done_seccomp(mm_id, 0, 0); - if (ptrace(op, pid, 0, 0)) { - printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", - __func__, op, errno); - fatal_sigsegv(); - } + sig = proc_data->signal; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if (err < 0) { - printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + if (sig == SIGTRAP && proc_data->err != 0) { + printk(UM_KERN_ERR "%s - Error flushing stub syscalls", + __func__); + syscall_stub_dump_error(mm_id); + mm_id->syscall_data_len = proc_data->err; + fatal_sigsegv(); + } - regs->is_user = 1; - if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + mm_id->syscall_data_len = 0; + mm_id->syscall_fd_num = 0; - if (get_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + ret = get_stub_state(regs, proc_data, NULL); + if (ret) { + printk(UM_KERN_ERR "%s - failed to get regs: %d", + __func__, ret); + fatal_sigsegv(); + } - UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) + panic("%s - Invalid siginfo offset from child", + __func__); + si = (void *)&proc_data->sigstack[proc_data->si_offset]; + + regs->is_user = 1; + + /* Fill in ORIG_RAX and extract fault information */ + PT_SYSCALL_NR(regs->gp) = si->si_syscall; + if (sig == SIGSEGV) { + mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset]; - if (WIFSTOPPED(status)) { - int sig = WSTOPSIG(status); + GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext); + } + } else { + /* Flush out any pending syscalls */ + err = syscall_stub_flush(current_mm_id()); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); + fatal_sigsegv(); + } - /* These signal handlers need the si argument. 
- * The SIGIO and SIGALARM handlers which constitute the - * majority of invocations, do not use it. + /* + * This can legitimately fail if the process loads a + * bogus value into a segment register. It will + * segfault and PTRACE_GETREGS will read that value + * out of the process. However, PTRACE_SETREGS will + * fail. In this case, there is nothing to do but + * just kill the process. */ - switch (sig) { - case SIGSEGV: - case SIGTRAP: - case SIGILL: - case SIGBUS: - case SIGFPE: - case SIGWINCH: - ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); - break; + if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); } - switch (sig) { - case SIGSEGV: - if (PTRACE_FULL_FAULTINFO) { + if (put_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (singlestepping()) + op = PTRACE_SYSEMU_SINGLESTEP; + else + op = PTRACE_SYSEMU; + + if (ptrace(op, pid, 0, 0)) { + printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", + __func__, op, errno); + fatal_sigsegv(); + } + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); + if (err < 0) { + printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + regs->is_user = 1; + if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (get_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (WIFSTOPPED(status)) { + sig = WSTOPSIG(status); + + /* + * These signal handlers need the si argument + * and SIGSEGV needs the faultinfo. + * The SIGIO and SIGALARM handlers which constitute + * the majority of invocations, do not use it. 
+ */ + switch (sig) { + case SIGSEGV: get_skas_faultinfo(pid, ®s->faultinfo); - (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, - regs, NULL); + fallthrough; + case SIGTRAP: + case SIGILL: + case SIGBUS: + case SIGFPE: + case SIGWINCH: + ptrace(PTRACE_GETSIGINFO, pid, 0, + (struct siginfo *)&si_ptrace); + si = &si_ptrace; + break; + default: + si = NULL; + break; } - else handle_segv(pid, regs); + } else { + sig = 0; + } + } + + UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + + if (sig) { + switch (sig) { + case SIGSEGV: + if (using_seccomp || PTRACE_FULL_FAULTINFO) + (*sig_info[SIGSEGV])(SIGSEGV, + (struct siginfo *)si, + regs, NULL); + else + segv(regs->faultinfo, 0, 1, NULL, NULL); + + break; + case SIGSYS: + handle_syscall(regs); break; case SIGTRAP + 0x80: handle_trap(pid, regs); break; case SIGTRAP: - relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL); + relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL); break; case SIGALRM: break; @@ -543,7 +769,7 @@ void userspace(struct uml_pt_regs *regs) case SIGFPE: case SIGWINCH: block_signals_trace(); - (*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL); + (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL); unblock_signals_trace(); break; default: diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 93fc82c01aba..a827c2e01aa5 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net> * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -24,6 +25,13 @@ #include <kern_util.h> #include <mem_user.h> #include <ptrace_user.h> +#include <stdbool.h> +#include <stub-data.h> +#include <sys/prctl.h> +#include <linux/seccomp.h> +#include <linux/filter.h> +#include <sysdep/mcontext.h> +#include <sysdep/stub.h> #include <registers.h> #include <skas.h> #include "internal.h" @@ -224,6 +232,140 @@ static void __init check_ptrace(void) check_sysemu(); } +extern unsigned long host_fp_size; +extern unsigned long exec_regs[MAX_REG_NR]; +extern unsigned long *exec_fp_regs; + +__initdata static struct stub_data *seccomp_test_stub_data; + +static void __init sigsys_handler(int sig, siginfo_t *info, void *p) +{ + ucontext_t *uc = p; + + /* Stow away the location of the mcontext in the stack */ + seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - + (unsigned long)&seccomp_test_stub_data->sigstack[0]; + + /* Prevent libc from clearing memory (mctx_offset in particular) */ + syscall(__NR_exit, 0); +} + +static int __init seccomp_helper(void *data) +{ + static struct sock_filter filter[] = { + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + }; + static struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + struct sigaction sa; + + /* close_range is needed for the stub */ + if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) + exit(1); + + set_sigstack(seccomp_test_stub_data->sigstack, + sizeof(seccomp_test_stub_data->sigstack)); + + sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; + sa.sa_sigaction = (void *) sigsys_handler; + sa.sa_restorer = NULL; + if (sigaction(SIGSYS, &sa, NULL) < 0) + exit(2); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + 
SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) + exit(3); + + sleep(0); + + /* Never reached. */ + _exit(4); +} + +static bool __init init_seccomp(void) +{ + int pid; + int status; + int n; + unsigned long sp; + + /* + * We check that we can install a seccomp filter and then exit(0) + * from a trapped syscall. + * + * Note that we cannot verify that no seccomp filter already exists + * for a syscall that results in the process/thread to be killed. + */ + + os_info("Checking that seccomp filters can be installed..."); + + seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANON, 0, 0); + + /* Use the syscall data area as stack, we just need something */ + sp = (unsigned long)&seccomp_test_stub_data->syscall_data + + sizeof(seccomp_test_stub_data->syscall_data) - + sizeof(void *); + pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); + + if (pid < 0) + fatal_perror("check_seccomp : clone failed"); + + CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); + if (n < 0) + fatal_perror("check_seccomp : waitpid failed"); + + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + struct uml_pt_regs *regs; + unsigned long fp_size; + int r; + + /* Fill in the host_fp_size from the mcontext. */ + regs = calloc(1, sizeof(struct uml_pt_regs)); + get_stub_state(regs, seccomp_test_stub_data, &fp_size); + host_fp_size = fp_size; + free(regs); + + /* Repeat with the correct size */ + regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); + r = get_stub_state(regs, seccomp_test_stub_data, NULL); + + /* Store as the default startup registers */ + exec_fp_regs = malloc(host_fp_size); + memcpy(exec_regs, regs->gp, sizeof(exec_regs)); + memcpy(exec_fp_regs, regs->fp, host_fp_size); + + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); + + free(regs); + + if (r) { + os_info("failed to fetch registers: %d\n", r); + return false; + } + + os_info("OK\n"); + return true; + } + + if (WIFEXITED(status) && WEXITSTATUS(status) == 2) + os_info("missing\n"); + else + os_info("error\n"); + + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); + return false; +} + + static void __init check_coredump_limit(void) { struct rlimit lim; @@ -278,6 +420,44 @@ void __init get_host_cpu_features( } } +static int seccomp_config __initdata; + +static int __init uml_seccomp_config(char *line, int *add) +{ + *add = 0; + + if (strcmp(line, "off") == 0) + seccomp_config = 0; + else if (strcmp(line, "auto") == 0) + seccomp_config = 1; + else if (strcmp(line, "on") == 0) + seccomp_config = 2; + else + fatal("Invalid seccomp option '%s', expected on/auto/off\n", + line); + + return 0; +} + +__uml_setup("seccomp=", uml_seccomp_config, +"seccomp=<on/auto/off>\n" +" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" +" processes work collaboratively with the kernel instead of being\n" +" traced using ptrace. All syscalls from the application are caught and\n" +" redirected using a signal. This signal handler in turn is permitted to\n" +" do the selected set of syscalls to communicate with the UML kernel and\n" +" do the required memory management.\n" +"\n" +" This method is overall faster than the ptrace based userspace, primarily\n" +" because it reduces the number of context switches for (minor) page faults.\n" +"\n" +" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" +" userspace from reading and writing all physical memory. 
Userspace\n" +" processes could also trick the stub into disabling SIGALRM which\n" +" prevents it from being interrupted for scheduling purposes.\n" +"\n" +" This is insecure and should only be used with a trusted userspace\n\n" +); void __init os_early_checks(void) { @@ -286,13 +466,24 @@ void __init os_early_checks(void) /* Print out the core dump limits early */ check_coredump_limit(); - check_ptrace(); - /* Need to check this early because mmapping happens before the * kernel is running. */ check_tmpexec(); + if (seccomp_config) { + if (init_seccomp()) { + using_seccomp = 1; + return; + } + + if (seccomp_config == 2) + fatal("SECCOMP userspace requested but not functional!\n"); + } + + using_seccomp = 0; + check_ptrace(); + pid = start_ptraced_child(); if (init_pid_registers(pid)) fatal("Failed to initialize default registers"); diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index b07824500363..ddc144657efa 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -20,6 +20,9 @@ */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); +/* Do not call this directly. Declared for export type visibility. */ +extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); + /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 478710384b34..e222d2ae28fd 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -21,10 +21,10 @@ #include <asm/user.h> -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) +/* PAUSE is a good thing to insert into busy-wait loops. */ +static __always_inline void native_pause(void) { - __asm__ __volatile__("rep;nop": : :"memory"); + __asm__ __volatile__("pause": : :"memory"); } static __always_inline void cpu_relax(void) @@ -33,7 +33,7 @@ static __always_inline void cpu_relax(void) time_travel_mode == TT_MODE_EXTERNAL) time_travel_ndelay(1); else - rep_nop(); + native_pause(); } #define task_pt_regs(t) (&(t)->thread.regs) diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index 37decaa74761..a21403df6663 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -#include <sys/ucontext.h> #define __FRAME_OFFSETS +#include <linux/errno.h> +#include <linux/string.h> +#include <sys/ucontext.h> #include <asm/ptrace.h> +#include <asm/sigcontext.h> #include <sysdep/ptrace.h> #include <sysdep/mcontext.h> #include <arch.h> @@ -18,6 +21,10 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY2(UESP, ESP); /* sic */ COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#undef COPY2 +#undef COPY +#undef COPY_SEG +#undef COPY_SEG_CPL3 #else #define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y] #define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X] @@ -29,6 +36,8 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY2(EFLAGS, EFL); COPY2(CS, CSGSFS); regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48; +#undef COPY2 +#undef COPY #endif } @@ -42,3 +51,210 @@ void mc_set_rip(void *_mc, void *target) mc->gregs[REG_RIP] = (unsigned long)target; #endif } + +/* Same thing, but the copy macros are turned around. 
*/ +void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, int single_stepping) +{ +#ifdef __i386__ +#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X] +#define COPY(X) mc->gregs[REG_##X] = regs->gp[X] +#define COPY_SEG(X) mc->gregs[REG_##X] = regs->gp[X] & 0xffff; +#define COPY_SEG_CPL3(X) mc->gregs[REG_##X] = (regs->gp[X] & 0xffff) | 3; + COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS); + COPY(EDI); COPY(ESI); COPY(EBP); + COPY2(UESP, ESP); /* sic */ + COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); + COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#else +#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X/sizeof(unsigned long)] +#define COPY(X) mc->gregs[REG_##X] = regs->gp[X/sizeof(unsigned long)] + COPY(R8); COPY(R9); COPY(R10); COPY(R11); + COPY(R12); COPY(R13); COPY(R14); COPY(R15); + COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX); + COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP); + COPY(RIP); + COPY2(EFLAGS, EFL); + mc->gregs[REG_CSGSFS] = mc->gregs[REG_CSGSFS] & 0xffffffffffffl; + mc->gregs[REG_CSGSFS] |= (regs->gp[SS / sizeof(unsigned long)] & 0xffff) << 48; +#endif + + if (single_stepping) + mc->gregs[REG_EFL] |= X86_EFLAGS_TF; + else + mc->gregs[REG_EFL] &= ~X86_EFLAGS_TF; +} + +#ifdef CONFIG_X86_32 +struct _xstate_64 { + struct _fpstate_64 fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ +}; + +/* Not quite the right structures as these contain more information */ +int um_i387_from_fxsr(struct _fpstate_32 *i387, + const struct _fpstate_64 *fxsave); +int um_fxsr_from_i387(struct _fpstate_64 *fxsave, + const struct _fpstate_32 *from); +#else +#define _xstate_64 _xstate +#endif + +static struct _fpstate *get_fpstate(struct stub_data *data, + mcontext_t *mcontext, + int *fp_size) +{ + struct _fpstate *res; + + /* Assume floating point registers are on the same page */ + res = (void *)(((unsigned long)mcontext->fpregs & + (UM_KERN_PAGE_SIZE - 1)) + + (unsigned long)&data->sigstack[0]); + + if ((void *)res + sizeof(struct _fpstate) > + (void *)data->sigstack + sizeof(data->sigstack)) + return NULL; + + if (res->sw_reserved.magic1 != FP_XSTATE_MAGIC1) { + *fp_size = sizeof(struct _fpstate); + } else { + char *magic2_addr; + + magic2_addr = (void *)res; + magic2_addr += res->sw_reserved.extended_size; + magic2_addr -= FP_XSTATE_MAGIC2_SIZE; + + /* We still need to be within our stack */ + if ((void *)magic2_addr > + (void *)data->sigstack + sizeof(data->sigstack)) + return NULL; + + /* If we do not read MAGIC2, then we did something wrong */ + if (*(__u32 *)magic2_addr != FP_XSTATE_MAGIC2) + return NULL; + + /* Remove MAGIC2 from the size, we do not save/restore it */ + *fp_size = res->sw_reserved.extended_size - + FP_XSTATE_MAGIC2_SIZE; + } + + return res; +} + +int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + unsigned long *fp_size_out) +{ + mcontext_t *mcontext; + struct _fpstate *fpstate_stub; + struct _xstate_64 *xstate_stub; + int fp_size, xstate_size; + + /* mctx_offset is verified by wait_stub_done_seccomp */ + mcontext = (void *)&data->sigstack[data->mctx_offset]; + + get_regs_from_mc(regs, mcontext); + + fpstate_stub = get_fpstate(data, mcontext, &fp_size); + if (!fpstate_stub) + return -EINVAL; + +#ifdef CONFIG_X86_32 + xstate_stub = (void *)&fpstate_stub->_fxsr_env; + xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env); +#else + xstate_stub = (void *)fpstate_stub; + xstate_size = fp_size; +#endif + + if (fp_size_out) + *fp_size_out = xstate_size; + + if 
(xstate_size > host_fp_size) + return -ENOSPC; + + memcpy(®s->fp, xstate_stub, xstate_size); + + /* We do not need to read the x86_64 FS_BASE/GS_BASE registers as + * we do not permit userspace to set them directly. + */ + +#ifdef CONFIG_X86_32 + /* Read the i387 legacy FP registers */ + if (um_fxsr_from_i387((void *)®s->fp, fpstate_stub)) + return -EINVAL; +#endif + + return 0; +} + +/* Copied because we cannot include regset.h here. */ +struct task_struct; +struct user_regset; +struct membuf { + void *p; + size_t left; +}; + +int fpregs_legacy_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to); + +int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + int single_stepping) +{ + mcontext_t *mcontext; + struct _fpstate *fpstate_stub; + struct _xstate_64 *xstate_stub; + int fp_size, xstate_size; + + /* mctx_offset is verified by wait_stub_done_seccomp */ + mcontext = (void *)&data->sigstack[data->mctx_offset]; + + if ((unsigned long)mcontext < (unsigned long)data->sigstack || + (unsigned long)mcontext > + (unsigned long) data->sigstack + + sizeof(data->sigstack) - sizeof(*mcontext)) + return -EINVAL; + + get_mc_from_regs(regs, mcontext, single_stepping); + + fpstate_stub = get_fpstate(data, mcontext, &fp_size); + if (!fpstate_stub) + return -EINVAL; + +#ifdef CONFIG_X86_32 + xstate_stub = (void *)&fpstate_stub->_fxsr_env; + xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env); +#else + xstate_stub = (void *)fpstate_stub; + xstate_size = fp_size; +#endif + + memcpy(xstate_stub, ®s->fp, xstate_size); + +#ifdef __i386__ + /* + * On x86, the GDT entries are updated by arch_set_tls. + */ + + /* Store the i387 legacy FP registers which the host will use */ + if (um_i387_from_fxsr(fpstate_stub, (void *)®s->fp)) + return -EINVAL; +#else + /* + * On x86_64, we need to sync the FS_BASE/GS_BASE registers using the + * arch specific data. + */ + if (data->arch_data.fs_base != regs->gp[FS_BASE / sizeof(unsigned long)]) { + data->arch_data.fs_base = regs->gp[FS_BASE / sizeof(unsigned long)]; + data->arch_data.sync |= STUB_SYNC_FS_BASE; + } + if (data->arch_data.gs_base != regs->gp[GS_BASE / sizeof(unsigned long)]) { + data->arch_data.gs_base = regs->gp[GS_BASE / sizeof(unsigned long)]; + data->arch_data.sync |= STUB_SYNC_GS_BASE; + } +#endif + + return 0; +} diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 57c504fd5626..3275870330fe 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -25,7 +25,8 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) return tmp; } -static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +static inline unsigned long +twd_fxsr_to_i387(const struct user_fxsr_struct *fxsave) { struct _fpxreg *st = NULL; unsigned long twd = (unsigned long) fxsave->twd; @@ -69,12 +70,16 @@ static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) return ret; } -/* Get/set the old 32bit i387 registers (pre-FPX) */ -static int fpregs_legacy_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) +/* + * Get/set the old 32bit i387 registers (pre-FPX) + * + * We provide simple wrappers for mcontext.c, they are only defined locally + * because mcontext.c is userspace facing and needs to a different definition + * of the structures. 
+ */ +static int _um_i387_from_fxsr(struct membuf to, + const struct user_fxsr_struct *fxsave) { - struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; int i; membuf_store(&to, (unsigned long)fxsave->cwd | 0xffff0000ul); @@ -91,23 +96,36 @@ static int fpregs_legacy_get(struct task_struct *target, return 0; } -static int fpregs_legacy_set(struct task_struct *target, +int um_i387_from_fxsr(struct user_i387_struct *i387, + const struct user_fxsr_struct *fxsave); + +int um_i387_from_fxsr(struct user_i387_struct *i387, + const struct user_fxsr_struct *fxsave) +{ + struct membuf to = { + .p = i387, + .left = sizeof(*i387), + }; + + return _um_i387_from_fxsr(to, fxsave); +} + +static int fpregs_legacy_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) { struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; - const struct user_i387_struct *from; - struct user_i387_struct buf; - int i; - if (ubuf) { - if (copy_from_user(&buf, ubuf, sizeof(buf))) - return -EFAULT; - from = &buf; - } else { - from = kbuf; - } + return _um_i387_from_fxsr(to, fxsave); +} + +int um_fxsr_from_i387(struct user_fxsr_struct *fxsave, + const struct user_i387_struct *from); + +int um_fxsr_from_i387(struct user_fxsr_struct *fxsave, + const struct user_i387_struct *from) +{ + int i; fxsave->cwd = (unsigned short)(from->cwd & 0xffff); fxsave->swd = (unsigned short)(from->swd & 0xffff); @@ -125,6 +143,26 @@ static int fpregs_legacy_set(struct task_struct *target, return 0; } + +static int fpregs_legacy_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; + const struct user_i387_struct *from; + struct user_i387_struct buf; + + if (ubuf) { + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + from = &buf; + } else { + from = kbuf; + } + + return um_fxsr_from_i387(fxsave, &buf); +} #endif static int genregs_get(struct task_struct *target, diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 48de3a71f845..6fd1ed400399 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -4,7 +4,9 @@ #include <linux/elf.h> #include <linux/crypto.h> #include <linux/kbuild.h> +#include <linux/audit.h> #include <asm/mman.h> +#include <asm/seccomp.h> /* workaround for a warning with -Wmissing-prototypes */ void foo(void); diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h index b724c54da316..6fe490cc5b98 100644 --- a/arch/x86/um/shared/sysdep/mcontext.h +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -6,7 +6,16 @@ #ifndef __SYS_SIGCONTEXT_X86_H #define __SYS_SIGCONTEXT_X86_H +#include <stub-data.h> + extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *); +extern void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, + int single_stepping); + +extern int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + unsigned long *fp_size_out); +extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + int single_stepping); #ifdef __i386__ diff --git a/arch/x86/um/shared/sysdep/stub-data.h b/arch/x86/um/shared/sysdep/stub-data.h new file mode 100644 index 000000000000..82b1b7f8ac3d --- /dev/null +++ 
b/arch/x86/um/shared/sysdep/stub-data.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_STUB_DATA_H +#define __ARCH_STUB_DATA_H + +#ifdef __i386__ +#include <generated/asm-offsets.h> +#include <asm/ldt.h> + +struct stub_data_arch { + int sync; + struct user_desc tls[UM_KERN_GDT_ENTRY_TLS_ENTRIES]; +}; +#else +#define STUB_SYNC_FS_BASE (1 << 0) +#define STUB_SYNC_GS_BASE (1 << 1) +struct stub_data_arch { + int sync; + unsigned long fs_base; + unsigned long gs_base; +}; +#endif + +#endif /* __ARCH_STUB_DATA_H */ diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h index dc89f4423454..4fa58f5b4fca 100644 --- a/arch/x86/um/shared/sysdep/stub.h +++ b/arch/x86/um/shared/sysdep/stub.h @@ -13,3 +13,5 @@ extern void stub_segv_handler(int, siginfo_t *, void *); extern void stub_syscall_handler(void); +extern void stub_signal_interrupt(int, siginfo_t *, void *); +extern void stub_signal_restorer(void); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 390988132c0a..df568fc3ceb4 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -131,4 +131,17 @@ static __always_inline void *get_stub_data(void) "call *%%eax ;" \ :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ "i" (&fn)) + +static __always_inline void +stub_seccomp_restore_state(struct stub_data_arch *arch) +{ + for (int i = 0; i < sizeof(arch->tls) / sizeof(arch->tls[0]); i++) { + if (arch->sync & (1 << i)) + stub_syscall1(__NR_set_thread_area, + (unsigned long) &arch->tls[i]); + } + + arch->sync = 0; +} + #endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 294affbec742..9cfd31afa769 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -10,6 +10,7 @@ #include <sysdep/ptrace_user.h> #include <generated/asm-offsets.h> #include <linux/stddef.h> +#include <asm/prctl.h> #define STUB_MMAP_NR __NR_mmap #define MMAP_OFFSET(o) (o) @@ -134,4 +135,20 @@ static __always_inline void *get_stub_data(void) "call *%%rax ;" \ :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ "i" (&fn)) + +static __always_inline void +stub_seccomp_restore_state(struct stub_data_arch *arch) +{ + /* + * We could use _writefsbase_u64/_writegsbase_u64 if the host reports + * support in the hwcaps (HWCAP2_FSGSBASE). + */ + if (arch->sync & STUB_SYNC_FS_BASE) + stub_syscall2(__NR_arch_prctl, ARCH_SET_FS, arch->fs_base); + if (arch->sync & STUB_SYNC_GS_BASE) + stub_syscall2(__NR_arch_prctl, ARCH_SET_GS, arch->gs_base); + + arch->sync = 0; +} + #endif diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index fbb129023080..cb3f17627d16 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -12,6 +12,7 @@ #include <skas.h> #include <sysdep/tls.h> #include <asm/desc.h> +#include <stub-data.h> /* * If needed we can detect when it's uninitialized. 
@@ -21,14 +22,25 @@ static int host_supports_tls = -1; int host_gdt_entry_tls_min; -static int do_set_thread_area(struct user_desc *info) +static int do_set_thread_area(struct task_struct* task, struct user_desc *info) { int ret; - u32 cpu; - cpu = get_cpu(); - ret = os_set_thread_area(info, userspace_pid[cpu]); - put_cpu(); + if (info->entry_number < host_gdt_entry_tls_min || + info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES) + return -EINVAL; + + if (using_seccomp) { + int idx = info->entry_number - host_gdt_entry_tls_min; + struct stub_data *data = (void *)task->mm->context.id.stack; + + data->arch_data.tls[idx] = *info; + data->arch_data.sync |= BIT(idx); + + return 0; + } + + ret = os_set_thread_area(info, task->mm->context.id.pid); if (ret) printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, " @@ -97,7 +109,7 @@ static int load_TLS(int flags, struct task_struct *to) if (!(flags & O_FORCE) && curr->flushed) continue; - ret = do_set_thread_area(&curr->tls); + ret = do_set_thread_area(current, &curr->tls); if (ret) goto out; @@ -275,7 +287,7 @@ SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc) return -EFAULT; } - ret = do_set_thread_area(&info); + ret = do_set_thread_area(current, &info); if (ret) return ret; return set_tls_entry(current, &info, idx, 1); |
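
The FD-passing branch of wait_stub_done_seccomp() above uses the standard SCM_RIGHTS ancillary-data mechanism over the AF_UNIX socketpair that start_userspace() hands to the stub. Below is a minimal, self-contained sketch of that pattern; send_fds() and MAX_PASSED_FDS are illustrative names, not identifiers from the patch.

/*
 * Sketch of passing file descriptors over an AF_UNIX socket with
 * SCM_RIGHTS, mirroring the sendmsg() call in wait_stub_done_seccomp().
 * Names and the MAX_PASSED_FDS limit are illustrative only.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#define MAX_PASSED_FDS 32	/* illustrative bound for the control buffer */

static int send_fds(int sock, const int *fds, unsigned int nfds)
{
	char dummy = 0;
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {				/* guarantees cmsghdr alignment */
		char buf[CMSG_SPACE(sizeof(int) * MAX_PASSED_FDS)];
		struct cmsghdr align;
	} ctrl;
	struct msghdr msgh = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = ctrl.buf,
	};
	struct cmsghdr *cmsg;

	if (nfds == 0 || nfds > MAX_PASSED_FDS)
		return -1;

	msgh.msg_controllen = CMSG_SPACE(sizeof(int) * nfds);
	cmsg = CMSG_FIRSTHDR(&msgh);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * nfds);
	memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * nfds);

	/* At least one byte of ordinary data must accompany the fds */
	return sendmsg(sock, &msgh, 0);
}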
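
The futex loop in wait_stub_done_seccomp() above is one half of a handshake on a word in the shared stub_data page: the stub updates the word and issues FUTEX_WAKE, while the kernel side blocks in FUTEX_WAIT for as long as the word still reads FUTEX_IN_CHILD. A stripped-down sketch of the waiting side follows; the FUTEX_IN_CHILD value and the wait_for_stub() name are assumptions for illustration, the real definitions live in the UML stub headers. EAGAIN simply means the stub already advanced the word before the waiter slept, which is why the loop re-checks the word instead of treating it as an error.

/*
 * Sketch of the kernel-side half of the futex handshake used by
 * wait_stub_done_seccomp(): block until the stub flips the shared word
 * away from FUTEX_IN_CHILD. Constants and names are illustrative.
 */
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#define FUTEX_IN_CHILD 0	/* assumed value, for illustration only */

static int wait_for_stub(volatile uint32_t *futex)
{
	while (*futex == FUTEX_IN_CHILD) {
		long ret = syscall(SYS_futex, futex, FUTEX_WAIT,
				   FUTEX_IN_CHILD, NULL, NULL, 0);

		/* EAGAIN: word already changed; EINTR: signal, retry */
		if (ret < 0 && errno != EAGAIN && errno != EINTR)
			return -errno;
	}
	return 0;
}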
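
init_seccomp() above probes whether a SECCOMP_RET_TRAP filter can be installed and whether the resulting SIGSYS is actually delivered. A stand-alone approximation of that probe, using the same BPF filter layout but a trivial handler in place of the stub's mcontext bookkeeping, might look like this; it invokes clock_nanosleep directly so the trapped syscall number is hit regardless of how libc implements sleep().

/*
 * Stand-alone sketch of the probe performed by init_seccomp(): install a
 * filter that traps clock_nanosleep with SECCOMP_RET_TRAP, trigger it,
 * and observe SIGSYS. Everything outside the filter layout is simplified.
 */
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <signal.h>
#include <stddef.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static void on_sigsys(int sig, siginfo_t *info, void *uc)
{
	static const char msg[] = "trapped by SIGSYS\n";

	(void)sig; (void)info; (void)uc;
	write(STDOUT_FILENO, msg, sizeof(msg) - 1);	/* async-signal-safe */
	_exit(0);
}

int main(void)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};
	struct sigaction sa = { .sa_flags = SA_SIGINFO };
	struct timespec ts = { 0, 0 };

	sa.sa_sigaction = on_sigsys;
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSYS, &sa, NULL) < 0)
		return 1;

	/* Required so an unprivileged process may install the filter */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
		return 2;
	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog) != 0)
		return 3;

	/* Hits the trapped syscall number directly */
	syscall(__NR_clock_nanosleep, CLOCK_MONOTONIC, 0, &ts, NULL);
	return 4;	/* not reached: the handler exits */
}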