Diffstat (limited to 'arch')
-rw-r--r--  arch/alpha/include/asm/pgtable.h | 7
-rw-r--r--  arch/arc/include/asm/hugepage.h | 2
-rw-r--r--  arch/arc/include/asm/pgtable-levels.h | 2
-rw-r--r--  arch/arc/include/asm/syscall.h | 25
-rw-r--r--  arch/arm/include/asm/pgtable-3level.h | 1
-rw-r--r--  arch/arm/include/asm/pgtable.h | 1
-rw-r--r--  arch/arm/include/asm/syscall.h | 37
-rw-r--r--  arch/arm/mm/flush.c | 4
-rw-r--r--  arch/arm/mm/mmu.c | 2
-rw-r--r--  arch/arm/probes/uprobes/core.c | 4
-rw-r--r--  arch/arm64/Kconfig | 3
-rw-r--r--  arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts | 2
-rw-r--r--  arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts | 2
-rw-r--r--  arch/arm64/hyperv/mshyperv.c | 53
-rw-r--r--  arch/arm64/include/asm/kvm_host.h | 3
-rw-r--r--  arch/arm64/include/asm/pgtable-types.h | 20
-rw-r--r--  arch/arm64/include/asm/pgtable.h | 7
-rw-r--r--  arch/arm64/include/asm/ptdump.h | 24
-rw-r--r--  arch/arm64/include/asm/syscall.h | 29
-rw-r--r--  arch/arm64/include/asm/sysreg.h | 1
-rw-r--r--  arch/arm64/kernel/efi.c | 4
-rw-r--r--  arch/arm64/kernel/pi/map_kernel.c | 2
-rw-r--r--  arch/arm64/kernel/pi/map_range.c | 4
-rw-r--r--  arch/arm64/kernel/pi/pi.h | 2
-rw-r--r--  arch/arm64/kvm/arch_timer.c | 4
-rw-r--r--  arch/arm64/kvm/arm.c | 69
-rw-r--r--  arch/arm64/kvm/hypercalls.c | 10
-rw-r--r--  arch/arm64/kvm/nested.c | 6
-rw-r--r--  arch/arm64/kvm/vgic/vgic-debug.c | 5
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c | 31
-rw-r--r--  arch/arm64/kvm/vgic/vgic-its.c | 56
-rw-r--r--  arch/arm64/kvm/vgic/vgic-kvm-device.c | 12
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v4.c | 92
-rw-r--r--  arch/arm64/mm/mmap.c | 2
-rw-r--r--  arch/arm64/mm/mmu.c | 93
-rw-r--r--  arch/arm64/mm/ptdump.c | 50
-rw-r--r--  arch/csky/include/asm/pgalloc.h | 2
-rw-r--r--  arch/csky/include/asm/pgtable.h | 5
-rw-r--r--  arch/csky/include/asm/syscall.h | 13
-rw-r--r--  arch/hexagon/include/asm/pgtable.h | 3
-rw-r--r--  arch/hexagon/include/asm/syscall.h | 21
-rw-r--r--  arch/loongarch/include/asm/pgalloc.h | 2
-rw-r--r--  arch/loongarch/include/asm/pgtable.h | 7
-rw-r--r--  arch/loongarch/include/asm/syscall.h | 15
-rw-r--r--  arch/loongarch/mm/pgtable.c | 9
-rw-r--r--  arch/m68k/coldfire/gpio.c | 8
-rw-r--r--  arch/m68k/include/asm/mcf_pgalloc.h | 8
-rw-r--r--  arch/m68k/include/asm/mcf_pgtable.h | 6
-rw-r--r--  arch/m68k/include/asm/motorola_pgalloc.h | 10
-rw-r--r--  arch/m68k/include/asm/motorola_pgtable.h | 6
-rw-r--r--  arch/m68k/include/asm/sun3_pgtable.h | 6
-rw-r--r--  arch/m68k/include/asm/syscall.h | 7
-rw-r--r--  arch/m68k/kernel/setup_no.c | 3
-rw-r--r--  arch/m68k/kernel/uboot.c | 2
-rw-r--r--  arch/m68k/mm/motorola.c | 9
-rw-r--r--  arch/microblaze/include/asm/pgtable.h | 8
-rw-r--r--  arch/microblaze/include/asm/syscall.h | 7
-rw-r--r--  arch/microblaze/mm/pgtable.c | 2
-rw-r--r--  arch/mips/include/asm/pgalloc.h | 2
-rw-r--r--  arch/mips/include/asm/pgtable.h | 9
-rw-r--r--  arch/mips/include/asm/syscall.h | 43
-rw-r--r--  arch/mips/mm/pgtable-32.c | 10
-rw-r--r--  arch/mips/mm/pgtable-64.c | 9
-rw-r--r--  arch/nios2/include/asm/pgtable.h | 6
-rw-r--r--  arch/nios2/include/asm/syscall.h | 16
-rw-r--r--  arch/openrisc/include/asm/pgtable.h | 2
-rw-r--r--  arch/openrisc/include/asm/syscall.h | 13
-rw-r--r--  arch/openrisc/mm/ioremap.c | 2
-rw-r--r--  arch/parisc/boot/compressed/Makefile | 1
-rw-r--r--  arch/parisc/include/asm/alternative.h | 4
-rw-r--r--  arch/parisc/include/asm/assembly.h | 4
-rw-r--r--  arch/parisc/include/asm/barrier.h | 4
-rw-r--r--  arch/parisc/include/asm/cache.h | 4
-rw-r--r--  arch/parisc/include/asm/current.h | 4
-rw-r--r--  arch/parisc/include/asm/dwarf.h | 4
-rw-r--r--  arch/parisc/include/asm/fixmap.h | 4
-rw-r--r--  arch/parisc/include/asm/ftrace.h | 4
-rw-r--r--  arch/parisc/include/asm/jump_label.h | 4
-rw-r--r--  arch/parisc/include/asm/kexec.h | 4
-rw-r--r--  arch/parisc/include/asm/kgdb.h | 2
-rw-r--r--  arch/parisc/include/asm/linkage.h | 4
-rw-r--r--  arch/parisc/include/asm/page.h | 6
-rw-r--r--  arch/parisc/include/asm/pdc.h | 4
-rw-r--r--  arch/parisc/include/asm/pdcpat.h | 4
-rw-r--r--  arch/parisc/include/asm/pgalloc.h | 2
-rw-r--r--  arch/parisc/include/asm/pgtable.h | 14
-rw-r--r--  arch/parisc/include/asm/prefetch.h | 4
-rw-r--r--  arch/parisc/include/asm/processor.h | 8
-rw-r--r--  arch/parisc/include/asm/psw.h | 4
-rw-r--r--  arch/parisc/include/asm/signal.h | 4
-rw-r--r--  arch/parisc/include/asm/smp.h | 4
-rw-r--r--  arch/parisc/include/asm/spinlock_types.h | 4
-rw-r--r--  arch/parisc/include/asm/syscall.h | 19
-rw-r--r--  arch/parisc/include/asm/thread_info.h | 4
-rw-r--r--  arch/parisc/include/asm/traps.h | 2
-rw-r--r--  arch/parisc/include/asm/unistd.h | 4
-rw-r--r--  arch/parisc/include/asm/vdso.h | 4
-rw-r--r--  arch/parisc/include/uapi/asm/pdc.h | 4
-rw-r--r--  arch/parisc/include/uapi/asm/signal.h | 4
-rw-r--r--  arch/parisc/kernel/unaligned.c | 2
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 1
-rw-r--r--  arch/powerpc/include/asm/pgtable.h | 3
-rw-r--r--  arch/powerpc/include/asm/syscall.h | 20
-rw-r--r--  arch/powerpc/mm/book3s64/pgtable.c | 7
-rw-r--r--  arch/powerpc/mm/pgtable-frag.c | 30
-rw-r--r--  arch/powerpc/mm/ptdump/ptdump.c | 46
-rw-r--r--  arch/riscv/include/asm/pgtable-64.h | 2
-rw-r--r--  arch/riscv/include/asm/pgtable.h | 2
-rw-r--r--  arch/riscv/include/asm/syscall.h | 19
-rw-r--r--  arch/riscv/kvm/aia_device.c | 34
-rw-r--r--  arch/riscv/mm/cacheflush.c | 2
-rw-r--r--  arch/riscv/mm/init.c | 26
-rw-r--r--  arch/riscv/mm/ptdump.c | 46
-rw-r--r--  arch/s390/include/asm/gmap.h | 2
-rw-r--r--  arch/s390/include/asm/gmap_helpers.h | 15
-rw-r--r--  arch/s390/include/asm/pgalloc.h | 2
-rw-r--r--  arch/s390/include/asm/pgtable.h | 11
-rw-r--r--  arch/s390/include/asm/syscall.h | 21
-rw-r--r--  arch/s390/include/asm/tlb.h | 5
-rw-r--r--  arch/s390/include/asm/uv.h | 1
-rw-r--r--  arch/s390/kernel/uv.c | 97
-rw-r--r--  arch/s390/kvm/Makefile | 2
-rw-r--r--  arch/s390/kvm/diag.c | 30
-rw-r--r--  arch/s390/kvm/gaccess.c | 3
-rw-r--r--  arch/s390/kvm/gmap-vsie.c | 1
-rw-r--r--  arch/s390/kvm/gmap.c | 121
-rw-r--r--  arch/s390/kvm/gmap.h | 39
-rw-r--r--  arch/s390/kvm/intercept.c | 9
-rw-r--r--  arch/s390/kvm/kvm-s390.c | 10
-rw-r--r--  arch/s390/kvm/kvm-s390.h | 42
-rw-r--r--  arch/s390/kvm/priv.c | 6
-rw-r--r--  arch/s390/kvm/pv.c | 61
-rw-r--r--  arch/s390/kvm/vsie.c | 19
-rw-r--r--  arch/s390/lib/crypto/Makefile | 3
-rw-r--r--  arch/s390/mm/Makefile | 2
-rw-r--r--  arch/s390/mm/dump_pagetables.c | 46
-rw-r--r--  arch/s390/mm/fault.c | 1
-rw-r--r--  arch/s390/mm/gmap.c | 185
-rw-r--r--  arch/s390/mm/gmap_helpers.c | 221
-rw-r--r--  arch/s390/mm/init.c | 1
-rw-r--r--  arch/s390/mm/pgalloc.c | 4
-rw-r--r--  arch/s390/mm/pgtable.c | 1
-rw-r--r--  arch/sh/include/asm/pgtable_32.h | 8
-rw-r--r--  arch/sh/include/asm/syscall_32.h | 24
-rw-r--r--  arch/sparc/include/asm/pgtable_32.h | 15
-rw-r--r--  arch/sparc/include/asm/pgtable_64.h | 2
-rw-r--r--  arch/sparc/include/asm/syscall.h | 22
-rw-r--r--  arch/sparc/mm/init_64.c | 29
-rw-r--r--  arch/sparc/mm/srmmu.c | 2
-rw-r--r--  arch/um/include/asm/pgtable-2level.h | 1
-rw-r--r--  arch/um/include/asm/pgtable-4level.h | 9
-rw-r--r--  arch/um/include/asm/pgtable.h | 18
-rw-r--r--  arch/um/include/asm/syscall-generic.h | 19
-rw-r--r--  arch/x86/Kconfig | 3
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 50
-rw-r--r--  arch/x86/coco/sev/core.c | 13
-rw-r--r--  arch/x86/hyperv/hv_init.c | 67
-rw-r--r--  arch/x86/hyperv/hv_vtl.c | 61
-rw-r--r--  arch/x86/hyperv/ivm.c | 11
-rw-r--r--  arch/x86/include/asm/apic.h | 8
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 4
-rw-r--r--  arch/x86/include/asm/kvm-x86-ops.h | 2
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 9
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 7
-rw-r--r--  arch/x86/include/asm/msr.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable.h | 21
-rw-r--r--  arch/x86/include/asm/posted_intr.h | 78
-rw-r--r--  arch/x86/include/asm/set_memory.h | 2
-rw-r--r--  arch/x86/include/asm/setup.h | 2
-rw-r--r--  arch/x86/include/asm/svm.h | 10
-rw-r--r--  arch/x86/include/asm/syscall.h | 43
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h | 1
-rw-r--r--  arch/x86/include/uapi/asm/setup_data.h | 13
-rw-r--r--  arch/x86/include/uapi/asm/svm.h | 2
-rw-r--r--  arch/x86/kernel/acpi/madt_wakeup.c | 2
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 8
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 2
-rw-r--r--  arch/x86/kernel/crash.c | 26
-rw-r--r--  arch/x86/kernel/e820.c | 18
-rw-r--r--  arch/x86/kernel/irq.c | 63
-rw-r--r--  arch/x86/kernel/kexec-bzimage64.c | 58
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 22
-rw-r--r--  arch/x86/kernel/setup.c | 42
-rw-r--r--  arch/x86/kernel/smpboot.c | 10
-rw-r--r--  arch/x86/kvm/cpuid.c | 8
-rw-r--r--  arch/x86/kvm/ioapic.c | 7
-rw-r--r--  arch/x86/kvm/ioapic.h | 2
-rw-r--r--  arch/x86/kvm/irq_comm.c | 37
-rw-r--r--  arch/x86/kvm/lapic.c | 28
-rw-r--r--  arch/x86/kvm/lapic.h | 4
-rw-r--r--  arch/x86/kvm/mmu/mmu.c | 5
-rw-r--r--  arch/x86/kvm/mmu/tdp_mmu.c | 19
-rw-r--r--  arch/x86/kvm/svm/nested.c | 36
-rw-r--r--  arch/x86/kvm/svm/sev.c | 185
-rw-r--r--  arch/x86/kvm/svm/svm.c | 162
-rw-r--r--  arch/x86/kvm/svm/svm.h | 14
-rw-r--r--  arch/x86/kvm/vmx/common.h | 4
-rw-r--r--  arch/x86/kvm/vmx/main.c | 208
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 48
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.c | 17
-rw-r--r--  arch/x86/kvm/vmx/posted_intr.h | 5
-rw-r--r--  arch/x86/kvm/vmx/vmenter.S | 3
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 23
-rw-r--r--  arch/x86/kvm/vmx/vmx.h | 3
-rw-r--r--  arch/x86/kvm/vmx/x86_ops.h | 66
-rw-r--r--  arch/x86/kvm/x86.c | 44
-rw-r--r--  arch/x86/kvm/x86.h | 18
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 71
-rw-r--r--  arch/x86/mm/init_64.c | 15
-rw-r--r--  arch/x86/mm/ioremap.c | 7
-rw-r--r--  arch/x86/mm/pat/memtype.c | 194
-rw-r--r--  arch/x86/mm/pat/memtype_interval.c | 63
-rw-r--r--  arch/x86/mm/pat/set_memory.c | 13
-rw-r--r--  arch/x86/mm/pgtable.c | 9
-rw-r--r--  arch/x86/realmode/init.c | 2
-rw-r--r--  arch/xtensa/Kbuild | 2
-rw-r--r--  arch/xtensa/Kconfig | 3
-rw-r--r--  arch/xtensa/boot/dts/Makefile | 2
-rw-r--r--  arch/xtensa/configs/audio_kc705_defconfig | 2
-rw-r--r--  arch/xtensa/configs/cadence_csp_defconfig | 2
-rw-r--r--  arch/xtensa/configs/common_defconfig | 1
-rw-r--r--  arch/xtensa/configs/generic_kc705_defconfig | 2
-rw-r--r--  arch/xtensa/configs/nommu_kc705_defconfig | 2
-rw-r--r--  arch/xtensa/configs/smp_lx200_defconfig | 2
-rw-r--r--  arch/xtensa/configs/virt_defconfig | 2
-rw-r--r--  arch/xtensa/configs/xip_kc705_defconfig | 2
-rw-r--r--  arch/xtensa/include/asm/pgtable.h | 6
-rw-r--r--  arch/xtensa/include/asm/ptrace.h | 5
-rw-r--r--  arch/xtensa/include/asm/syscall.h | 18
230 files changed, 2678 insertions, 1795 deletions
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 02e8817a8921..2676017f42f1 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -192,13 +192,6 @@ extern unsigned long __zero_page(void);
#define pte_pfn(pte) (pte_val(pte) >> PFN_PTE_SHIFT)
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
-#define mk_pte(page, pgprot) \
-({ \
- pte_t pte; \
- \
- pte_val(pte) = (page_to_pfn(page) << 32) | pgprot_val(pgprot); \
- pte; \
-})
extern inline pte_t pfn_pte(unsigned long physpfn, pgprot_t pgprot)
{ pte_t pte; pte_val(pte) = (PHYS_TWIDDLE(physpfn) << 32) | pgprot_val(pgprot); return pte; }
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 8a2441670a8f..7765dc105d54 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -40,8 +40,6 @@ static inline pmd_t pte_pmd(pte_t pte)
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
-#define mk_pmd(page, prot) pte_pmd(mk_pte(page, prot))
-
#define pmd_trans_huge(pmd) (pmd_val(pmd) & _PAGE_HW_SZ)
#define pfn_pmd(pfn, prot) (__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)))
diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h
index 86e148226463..d1ce4b0f1071 100644
--- a/arch/arc/include/asm/pgtable-levels.h
+++ b/arch/arc/include/asm/pgtable-levels.h
@@ -142,7 +142,6 @@
#define pmd_pfn(pmd) ((pmd_val(pmd) & PMD_MASK) >> PAGE_SHIFT)
#define pfn_pmd(pfn,prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
#endif
@@ -177,7 +176,6 @@
#define set_pte(ptep, pte) ((*(ptep)) = (pte))
#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT)
#define pfn_pte(pfn, prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot))
-#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot)
#ifdef CONFIG_ISA_ARCV2
#define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ)
diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h
index 9709256e31c8..728d625a10f1 100644
--- a/arch/arc/include/asm/syscall.h
+++ b/arch/arc/include/asm/syscall.h
@@ -24,6 +24,17 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
}
static inline void
+syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr)
+{
+ /*
+ * Unlike syscall_get_nr(), syscall_set_nr() can be called only when
+ * the target task is stopped for tracing on entering syscall, so
+ * there is no need to have the same check syscall_get_nr() has.
+ */
+ regs->r8 = nr;
+}
+
+static inline void
syscall_rollback(struct task_struct *task, struct pt_regs *regs)
{
regs->r0 = regs->orig_r0;
@@ -67,6 +78,20 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs,
}
}
+static inline void
+syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *args)
+{
+ unsigned long *inside_ptregs = &regs->r0;
+ unsigned int n = 6;
+ unsigned int i = 0;
+
+ while (n--) {
+ *inside_ptregs = args[i++];
+ inside_ptregs--;
+ }
+}
+
static inline int
syscall_get_arch(struct task_struct *task)
{
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index fa5939eb9864..7b71a3d414b7 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -209,7 +209,6 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
#define pmd_pfn(pmd) (((pmd_val(pmd) & PMD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
#define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
-#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
/* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */
#define pmdp_establish generic_pmdp_establish
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 6b986ef6042f..7f1c3b4e3e04 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -168,7 +168,6 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
#define pfn_pte(pfn,prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot))
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
-#define mk_pte(page,prot) pfn_pte(page_to_pfn(page), prot)
#define pte_clear(mm,addr,ptep) set_pte_ext(ptep, __pte(0), 0)
diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h
index fe4326d938c1..18b102a30741 100644
--- a/arch/arm/include/asm/syscall.h
+++ b/arch/arm/include/asm/syscall.h
@@ -68,6 +68,30 @@ static inline void syscall_set_return_value(struct task_struct *task,
regs->ARM_r0 = (long) error ? error : val;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ if (nr == -1) {
+ task_thread_info(task)->abi_syscall = -1;
+ /*
+ * When the syscall number is set to -1, the syscall will be
+ * skipped. In this case the syscall return value has to be
+ * set explicitly, otherwise the first syscall argument is
+ * returned as the syscall return value.
+ */
+ syscall_set_return_value(task, regs, -ENOSYS, 0);
+ return;
+ }
+ if ((IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT))) {
+ task_thread_info(task)->abi_syscall = nr;
+ return;
+ }
+ task_thread_info(task)->abi_syscall =
+ (task_thread_info(task)->abi_syscall & ~__NR_SYSCALL_MASK) |
+ (nr & __NR_SYSCALL_MASK);
+}
+
#define SYSCALL_MAX_ARGS 7
static inline void syscall_get_arguments(struct task_struct *task,
@@ -80,6 +104,19 @@ static inline void syscall_get_arguments(struct task_struct *task,
memcpy(args, &regs->ARM_r0 + 1, 5 * sizeof(args[0]));
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ memcpy(&regs->ARM_r0, args, 6 * sizeof(args[0]));
+ /*
+ * Also copy the first argument into ARM_ORIG_r0
+ * so that syscall_get_arguments() would return it
+ * instead of the previous value.
+ */
+ regs->ARM_ORIG_r0 = regs->ARM_r0;
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
/* ARM tasks don't change audit architectures on the fly. */
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 0749cf8a6637..5219158d54cf 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -227,9 +227,9 @@ void __flush_dcache_folio(struct address_space *mapping, struct folio *folio)
}
/*
- * If this is a page cache page, and we have an aliasing VIPT cache,
+ * If this is a page cache folio, and we have an aliasing VIPT cache,
* we only need to do one flush - which would be at the relevant
- * userspace colour, which is congruent with page->index.
+ * userspace colour, which is congruent with folio->index.
*/
if (mapping && cache_is_vipt_aliasing())
flush_pfn_alias(folio_pfn(folio), folio_pos(folio));
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index f02f872ea8a9..edb7f56b7c91 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -735,7 +735,7 @@ static void *__init late_alloc(unsigned long sz)
void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM,
get_order(sz));
- if (!ptdesc || !pagetable_pte_ctor(ptdesc))
+ if (!ptdesc || !pagetable_pte_ctor(NULL, ptdesc))
BUG();
return ptdesc_to_virt(ptdesc);
}
diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c
index f5f790c6e5f8..885e0c5e8c20 100644
--- a/arch/arm/probes/uprobes/core.c
+++ b/arch/arm/probes/uprobes/core.c
@@ -26,10 +26,10 @@ bool is_swbp_insn(uprobe_opcode_t *insn)
(UPROBE_SWBP_ARM_INSN & 0x0fffffff);
}
-int set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm,
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
- return uprobe_write_opcode(auprobe, mm, vaddr,
+ return uprobe_write_opcode(auprobe, vma, vaddr,
__opcode_to_mem_arm(auprobe->bpinsn));
}
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index bcc1773fec77..55fc331af337 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1616,6 +1616,9 @@ config ARCH_SUPPORTS_KEXEC_IMAGE_VERIFY_SIG
config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG
def_bool y
+config ARCH_SUPPORTS_KEXEC_HANDOVER
+ def_bool y
+
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
diff --git a/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts b/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts
index 712f29fbe85e..b9a0f7ac4d9c 100644
--- a/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts
+++ b/arch/arm64/boot/dts/qcom/qcm6490-shift-otter.dts
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: BSD-3-Clause
/*
* Copyright (c) 2023, Luca Weiss <luca.weiss@fairphone.com>
- * Copyright (c) 2024, Caleb Connolly <caleb@postmarketos.org>
+ * Copyright (c) 2024, Casey Connolly <casey.connolly@linaro.org>
*/
/dts-v1/;
diff --git a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts
index e5da58d11064..2cf7b5e1243c 100644
--- a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts
+++ b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2022, Alexander Martinz <amartinz@shiftphones.com>
- * Copyright (c) 2022, Caleb Connolly <caleb@connolly.tech>
+ * Copyright (c) 2022, Casey Connolly <casey.connolly@linaro.org>
* Copyright (c) 2022, Dylan Van Assche <me@dylanvanassche.be>
*/
diff --git a/arch/arm64/hyperv/mshyperv.c b/arch/arm64/hyperv/mshyperv.c
index 4e27cc29c79e..4fdc26ade1d7 100644
--- a/arch/arm64/hyperv/mshyperv.c
+++ b/arch/arm64/hyperv/mshyperv.c
@@ -28,6 +28,48 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
}
EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
+#ifdef CONFIG_ACPI
+
+static bool __init hyperv_detect_via_acpi(void)
+{
+ if (acpi_disabled)
+ return false;
+ /*
+ * Hypervisor ID is only available in ACPI v6+, and the
+ * structure layout was extended in v6 to accommodate that
+ * new field.
+ *
+ * At the very minimum, this check makes sure not to read
+ * past the FADT structure.
+ *
+ * It is also needed to catch running in some unknown
+ * non-Hyper-V environment that has ACPI 5.x or less.
+ * In such a case, it can't be Hyper-V.
+ */
+ if (acpi_gbl_FADT.header.revision < 6)
+ return false;
+ return strncmp((char *)&acpi_gbl_FADT.hypervisor_id, "MsHyperV", 8) == 0;
+}
+
+#else
+
+static bool __init hyperv_detect_via_acpi(void)
+{
+ return false;
+}
+
+#endif
+
+static bool __init hyperv_detect_via_smccc(void)
+{
+ uuid_t hyperv_uuid = UUID_INIT(
+ 0x58ba324d, 0x6447, 0x24cd,
+ 0x75, 0x6c, 0xef, 0x8e,
+ 0x24, 0x70, 0x59, 0x16);
+
+ return arm_smccc_hypervisor_has_uuid(&hyperv_uuid);
+}
+
static int __init hyperv_init(void)
{
struct hv_get_vp_registers_output result;
@@ -36,13 +78,11 @@ static int __init hyperv_init(void)
/*
* Allow for a kernel built with CONFIG_HYPERV to be running in
- * a non-Hyper-V environment, including on DT instead of ACPI.
+ * a non-Hyper-V environment.
+ *
* In such cases, do nothing and return success.
*/
- if (acpi_disabled)
- return 0;
-
- if (strncmp((char *)&acpi_gbl_FADT.hypervisor_id, "MsHyperV", 8))
+ if (!hyperv_detect_via_acpi() && !hyperv_detect_via_smccc())
return 0;
/* Setup the guest ID */
@@ -77,6 +117,9 @@ static int __init hyperv_init(void)
if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID)
hv_get_partition_id();
+ ms_hyperv.vtl = get_vtl();
+ if (ms_hyperv.vtl > 0) /* non default VTL */
+ pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl);
ms_hyperv_late_init();
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d941abc6b5ee..6ce2c5173482 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1320,9 +1320,6 @@ int __init populate_sysreg_config(const struct sys_reg_desc *sr,
unsigned int idx);
int __init populate_nv_trap_config(void);
-bool lock_all_vcpus(struct kvm *kvm);
-void unlock_all_vcpus(struct kvm *kvm);
-
void kvm_calculate_traps(struct kvm_vcpu *vcpu);
/* MMIO helpers */
diff --git a/arch/arm64/include/asm/pgtable-types.h b/arch/arm64/include/asm/pgtable-types.h
index 6d6d4065b0cb..265e8301d7ba 100644
--- a/arch/arm64/include/asm/pgtable-types.h
+++ b/arch/arm64/include/asm/pgtable-types.h
@@ -11,11 +11,19 @@
#include <asm/types.h>
-typedef u64 pteval_t;
-typedef u64 pmdval_t;
-typedef u64 pudval_t;
-typedef u64 p4dval_t;
-typedef u64 pgdval_t;
+/*
+ * Page Table Descriptor
+ *
+ * Generic page table descriptor format from which
+ * all level specific descriptors can be derived.
+ */
+typedef u64 ptdesc_t;
+
+typedef ptdesc_t pteval_t;
+typedef ptdesc_t pmdval_t;
+typedef ptdesc_t pudval_t;
+typedef ptdesc_t p4dval_t;
+typedef ptdesc_t pgdval_t;
/*
* These are used to make use of C type-checking..
@@ -46,7 +54,7 @@ typedef struct { pgdval_t pgd; } pgd_t;
#define pgd_val(x) ((x).pgd)
#define __pgd(x) ((pgd_t) { (x) } )
-typedef struct { pteval_t pgprot; } pgprot_t;
+typedef struct { ptdesc_t pgprot; } pgprot_t;
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 5285757ee0c1..88db8a0c0b37 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -673,7 +673,6 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
#define __phys_to_pmd_val(phys) __phys_to_pte_val(phys)
#define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
#define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)
#define pud_young(pud) pte_young(pud_pte(pud))
#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
@@ -906,12 +905,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
/* use ONLY for statically allocated translation tables */
#define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page,prot) pfn_pte(page_to_pfn(page),prot)
-
#if CONFIG_PGTABLE_LEVELS > 2
#define pmd_ERROR(e) \
diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
index b2931d1ae0fb..fded5358641f 100644
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -24,8 +24,8 @@ struct ptdump_info {
};
struct ptdump_prot_bits {
- u64 mask;
- u64 val;
+ ptdesc_t mask;
+ ptdesc_t val;
const char *set;
const char *clear;
};
@@ -34,7 +34,7 @@ struct ptdump_pg_level {
const struct ptdump_prot_bits *bits;
char name[4];
int num;
- u64 mask;
+ ptdesc_t mask;
};
/*
@@ -51,7 +51,7 @@ struct ptdump_pg_state {
const struct mm_struct *mm;
unsigned long start_address;
int level;
- u64 current_prot;
+ ptdesc_t current_prot;
bool check_wx;
unsigned long wx_pages;
unsigned long uxn_pages;
@@ -59,7 +59,13 @@ struct ptdump_pg_state {
void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- u64 val);
+ pteval_t val);
+void note_page_pte(struct ptdump_state *st, unsigned long addr, pte_t pte);
+void note_page_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd);
+void note_page_pud(struct ptdump_state *st, unsigned long addr, pud_t pud);
+void note_page_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d);
+void note_page_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd);
+void note_page_flush(struct ptdump_state *st);
#ifdef CONFIG_PTDUMP_DEBUGFS
#define EFI_RUNTIME_MAP_END DEFAULT_MAP_WINDOW_64
void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
@@ -69,7 +75,13 @@ static inline void ptdump_debugfs_register(struct ptdump_info *info,
#endif /* CONFIG_PTDUMP_DEBUGFS */
#else
static inline void note_page(struct ptdump_state *pt_st, unsigned long addr,
- int level, u64 val) { }
+ int level, pteval_t val) { }
+static inline void note_page_pte(struct ptdump_state *st, unsigned long addr, pte_t pte) { }
+static inline void note_page_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd) { }
+static inline void note_page_pud(struct ptdump_state *st, unsigned long addr, pud_t pud) { }
+static inline void note_page_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d) { }
+static inline void note_page_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd) { }
+static inline void note_page_flush(struct ptdump_state *st) { }
#endif /* CONFIG_PTDUMP */
#endif /* __ASM_PTDUMP_H */
diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h
index ab8e14b96f68..712daa90e643 100644
--- a/arch/arm64/include/asm/syscall.h
+++ b/arch/arm64/include/asm/syscall.h
@@ -61,6 +61,22 @@ static inline void syscall_set_return_value(struct task_struct *task,
regs->regs[0] = val;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->syscallno = nr;
+ if (nr == -1) {
+ /*
+ * When the syscall number is set to -1, the syscall will be
+ * skipped. In this case the syscall return value has to be
+ * set explicitly, otherwise the first syscall argument is
+ * returned as the syscall return value.
+ */
+ syscall_set_return_value(task, regs, -ENOSYS, 0);
+ }
+}
+
#define SYSCALL_MAX_ARGS 6
static inline void syscall_get_arguments(struct task_struct *task,
@@ -73,6 +89,19 @@ static inline void syscall_get_arguments(struct task_struct *task,
memcpy(args, &regs->regs[1], 5 * sizeof(args[0]));
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ memcpy(&regs->regs[0], args, 6 * sizeof(args[0]));
+ /*
+ * Also copy the first argument into orig_x0
+ * so that syscall_get_arguments() would return it
+ * instead of the previous value.
+ */
+ regs->orig_x0 = regs->regs[0];
+}
+
/*
* We don't care about endianness (__AUDIT_ARCH_LE bit) here because
* AArch64 has the same system calls both on little- and big- endian.
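A minimal caller sketch (illustrative only, not part of this diff) for the new setters, assuming the target task is stopped at syscall entry as the helpers above require; the wrapper name is hypothetical:

/* Hypothetical tracer-side helper: rewrite the syscall about to run. */
static void rewrite_syscall(struct task_struct *task, struct pt_regs *regs,
			    int nr, const unsigned long args[6])
{
	syscall_set_nr(task, regs, nr);		/* nr == -1 skips the syscall */
	syscall_set_arguments(task, regs, args);	/* also updates orig_x0 on arm64 */
}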
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index cd853801a8f7..f1bb0d10c39a 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -12,6 +12,7 @@
#include <linux/bits.h>
#include <linux/stringify.h>
#include <linux/kasan-tags.h>
+#include <linux/kconfig.h>
#include <asm/gpr-num.h>
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 250e9d7c08a7..3857fd7ee8d4 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -29,7 +29,7 @@ static bool region_is_misaligned(const efi_memory_desc_t *md)
* executable, everything else can be mapped with the XN bits
* set. Also take the new (optional) RO/XP bits into account.
*/
-static __init pteval_t create_mapping_protection(efi_memory_desc_t *md)
+static __init ptdesc_t create_mapping_protection(efi_memory_desc_t *md)
{
u64 attr = md->attribute;
u32 type = md->type;
@@ -83,7 +83,7 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md)
int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md)
{
- pteval_t prot_val = create_mapping_protection(md);
+ ptdesc_t prot_val = create_mapping_protection(md);
bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE ||
md->type == EFI_RUNTIME_SERVICES_DATA);
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index c6650cfe706c..0f4bd7771859 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -159,7 +159,7 @@ static void noinline __section(".idmap.text") set_ttbr0_for_lpa2(u64 ttbr)
static void __init remap_idmap_for_lpa2(void)
{
/* clear the bits that change meaning once LPA2 is turned on */
- pteval_t mask = PTE_SHARED;
+ ptdesc_t mask = PTE_SHARED;
/*
* We have to clear bits [9:8] in all block or page descriptors in the
diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c
index 81345f68f9fc..7982788e7b9a 100644
--- a/arch/arm64/kernel/pi/map_range.c
+++ b/arch/arm64/kernel/pi/map_range.c
@@ -30,7 +30,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
int level, pte_t *tbl, bool may_use_cont, u64 va_offset)
{
u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX;
- pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
+ ptdesc_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
int lshift = (3 - level) * PTDESC_TABLE_SHIFT;
u64 lmask = (PAGE_SIZE << lshift) - 1;
@@ -87,7 +87,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
}
}
-asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, pteval_t clrmask)
+asmlinkage u64 __init create_init_idmap(pgd_t *pg_dir, ptdesc_t clrmask)
{
u64 ptep = (u64)pg_dir + PAGE_SIZE;
pgprot_t text_prot = PAGE_KERNEL_ROX;
diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h
index 1f4731a4e17e..46cafee7829f 100644
--- a/arch/arm64/kernel/pi/pi.h
+++ b/arch/arm64/kernel/pi/pi.h
@@ -34,4 +34,4 @@ void map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
asmlinkage void early_map_kernel(u64 boot_status, void *fdt);
-asmlinkage u64 create_init_idmap(pgd_t *pgd, pteval_t clrmask);
+asmlinkage u64 create_init_idmap(pgd_t *pgd, ptdesc_t clrmask);
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 5133dcbfe9f7..fdbc8beec930 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1766,7 +1766,7 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
mutex_lock(&kvm->lock);
- if (lock_all_vcpus(kvm)) {
+ if (!kvm_trylock_all_vcpus(kvm)) {
set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags);
/*
@@ -1778,7 +1778,7 @@ int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
kvm->arch.timer_data.voffset = offset->counter_offset;
kvm->arch.timer_data.poffset = offset->counter_offset;
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
} else {
ret = -EBUSY;
}
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 36cfcffb40d8..de2b4e9c9f9f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1924,49 +1924,6 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
}
}
-/* unlocks vcpus from @vcpu_lock_idx and smaller */
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
- struct kvm_vcpu *tmp_vcpu;
-
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&tmp_vcpu->mutex);
- }
-}
-
-void unlock_all_vcpus(struct kvm *kvm)
-{
- lockdep_assert_held(&kvm->lock);
-
- unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-/* Returns true if all vcpus were locked, false otherwise */
-bool lock_all_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *tmp_vcpu;
- unsigned long c;
-
- lockdep_assert_held(&kvm->lock);
-
- /*
- * Any time a vcpu is in an ioctl (including running), the
- * core KVM code tries to grab the vcpu->mutex.
- *
- * By grabbing the vcpu->mutex of all VCPUs we ensure that no
- * other VCPUs can fiddle with the state while we access it.
- */
- kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
- if (!mutex_trylock(&tmp_vcpu->mutex)) {
- unlock_vcpus(kvm, c - 1);
- return false;
- }
- }
-
- return true;
-}
-
static unsigned long nvhe_percpu_size(void)
{
return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) -
@@ -2790,6 +2747,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq,
&irqfd->irq_entry);
}
+
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
@@ -2800,8 +2758,29 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
if (irq_entry->type != KVM_IRQ_ROUTING_MSI)
return;
- kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq,
- &irqfd->irq_entry);
+ kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq);
+}
+
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI)
+ return true;
+
+ return memcmp(&old->msi, &new->msi, sizeof(new->msi));
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ /*
+ * Remapping the vLPI requires taking the its_lock mutex to resolve
+ * the new translation. We're in spinlock land at this point, so no
+ * chance of resolving the translation.
+ *
+ * Unmap the vLPI and fall back to software LPI injection.
+ */
+ return kvm_vgic_v4_unset_forwarding(kvm, host_irq);
}
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 569941eeb3fe..58c5fe7d7572 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -270,6 +270,7 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
u32 feature;
u8 action;
gpa_t gpa;
+ uuid_t uuid;
action = kvm_smccc_get_action(vcpu, func_id);
switch (action) {
@@ -355,10 +356,11 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
val[0] = gpa;
break;
case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
- val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
- val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
- val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
- val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
+ uuid = ARM_SMCCC_VENDOR_HYP_UID_KVM;
+ val[0] = smccc_uuid_to_reg(&uuid, 0);
+ val[1] = smccc_uuid_to_reg(&uuid, 1);
+ val[2] = smccc_uuid_to_reg(&uuid, 2);
+ val[3] = smccc_uuid_to_reg(&uuid, 3);
break;
case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
val[0] = smccc_feat->vendor_hyp_bmap;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 291dbe38eb5c..4a53e4147fb0 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -918,6 +918,8 @@ static void invalidate_vncr_va(struct kvm *kvm,
}
}
+#define tlbi_va_s1_to_va(v) (u64)sign_extend64((v) << 12, 48)
+
static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
struct s1e2_tlbi_scope *scope)
{
@@ -964,7 +966,7 @@ static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
if (!scope->size)
scope->size = SZ_1G;
- scope->va = (val << 12) & ~(scope->size - 1);
+ scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
break;
case OP_TLBI_ASIDE1:
@@ -992,7 +994,7 @@ static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
if (!scope->size)
scope->size = SZ_1G;
- scope->va = (val << 12) & ~(scope->size - 1);
+ scope->va = tlbi_va_s1_to_va(val) & ~(scope->size - 1);
break;
case OP_TLBI_RVAE2:
case OP_TLBI_RVAE2IS:
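A self-contained sketch (illustrative only, not part of this diff) of what the sign extension buys, assuming a 48-bit VA configuration and the usual VA[55:12] payload encoding; the function name is hypothetical:

#include <linux/bits.h>		/* GENMASK_ULL() */
#include <linux/bitops.h>	/* sign_extend64() */

#define tlbi_va_s1_to_va(v)	((u64)sign_extend64((v) << 12, 48))

static bool tlbi_va_example(void)
{
	/* VA[55:12] of the kernel address 0xffff800000000000, as carried in
	 * the low bits of a TLBI VAE1 payload (ASID/TTL fields omitted). */
	u64 val = (0xffff800000000000ULL >> 12) & GENMASK_ULL(43, 0);

	/* The old (val << 12) yields 0x00ff800000000000, which is not a
	 * canonical kernel VA; sign-extending from bit 48 recovers it. */
	return tlbi_va_s1_to_va(val) == 0xffff800000000000ULL;
}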
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index f8425f381de9..2684f273d9e1 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -490,6 +490,9 @@ static int vgic_its_debug_show(struct seq_file *s, void *v)
struct its_device *dev = iter->dev;
struct its_ite *ite = iter->ite;
+ if (!ite)
+ return 0;
+
if (list_is_first(&ite->ite_list, &dev->itt_head)) {
seq_printf(s, "\n");
seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n",
@@ -498,7 +501,7 @@ static int vgic_its_debug_show(struct seq_file *s, void *v)
seq_printf(s, "-----------------------------------------------\n");
}
- if (ite && ite->irq && ite->collection) {
+ if (ite->irq && ite->collection) {
seq_printf(s, "%8u %8u %8u %8u %8u %2d\n",
ite->event_id, ite->irq->intid, ite->irq->hwintid,
ite->collection->target_addr,
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1f33e71c2a73..eb1205654ac8 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -84,15 +84,40 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
!kvm_vgic_global_state.can_emulate_gicv2)
return -ENODEV;
- /* Must be held to avoid race with vCPU creation */
+ /*
+ * Ensure mutual exclusion with vCPU creation and any vCPU ioctls by:
+ *
+ * - Holding kvm->lock to prevent KVM_CREATE_VCPU from reaching
+ * kvm_arch_vcpu_precreate() and ensuring created_vcpus is stable.
+ * This alone is insufficient, as kvm_vm_ioctl_create_vcpu() drops
+ * the kvm->lock before completing the vCPU creation.
+ */
lockdep_assert_held(&kvm->lock);
+ /*
+ * - Acquiring the vCPU mutex for every *online* vCPU to prevent
+ * concurrent vCPU ioctls for vCPUs already visible to userspace.
+ */
ret = -EBUSY;
- if (!lock_all_vcpus(kvm))
+ if (kvm_trylock_all_vcpus(kvm))
return ret;
+ /*
+ * - Taking the config_lock which protects VGIC data structures such
+ * as the per-vCPU arrays of private IRQs (SGIs, PPIs).
+ */
mutex_lock(&kvm->arch.config_lock);
+ /*
+ * - Bailing on the entire thing if a vCPU is in the middle of creation,
+ * dropped the kvm->lock, but hasn't reached kvm_arch_vcpu_create().
+ *
+ * The whole combination of this guarantees that no vCPU can get into
+ * KVM with a VGIC configuration inconsistent with the VM's VGIC.
+ */
+ if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
+ goto out_unlock;
+
if (irqchip_in_kernel(kvm)) {
ret = -EEXIST;
goto out_unlock;
@@ -142,7 +167,7 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
out_unlock:
mutex_unlock(&kvm->arch.config_lock);
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 569f9da9049f..534049c7c94b 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -306,39 +306,34 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
}
}
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
-
if (irq->hw)
- return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
+ ret = its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
- return 0;
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ return ret;
}
static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
{
- int ret = 0;
- unsigned long flags;
+ struct its_vlpi_map map;
+ int ret;
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ guard(raw_spinlock_irqsave)(&irq->irq_lock);
irq->target_vcpu = vcpu;
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- if (irq->hw) {
- struct its_vlpi_map map;
-
- ret = its_get_vlpi(irq->host_irq, &map);
- if (ret)
- return ret;
+ if (!irq->hw)
+ return 0;
- if (map.vpe)
- atomic_dec(&map.vpe->vlpi_count);
- map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
- atomic_inc(&map.vpe->vlpi_count);
+ ret = its_get_vlpi(irq->host_irq, &map);
+ if (ret)
+ return ret;
- ret = its_map_vlpi(irq->host_irq, &map);
- }
+ if (map.vpe)
+ atomic_dec(&map.vpe->vlpi_count);
- return ret;
+ map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+ atomic_inc(&map.vpe->vlpi_count);
+ return its_map_vlpi(irq->host_irq, &map);
}
static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm,
@@ -756,12 +751,17 @@ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
/* Requires the its_lock to be held. */
static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
{
+ struct vgic_irq *irq = ite->irq;
list_del(&ite->ite_list);
/* This put matches the get in vgic_add_lpi. */
- if (ite->irq) {
- if (ite->irq->hw)
- WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+ if (irq) {
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+ if (irq->hw)
+ WARN_ON(its_unmap_vlpi(ite->irq->host_irq));
+
+ irq->hw = false;
+ }
vgic_put_irq(kvm, ite->irq);
}
@@ -1971,7 +1971,7 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -2006,7 +2006,7 @@ static int vgic_its_attr_regs_access(struct kvm_device *dev,
}
out:
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return ret;
}
@@ -2676,7 +2676,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
mutex_lock(&kvm->lock);
- if (!lock_all_vcpus(kvm)) {
+ if (kvm_trylock_all_vcpus(kvm)) {
mutex_unlock(&kvm->lock);
return -EBUSY;
}
@@ -2698,7 +2698,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
mutex_unlock(&its->its_lock);
mutex_unlock(&kvm->arch.config_lock);
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
mutex_unlock(&kvm->lock);
return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c
index 359094f68c23..f9ae790163fb 100644
--- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
+++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
@@ -268,7 +268,7 @@ static int vgic_set_common_attr(struct kvm_device *dev,
return -ENXIO;
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -276,7 +276,7 @@ static int vgic_set_common_attr(struct kvm_device *dev,
mutex_lock(&dev->kvm->arch.config_lock);
r = vgic_v3_save_pending_tables(dev->kvm);
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
return r;
}
@@ -390,7 +390,7 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -415,7 +415,7 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev,
out:
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && !is_write)
@@ -554,7 +554,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
mutex_lock(&dev->kvm->lock);
- if (!lock_all_vcpus(dev->kvm)) {
+ if (kvm_trylock_all_vcpus(dev->kvm)) {
mutex_unlock(&dev->kvm->lock);
return -EBUSY;
}
@@ -611,7 +611,7 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev,
out:
mutex_unlock(&dev->kvm->arch.config_lock);
- unlock_all_vcpus(dev->kvm);
+ kvm_unlock_all_vcpus(dev->kvm);
mutex_unlock(&dev->kvm->lock);
if (!ret && uaccess && !is_write) {
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index c7de6154627c..193946108192 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -444,7 +444,7 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
if (IS_ERR(its))
return 0;
- mutex_lock(&its->its_lock);
+ guard(mutex)(&its->its_lock);
/*
* Perform the actual DevID/EventID -> LPI translation.
@@ -455,11 +455,13 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
*/
if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
irq_entry->msi.data, &irq))
- goto out;
+ return 0;
+
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
/* Silently exit if the vLPI is already mapped */
if (irq->hw)
- goto out;
+ goto out_unlock_irq;
/*
* Emit the mapping request. If it fails, the ITS probably
@@ -479,68 +481,74 @@ int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
ret = its_map_vlpi(virq, &map);
if (ret)
- goto out;
+ goto out_unlock_irq;
irq->hw = true;
irq->host_irq = virq;
atomic_inc(&map.vpe->vlpi_count);
/* Transfer pending state */
- raw_spin_lock_irqsave(&irq->irq_lock, flags);
- if (irq->pending_latch) {
- ret = irq_set_irqchip_state(irq->host_irq,
- IRQCHIP_STATE_PENDING,
- irq->pending_latch);
- WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
+ if (!irq->pending_latch)
+ goto out_unlock_irq;
- /*
- * Clear pending_latch and communicate this state
- * change via vgic_queue_irq_unlock.
- */
- irq->pending_latch = false;
- vgic_queue_irq_unlock(kvm, irq, flags);
- } else {
- raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
- }
+ ret = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING,
+ irq->pending_latch);
+ WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);
-out:
- mutex_unlock(&its->its_lock);
+ /*
+ * Clear pending_latch and communicate this state
+ * change via vgic_queue_irq_unlock.
+ */
+ irq->pending_latch = false;
+ vgic_queue_irq_unlock(kvm, irq, flags);
+ return ret;
+
+out_unlock_irq:
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
return ret;
}
-int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
- struct kvm_kernel_irq_routing_entry *irq_entry)
+static struct vgic_irq *__vgic_host_irq_get_vlpi(struct kvm *kvm, int host_irq)
{
- struct vgic_its *its;
struct vgic_irq *irq;
- int ret;
+ unsigned long idx;
+
+ guard(rcu)();
+ xa_for_each(&kvm->arch.vgic.lpi_xa, idx, irq) {
+ if (!irq->hw || irq->host_irq != host_irq)
+ continue;
+
+ if (!vgic_try_get_irq_kref(irq))
+ return NULL;
+
+ return irq;
+ }
+
+ return NULL;
+}
+
+int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int host_irq)
+{
+ struct vgic_irq *irq;
+ unsigned long flags;
+ int ret = 0;
if (!vgic_supports_direct_msis(kvm))
return 0;
- /*
- * Get the ITS, and escape early on error (not a valid
- * doorbell for any of our vITSs).
- */
- its = vgic_get_its(kvm, irq_entry);
- if (IS_ERR(its))
+ irq = __vgic_host_irq_get_vlpi(kvm, host_irq);
+ if (!irq)
return 0;
- mutex_lock(&its->its_lock);
-
- ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
- irq_entry->msi.data, &irq);
- if (ret)
- goto out;
-
- WARN_ON(irq->hw && irq->host_irq != virq);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
+ WARN_ON(irq->hw && irq->host_irq != host_irq);
if (irq->hw) {
atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
irq->hw = false;
- ret = its_unmap_vlpi(virq);
+ ret = its_unmap_vlpi(host_irq);
}
-out:
- mutex_unlock(&its->its_lock);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+ vgic_put_irq(kvm, irq);
return ret;
}
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 07aeab8a7606..c86c348857c4 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -83,7 +83,7 @@ arch_initcall(adjust_protection_map);
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- pteval_t prot;
+ ptdesc_t prot;
/* Short circuit GCS to avoid bloating the table. */
if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ea6695d53fb9..8fcf59ba39db 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -46,6 +46,13 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+enum pgtable_type {
+ TABLE_PTE,
+ TABLE_PMD,
+ TABLE_PUD,
+ TABLE_P4D,
+};
+
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
@@ -107,7 +114,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
}
EXPORT_SYMBOL(phys_mem_access_prot);
-static phys_addr_t __init early_pgtable_alloc(int shift)
+static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
{
phys_addr_t phys;
@@ -192,7 +199,7 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long next;
@@ -207,7 +214,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pmdval |= PMD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pte_phys = pgtable_alloc(PAGE_SHIFT);
+ pte_phys = pgtable_alloc(TABLE_PTE);
ptep = pte_set_fixmap(pte_phys);
init_clear_pgtable(ptep);
ptep += pte_index(addr);
@@ -243,7 +250,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
{
unsigned long next;
@@ -277,7 +284,8 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags)
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ int flags)
{
unsigned long next;
pud_t pud = READ_ONCE(*pudp);
@@ -294,7 +302,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pudval |= PUD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pmd_phys = pgtable_alloc(PMD_SHIFT);
+ pmd_phys = pgtable_alloc(TABLE_PMD);
pmdp = pmd_set_fixmap(pmd_phys);
init_clear_pgtable(pmdp);
pmdp += pmd_index(addr);
@@ -325,7 +333,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long next;
@@ -339,7 +347,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pud_phys = pgtable_alloc(PUD_SHIFT);
+ pud_phys = pgtable_alloc(TABLE_PUD);
pudp = pud_set_fixmap(pud_phys);
init_clear_pgtable(pudp);
pudp += pud_index(addr);
@@ -383,7 +391,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long next;
@@ -397,7 +405,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
pgdval |= PGD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- p4d_phys = pgtable_alloc(P4D_SHIFT);
+ p4d_phys = pgtable_alloc(TABLE_P4D);
p4dp = p4d_set_fixmap(p4d_phys);
init_clear_pgtable(p4dp);
p4dp += p4d_index(addr);
@@ -427,7 +435,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
unsigned long addr, end, next;
@@ -455,7 +463,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
int flags)
{
mutex_lock(&fixmap_lock);
@@ -468,37 +476,48 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
extern __alias(__create_pgd_mapping_locked)
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
phys_addr_t size, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(int), int flags);
+ phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ int flags);
#endif
-static phys_addr_t __pgd_pgtable_alloc(int shift)
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
+ enum pgtable_type pgtable_type)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
- void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+ phys_addr_t pa;
+
+ BUG_ON(!ptdesc);
+ pa = page_to_phys(ptdesc_page(ptdesc));
+
+ switch (pgtable_type) {
+ case TABLE_PTE:
+ BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
+ break;
+ case TABLE_PMD:
+ BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
+ break;
+ case TABLE_PUD:
+ pagetable_pud_ctor(ptdesc);
+ break;
+ case TABLE_P4D:
+ pagetable_p4d_ctor(ptdesc);
+ break;
+ }
- BUG_ON(!ptr);
- return __pa(ptr);
+ return pa;
}
-static phys_addr_t pgd_pgtable_alloc(int shift)
+static phys_addr_t __maybe_unused
+pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
{
- phys_addr_t pa = __pgd_pgtable_alloc(shift);
- struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa));
-
- /*
- * Call proper page table ctor in case later we need to
- * call core mm functions like apply_to_page_range() on
- * this pre-allocated page table.
- *
- * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is
- * folded, and if so pagetable_pte_ctor() becomes nop.
- */
- if (shift == PAGE_SHIFT)
- BUG_ON(!pagetable_pte_ctor(ptdesc));
- else if (shift == PMD_SHIFT)
- BUG_ON(!pagetable_pmd_ctor(ptdesc));
+ return __pgd_pgtable_alloc(&init_mm, pgtable_type);
+}
- return pa;
+static phys_addr_t
+pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
+{
+ return __pgd_pgtable_alloc(NULL, pgtable_type);
}
/*
@@ -530,7 +549,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(mm->pgd, phys, virt, size, prot,
- pgd_pgtable_alloc, flags);
+ pgd_pgtable_alloc_special_mm, flags);
}
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
@@ -744,7 +763,7 @@ static int __init map_entry_trampoline(void)
memset(tramp_pg_dir, 0, PGD_SIZE);
__create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
entry_tramp_text_size(), prot,
- __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);
+ pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);
/* Map both the text and data into the kernel page table */
for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
@@ -1350,7 +1369,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
- size, params->pgprot, __pgd_pgtable_alloc,
+ size, params->pgprot, pgd_pgtable_alloc_init_mm,
flags);
memblock_clear_nomap(start, size);
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 8cec0da4cff2..421a5de806c6 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -189,12 +189,12 @@ static void note_prot_wx(struct ptdump_pg_state *st, unsigned long addr)
}
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- u64 val)
+ pteval_t val)
{
struct ptdump_pg_state *st = container_of(pt_st, struct ptdump_pg_state, ptdump);
struct ptdump_pg_level *pg_level = st->pg_level;
static const char units[] = "KMGTPE";
- u64 prot = 0;
+ ptdesc_t prot = 0;
/* check if the current level has been folded dynamically */
if (st->mm && ((level == 1 && mm_p4d_folded(st->mm)) ||
@@ -251,6 +251,38 @@ void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
+void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
{
unsigned long end = ~0UL;
@@ -266,7 +298,12 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
.pg_level = &kernel_pg_levels[0],
.level = -1,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]){
{info->base_addr, end},
{0, 0}
@@ -303,7 +340,12 @@ bool ptdump_check_wx(void)
.level = -1,
.check_wx = true,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{_PAGE_OFFSET(vabits_actual), ~0UL},
{0, 0}
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 11055c574968..9ed2b15ffd94 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -29,7 +29,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
pte_t *pte;
unsigned long i;
- pte = (pte_t *) __get_free_page(GFP_KERNEL);
+ pte = __pte_alloc_one_kernel(mm);
if (!pte)
return NULL;
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index a397e1718ab6..b8378431aeff 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -249,11 +249,6 @@ static inline pgprot_t pgprot_writecombine(pgprot_t _prot)
return __pgprot(prot);
}
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
return __pte((pte_val(pte) & _PAGE_CHG_MASK) |
diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h
index 0de5734950bf..717f44b4d26f 100644
--- a/arch/csky/include/asm/syscall.h
+++ b/arch/csky/include/asm/syscall.h
@@ -59,6 +59,19 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs,
memcpy(args, &regs->a1, 5 * sizeof(args[0]));
}
+static inline void
+syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
+ const unsigned long *args)
+{
+ memcpy(&regs->a0, args, 6 * sizeof(regs->a0));
+ /*
+ * Also copy the first argument into orig_a0
+ * so that syscall_get_arguments() would return it
+ * instead of the previous value.
+ */
+ regs->orig_a0 = regs->a0;
+}
+
static inline int
syscall_get_arch(struct task_struct *task)
{
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index 8c5b7a1c3d90..9fbdfdbc539f 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -238,9 +238,6 @@ static inline int pte_present(pte_t pte)
return pte_val(pte) & _PAGE_PRESENT;
}
-/* mk_pte - make a PTE out of a page pointer and protection bits */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
/* pte_page - returns a page (frame pointer/descriptor?) based on a PTE */
#define pte_page(x) pfn_to_page(pte_pfn(x))
diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h
index f6e454f18038..70637261817a 100644
--- a/arch/hexagon/include/asm/syscall.h
+++ b/arch/hexagon/include/asm/syscall.h
@@ -26,6 +26,13 @@ static inline long syscall_get_nr(struct task_struct *task,
return regs->r06;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->r06 = nr;
+}
+
static inline void syscall_get_arguments(struct task_struct *task,
struct pt_regs *regs,
unsigned long *args)
@@ -33,6 +40,13 @@ static inline void syscall_get_arguments(struct task_struct *task,
memcpy(args, &(&regs->r00)[0], 6 * sizeof(args[0]));
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long *args)
+{
+ memcpy(&(&regs->r00)[0], args, 6 * sizeof(args[0]));
+}
+
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
@@ -45,6 +59,13 @@ static inline long syscall_get_return_value(struct task_struct *task,
return regs->r00;
}
+static inline void syscall_set_return_value(struct task_struct *task,
+ struct pt_regs *regs,
+ int error, long val)
+{
+ regs->r00 = (long) error ?: val;
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_HEXAGON;
diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h
index b58f587f0f0a..1c63a9d9a6d3 100644
--- a/arch/loongarch/include/asm/pgalloc.h
+++ b/arch/loongarch/include/asm/pgalloc.h
@@ -69,7 +69,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
if (!ptdesc)
return NULL;
- if (!pagetable_pmd_ctor(ptdesc)) {
+ if (!pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index da346733a1da..a3f17914dbab 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -255,7 +255,6 @@ static inline void pmd_clear(pmd_t *pmdp)
#define pmd_page_vaddr(pmd) pmd_val(pmd)
-extern pmd_t mk_pmd(struct page *page, pgprot_t prot);
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd);
#define pte_page(x) pfn_to_page(pte_pfn(x))
@@ -426,12 +425,6 @@ static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
return false;
}
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
return __pte((pte_val(pte) & _PAGE_CHG_MASK) |
diff --git a/arch/loongarch/include/asm/syscall.h b/arch/loongarch/include/asm/syscall.h
index e286dc58476e..81d2733f7b94 100644
--- a/arch/loongarch/include/asm/syscall.h
+++ b/arch/loongarch/include/asm/syscall.h
@@ -26,6 +26,13 @@ static inline long syscall_get_nr(struct task_struct *task,
return regs->regs[11];
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->regs[11] = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -61,6 +68,14 @@ static inline void syscall_get_arguments(struct task_struct *task,
memcpy(&args[1], &regs->regs[5], 5 * sizeof(long));
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long *args)
+{
+ regs->orig_a0 = args[0];
+ memcpy(&regs->regs[5], &args[1], 5 * sizeof(long));
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_LOONGARCH64;
diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c
index 22a94bb3e6e8..352d9b2e02ab 100644
--- a/arch/loongarch/mm/pgtable.c
+++ b/arch/loongarch/mm/pgtable.c
@@ -135,15 +135,6 @@ void kernel_pte_init(void *addr)
} while (p != end);
}
-pmd_t mk_pmd(struct page *page, pgprot_t prot)
-{
- pmd_t pmd;
-
- pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot);
-
- return pmd;
-}
-
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
diff --git a/arch/m68k/coldfire/gpio.c b/arch/m68k/coldfire/gpio.c
index ca26de257871..30e5a4ed799d 100644
--- a/arch/m68k/coldfire/gpio.c
+++ b/arch/m68k/coldfire/gpio.c
@@ -123,10 +123,12 @@ static int mcfgpio_direction_output(struct gpio_chip *chip, unsigned offset,
return __mcfgpio_direction_output(offset, value);
}
-static void mcfgpio_set_value(struct gpio_chip *chip, unsigned offset,
- int value)
+static int mcfgpio_set_value(struct gpio_chip *chip, unsigned int offset,
+ int value)
{
__mcfgpio_set_value(offset, value);
+
+ return 0;
}
static int mcfgpio_request(struct gpio_chip *chip, unsigned offset)
@@ -158,7 +160,7 @@ static struct gpio_chip mcfgpio_chip = {
.direction_input = mcfgpio_direction_input,
.direction_output = mcfgpio_direction_output,
.get = mcfgpio_get_value,
- .set = mcfgpio_set_value,
+ .set_rv = mcfgpio_set_value,
.to_irq = mcfgpio_to_irq,
.base = 0,
.ngpio = MCFGPIO_PIN_MAX,
diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h
index 4c648b51e7fd..fc5454d37da3 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -7,7 +7,7 @@
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
- pagetable_free(virt_to_ptdesc(pte));
+ pagetable_dtor_free(virt_to_ptdesc(pte));
}
extern const char bad_pmd_string[];
@@ -19,6 +19,10 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
if (!ptdesc)
return NULL;
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
+ return NULL;
+ }
return ptdesc_address(ptdesc);
}
@@ -48,7 +52,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
if (!ptdesc)
return NULL;
- if (!pagetable_pte_ctor(ptdesc)) {
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index 48f87a8a8832..f5c596b211d4 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -96,12 +96,6 @@
#define pmd_pgtable(pmd) pfn_to_virt(pmd_val(pmd) >> PAGE_SHIFT)
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pte_val(pte) = (pte_val(pte) & CF_PAGE_CHG_MASK) | pgprot_val(newprot);
diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h
index 5abe7da8ac5a..1091fb0affbe 100644
--- a/arch/m68k/include/asm/motorola_pgalloc.h
+++ b/arch/m68k/include/asm/motorola_pgalloc.h
@@ -15,7 +15,7 @@ enum m68k_table_types {
};
extern void init_pointer_table(void *table, int type);
-extern void *get_pointer_table(int type);
+extern void *get_pointer_table(struct mm_struct *mm, int type);
extern int free_pointer_table(void *table, int type);
/*
@@ -26,7 +26,7 @@ extern int free_pointer_table(void *table, int type);
static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
- return get_pointer_table(TABLE_PTE);
+ return get_pointer_table(mm, TABLE_PTE);
}
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
@@ -36,7 +36,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
- return get_pointer_table(TABLE_PTE);
+ return get_pointer_table(mm, TABLE_PTE);
}
static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable)
@@ -53,7 +53,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable,
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
{
- return get_pointer_table(TABLE_PMD);
+ return get_pointer_table(mm, TABLE_PMD);
}
static inline int pmd_free(struct mm_struct *mm, pmd_t *pmd)
@@ -75,7 +75,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
{
- return get_pointer_table(TABLE_PGD);
+ return get_pointer_table(mm, TABLE_PGD);
}
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index 9866c7acdabe..040ac3bad713 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -81,12 +81,6 @@ extern unsigned long mm_cachebits;
#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h
index 30081aee8164..73745dc0ec0e 100644
--- a/arch/m68k/include/asm/sun3_pgtable.h
+++ b/arch/m68k/include/asm/sun3_pgtable.h
@@ -76,12 +76,6 @@
#ifndef __ASSEMBLY__
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pte_val(pte) = (pte_val(pte) & SUN3_PAGE_CHG_MASK) | pgprot_val(newprot);
diff --git a/arch/m68k/include/asm/syscall.h b/arch/m68k/include/asm/syscall.h
index d1453e850cdd..bf84b160c2eb 100644
--- a/arch/m68k/include/asm/syscall.h
+++ b/arch/m68k/include/asm/syscall.h
@@ -14,6 +14,13 @@ static inline int syscall_get_nr(struct task_struct *task,
return regs->orig_d0;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->orig_d0 = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
diff --git a/arch/m68k/kernel/setup_no.c b/arch/m68k/kernel/setup_no.c
index f9872098f5ca..f724875b15cc 100644
--- a/arch/m68k/kernel/setup_no.c
+++ b/arch/m68k/kernel/setup_no.c
@@ -145,8 +145,7 @@ void __init setup_arch(char **cmdline_p)
/* Keep a copy of command line */
*cmdline_p = &command_line[0];
- memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
- boot_command_line[COMMAND_LINE_SIZE-1] = 0;
+ strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
/*
* Give all the memory to the bootmap allocator, tell it to put the
diff --git a/arch/m68k/kernel/uboot.c b/arch/m68k/kernel/uboot.c
index 5e52ea150d5c..fa7c279ead5d 100644
--- a/arch/m68k/kernel/uboot.c
+++ b/arch/m68k/kernel/uboot.c
@@ -73,7 +73,7 @@ static void __init parse_uboot_commandline(char *commandp, int size)
uboot_cmd_end = sp[5];
if (uboot_cmd_start && uboot_cmd_end)
- strncpy(commandp, (const char *)uboot_cmd_start, size);
+ strscpy(commandp, (const char *)uboot_cmd_start, size);
#if defined(CONFIG_BLK_DEV_INITRD)
uboot_initrd_start = sp[2];
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index 73651e093c4d..745bd575dcfa 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -105,7 +105,8 @@ static struct list_head ptable_list[3] = {
#define PD_PTABLE(page) ((ptable_desc *)&(virt_to_page((void *)(page))->lru))
#define PD_PAGE(ptable) (list_entry(ptable, struct page, lru))
-#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PAGE(dp)->index)
+#define PD_PTDESC(ptable) (list_entry(ptable, struct ptdesc, pt_list))
+#define PD_MARKBITS(dp) (*(unsigned int *)&PD_PTDESC(dp)->pt_index)
static const int ptable_shift[3] = {
7+2, /* PGD */
@@ -139,7 +140,7 @@ void __init init_pointer_table(void *table, int type)
return;
}
-void *get_pointer_table(int type)
+void *get_pointer_table(struct mm_struct *mm, int type)
{
ptable_desc *dp = ptable_list[type].next;
unsigned int mask = list_empty(&ptable_list[type]) ? 0 : PD_MARKBITS(dp);
@@ -164,10 +165,10 @@ void *get_pointer_table(int type)
* m68k doesn't have SPLIT_PTE_PTLOCKS for not having
* SMP.
*/
- pagetable_pte_ctor(virt_to_ptdesc(page));
+ pagetable_pte_ctor(mm, virt_to_ptdesc(page));
break;
case TABLE_PMD:
- pagetable_pmd_ctor(virt_to_ptdesc(page));
+ pagetable_pmd_ctor(mm, virt_to_ptdesc(page));
break;
case TABLE_PGD:
pagetable_pgd_ctor(virt_to_ptdesc(page));
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index e4ea2ec3642f..b1bb2c65dd04 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -285,14 +285,6 @@ static inline pte_t mk_pte_phys(phys_addr_t physpage, pgprot_t pgprot)
return pte;
}
-#define mk_pte(page, pgprot) \
-({ \
- pte_t pte; \
- pte_val(pte) = (((page - mem_map) << PAGE_SHIFT) + memory_start) | \
- pgprot_val(pgprot); \
- pte; \
-})
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot);
diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h
index 5eb3f624cc59..b5b6b91fae3e 100644
--- a/arch/microblaze/include/asm/syscall.h
+++ b/arch/microblaze/include/asm/syscall.h
@@ -14,6 +14,13 @@ static inline long syscall_get_nr(struct task_struct *task,
return regs->r12;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->r12 = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 9f73265aad4e..e96dd1b7aba4 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -245,7 +245,7 @@ unsigned long iopa(unsigned long addr)
__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
if (mem_init_done)
- return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ return __pte_alloc_one_kernel(mm);
else
return memblock_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
MEMBLOCK_LOW_LIMIT,
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index bbca420c96d3..942af87f1cdd 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -62,7 +62,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
if (!ptdesc)
return NULL;
- if (!pagetable_pmd_ctor(ptdesc)) {
+ if (!pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index c29a551eb0ca..4852b005a72d 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -504,12 +504,6 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
return true;
}
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
#if defined(CONFIG_XPA)
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
@@ -719,9 +713,6 @@ static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
-/* Extern to avoid header file madness */
-extern pmd_t mk_pmd(struct page *page, pgprot_t prot);
-
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmd_val(pmd) = (pmd_val(pmd) & (_PAGE_CHG_MASK | _PAGE_HUGE)) |
diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h
index 056aa1b713e2..d19e67e2aa6a 100644
--- a/arch/mips/include/asm/syscall.h
+++ b/arch/mips/include/asm/syscall.h
@@ -41,6 +41,21 @@ static inline long syscall_get_nr(struct task_struct *task,
return task_thread_info(task)->syscall;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ /*
+ * New syscall number has to be assigned to regs[2] because
+ * it is loaded from there unconditionally after return from
+ * syscall_trace_enter() invocation.
+ *
+ * Consequently, if the syscall was indirect and nr != __NR_syscall,
+ * then after this assignment the syscall will cease to be indirect.
+ */
+ task_thread_info(task)->syscall = regs->regs[2] = nr;
+}
+
static inline void mips_syscall_update_nr(struct task_struct *task,
struct pt_regs *regs)
{
@@ -74,6 +89,23 @@ static inline void mips_get_syscall_arg(unsigned long *arg,
#endif
}
+static inline void mips_set_syscall_arg(unsigned long *arg,
+ struct task_struct *task, struct pt_regs *regs, unsigned int n)
+{
+#ifdef CONFIG_32BIT
+ switch (n) {
+ case 0: case 1: case 2: case 3:
+ regs->regs[4 + n] = *arg;
+ return;
+ case 4: case 5: case 6: case 7:
+		regs->args[n] = *arg;
+ return;
+ }
+#else
+ regs->regs[4 + n] = *arg;
+#endif
+}
+
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
@@ -120,6 +152,17 @@ static inline void syscall_get_arguments(struct task_struct *task,
mips_get_syscall_arg(args++, task, regs, i++);
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ unsigned long *args)
+{
+ unsigned int i = 0;
+ unsigned int n = 6;
+
+ while (n--)
+ mips_set_syscall_arg(args++, task, regs, i++);
+}
+
extern const unsigned long sys_call_table[];
extern const unsigned long sys32_call_table[];
extern const unsigned long sysn32_call_table[];
diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c
index 84dd5136d53a..e2cf2166d5cb 100644
--- a/arch/mips/mm/pgtable-32.c
+++ b/arch/mips/mm/pgtable-32.c
@@ -31,16 +31,6 @@ void pgd_init(void *addr)
}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
-pmd_t mk_pmd(struct page *page, pgprot_t prot)
-{
- pmd_t pmd;
-
- pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot);
-
- return pmd;
-}
-
-
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index 1e544827dea9..b24f865de357 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -90,15 +90,6 @@ void pud_init(void *addr)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-pmd_t mk_pmd(struct page *page, pgprot_t prot)
-{
- pmd_t pmd;
-
- pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot);
-
- return pmd;
-}
-
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index e5d64c84aadf..e98578e27e26 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -221,12 +221,6 @@ static inline void pte_clear(struct mm_struct *mm,
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
*/
-#define mk_pte(page, prot) (pfn_pte(page_to_pfn(page), prot))
-
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
#define pmd_phys(pmd) virt_to_phys((void *)pmd_val(pmd))
#define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT)
#define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT))
diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h
index fff52205fb65..8e3eb1d689bb 100644
--- a/arch/nios2/include/asm/syscall.h
+++ b/arch/nios2/include/asm/syscall.h
@@ -15,6 +15,11 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return regs->r2;
}
+static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr)
+{
+ regs->r2 = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -58,6 +63,17 @@ static inline void syscall_get_arguments(struct task_struct *task,
*args = regs->r9;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs, const unsigned long *args)
+{
+ regs->r4 = *args++;
+ regs->r5 = *args++;
+ regs->r6 = *args++;
+ regs->r7 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_NIOS2;
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 60c6ce7ff2dc..71bfb8c8c482 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -299,8 +299,6 @@ static inline pte_t __mk_pte(void *page, pgprot_t pgprot)
return pte;
}
-#define mk_pte(page, pgprot) __mk_pte(page_address(page), (pgprot))
-
#define mk_pte_phys(physpage, pgprot) \
({ \
pte_t __pte; \
diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h
index 903ed882bdec..5e037d9659c5 100644
--- a/arch/openrisc/include/asm/syscall.h
+++ b/arch/openrisc/include/asm/syscall.h
@@ -26,6 +26,12 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
}
static inline void
+syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr)
+{
+ regs->orig_gpr11 = nr;
+}
+
+static inline void
syscall_rollback(struct task_struct *task, struct pt_regs *regs)
{
regs->gpr[11] = regs->orig_gpr11;
@@ -57,6 +63,13 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs,
memcpy(args, &regs->gpr[3], 6 * sizeof(args[0]));
}
+static inline void
+syscall_set_arguments(struct task_struct *task, struct pt_regs *regs,
+ const unsigned long *args)
+{
+ memcpy(&regs->gpr[3], args, 6 * sizeof(args[0]));
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_OPENRISC;
diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c
index 8e63e86251ca..3b352f97fecb 100644
--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@@ -36,7 +36,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm)
pte_t *pte;
if (likely(mem_init_done)) {
- pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
+ pte = __pte_alloc_one_kernel(mm);
} else {
pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
}
diff --git a/arch/parisc/boot/compressed/Makefile b/arch/parisc/boot/compressed/Makefile
index 92227fa813dc..17c42d718eb3 100644
--- a/arch/parisc/boot/compressed/Makefile
+++ b/arch/parisc/boot/compressed/Makefile
@@ -18,6 +18,7 @@ KBUILD_CFLAGS += -fno-PIE -mno-space-regs -mdisable-fpregs -Os
ifndef CONFIG_64BIT
KBUILD_CFLAGS += -mfast-indirect-calls
endif
+KBUILD_CFLAGS += -std=gnu11
LDFLAGS_vmlinux := -X -e startup --as-needed -T
$(obj)/vmlinux: $(obj)/vmlinux.lds $(addprefix $(obj)/, $(OBJECTS)) $(LIBGCC) FORCE
diff --git a/arch/parisc/include/asm/alternative.h b/arch/parisc/include/asm/alternative.h
index 1eb488f25b83..1601ae4b888d 100644
--- a/arch/parisc/include/asm/alternative.h
+++ b/arch/parisc/include/asm/alternative.h
@@ -13,7 +13,7 @@
#define INSN_PxTLB 0x02 /* modify pdtlb, pitlb */
#define INSN_NOP 0x08000240 /* nop */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/init.h>
#include <linux/types.h>
@@ -61,6 +61,6 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end,
.word (new_instr_ptr - .) ! \
.previous
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_PARISC_ALTERNATIVE_H */
diff --git a/arch/parisc/include/asm/assembly.h b/arch/parisc/include/asm/assembly.h
index 000a28e1c5e8..c20261604f09 100644
--- a/arch/parisc/include/asm/assembly.h
+++ b/arch/parisc/include/asm/assembly.h
@@ -53,7 +53,7 @@
#define SR_TEMP2 2
#define SR_USER 3
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#ifdef CONFIG_64BIT
#define LDREG ldd
@@ -582,5 +582,5 @@
.previous
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h
index c705decf2bed..519b1903c5ed 100644
--- a/arch/parisc/include/asm/barrier.h
+++ b/arch/parisc/include/asm/barrier.h
@@ -4,7 +4,7 @@
#include <asm/alternative.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* The synchronize caches instruction executes as a nop on systems in
which all memory references are performed in order. */
@@ -93,5 +93,5 @@ do { \
})
#include <asm-generic/barrier.h>
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif /* __ASM_BARRIER_H */
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h
index a3f0f100f219..3f8d3be6ef24 100644
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -16,7 +16,7 @@
#define L1_CACHE_BYTES 16
#define L1_CACHE_SHIFT 4
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define SMP_CACHE_BYTES L1_CACHE_BYTES
@@ -66,7 +66,7 @@ void parisc_setup_cache_timing(void);
ALTERNATIVE(ALT_COND_NO_IOC_FDC, INSN_NOP) :::"memory")
#define asm_syncdma() asm volatile("syncdma" :::"memory")
-#endif /* ! __ASSEMBLY__ */
+#endif /* ! __ASSEMBLER__ */
/* Classes of processor wrt: disabling space register hashing */
diff --git a/arch/parisc/include/asm/current.h b/arch/parisc/include/asm/current.h
index dc7aea07c3f3..2814529a4c28 100644
--- a/arch/parisc/include/asm/current.h
+++ b/arch/parisc/include/asm/current.h
@@ -2,7 +2,7 @@
#ifndef _ASM_PARISC_CURRENT_H
#define _ASM_PARISC_CURRENT_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct task_struct;
static __always_inline struct task_struct *get_current(void)
@@ -16,6 +16,6 @@ static __always_inline struct task_struct *get_current(void)
#define current get_current()
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_PARISC_CURRENT_H */
diff --git a/arch/parisc/include/asm/dwarf.h b/arch/parisc/include/asm/dwarf.h
index f4512db86a19..526f4a79262c 100644
--- a/arch/parisc/include/asm/dwarf.h
+++ b/arch/parisc/include/asm/dwarf.h
@@ -6,7 +6,7 @@
#ifndef _ASM_PARISC_DWARF_H
#define _ASM_PARISC_DWARF_H
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define CFI_STARTPROC .cfi_startproc
#define CFI_ENDPROC .cfi_endproc
@@ -15,6 +15,6 @@
#define CFI_REL_OFFSET .cfi_rel_offset
#define CFI_UNDEFINED .cfi_undefined
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_PARISC_DWARF_H */
diff --git a/arch/parisc/include/asm/fixmap.h b/arch/parisc/include/asm/fixmap.h
index 5cd80ce1163a..9cafa449c4a7 100644
--- a/arch/parisc/include/asm/fixmap.h
+++ b/arch/parisc/include/asm/fixmap.h
@@ -39,7 +39,7 @@
#define KERNEL_MAP_START (GATEWAY_PAGE_SIZE)
#define KERNEL_MAP_END (FIXMAP_START)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
enum fixed_addresses {
@@ -59,6 +59,6 @@ extern void *parisc_vmalloc_start;
void set_fixmap(enum fixed_addresses idx, phys_addr_t phys);
void clear_fixmap(enum fixed_addresses idx);
-#endif /*__ASSEMBLY__*/
+#endif /*__ASSEMBLER__*/
#endif /*_ASM_FIXMAP_H*/
diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h
index f1cc1ee3a647..8b89d2b642eb 100644
--- a/arch/parisc/include/asm/ftrace.h
+++ b/arch/parisc/include/asm/ftrace.h
@@ -2,7 +2,7 @@
#ifndef _ASM_PARISC_FTRACE_H
#define _ASM_PARISC_FTRACE_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
extern void mcount(void);
#define MCOUNT_ADDR ((unsigned long)mcount)
@@ -29,6 +29,6 @@ unsigned long ftrace_call_adjust(unsigned long addr);
#define ftrace_return_address(n) return_address(n)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_PARISC_FTRACE_H */
diff --git a/arch/parisc/include/asm/jump_label.h b/arch/parisc/include/asm/jump_label.h
index 317ebc5edc9f..f325ae3c622f 100644
--- a/arch/parisc/include/asm/jump_label.h
+++ b/arch/parisc/include/asm/jump_label.h
@@ -2,7 +2,7 @@
#ifndef _ASM_PARISC_JUMP_LABEL_H
#define _ASM_PARISC_JUMP_LABEL_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#include <linux/stringify.h>
@@ -44,5 +44,5 @@ l_yes:
return true;
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif
diff --git a/arch/parisc/include/asm/kexec.h b/arch/parisc/include/asm/kexec.h
index 87e174006995..bf31e2d50df9 100644
--- a/arch/parisc/include/asm/kexec.h
+++ b/arch/parisc/include/asm/kexec.h
@@ -14,7 +14,7 @@
#define KEXEC_ARCH KEXEC_ARCH_PARISC
#define ARCH_HAS_KIMAGE_ARCH
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct kimage_arch {
unsigned long initrd_start;
@@ -28,6 +28,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
/* Dummy implementation for now */
}
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* _ASM_PARISC_KEXEC_H */
diff --git a/arch/parisc/include/asm/kgdb.h b/arch/parisc/include/asm/kgdb.h
index 317cd434bee3..9ece98bc6d9d 100644
--- a/arch/parisc/include/asm/kgdb.h
+++ b/arch/parisc/include/asm/kgdb.h
@@ -21,7 +21,7 @@
#define CACHE_FLUSH_IS_SAFE 1
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
static inline void arch_kgdb_breakpoint(void)
{
diff --git a/arch/parisc/include/asm/linkage.h b/arch/parisc/include/asm/linkage.h
index cd6fe4febead..d4cad492b971 100644
--- a/arch/parisc/include/asm/linkage.h
+++ b/arch/parisc/include/asm/linkage.h
@@ -15,7 +15,7 @@
*/
#define ASM_NL !
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
#define ENTRY(name) \
ALIGN !\
@@ -35,6 +35,6 @@ name: ASM_NL\
.procend ASM_NL\
ENDPROC(name)
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_PARISC_LINKAGE_H */
diff --git a/arch/parisc/include/asm/page.h b/arch/parisc/include/asm/page.h
index 7fd447092630..8f4e51071ea1 100644
--- a/arch/parisc/include/asm/page.h
+++ b/arch/parisc/include/asm/page.h
@@ -8,7 +8,7 @@
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/types.h>
#include <asm/cache.h>
@@ -93,7 +93,7 @@ typedef struct __physmem_range {
extern physmem_range_t pmem_ranges[];
extern int npmem_ranges;
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* WARNING: The definitions below must match exactly to sizeof(pte_t)
* etc
@@ -139,7 +139,7 @@ extern int npmem_ranges;
#define KERNEL_BINARY_TEXT_START (__PAGE_OFFSET + 0x100000)
/* These macros don't work for 64-bit C code -- don't allow in C at all */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
# define PA(x) ((x)-__PAGE_OFFSET)
# define VA(x) ((x)+__PAGE_OFFSET)
#endif
diff --git a/arch/parisc/include/asm/pdc.h b/arch/parisc/include/asm/pdc.h
index 5d2d9737e579..6080a1516b34 100644
--- a/arch/parisc/include/asm/pdc.h
+++ b/arch/parisc/include/asm/pdc.h
@@ -4,7 +4,7 @@
#include <uapi/asm/pdc.h>
-#if !defined(__ASSEMBLY__)
+#if !defined(__ASSEMBLER__)
extern int parisc_narrow_firmware;
@@ -109,5 +109,5 @@ static inline char * os_id_to_string(u16 os_id) {
}
}
-#endif /* !defined(__ASSEMBLY__) */
+#endif /* !defined(__ASSEMBLER__) */
#endif /* _PARISC_PDC_H */
diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h
index 8f160375b865..84ac81b1adde 100644
--- a/arch/parisc/include/asm/pdcpat.h
+++ b/arch/parisc/include/asm/pdcpat.h
@@ -210,7 +210,7 @@
#define PDC_PAT_SYSTEM_INFO 76L
/* PDC_PAT_SYSTEM_INFO uses the same options as PDC_SYSTEM_INFO function. */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/types.h>
#ifdef CONFIG_64BIT
@@ -389,6 +389,6 @@ extern int pdc_pat_mem_get_dimm_phys_location(
struct pdc_pat_mem_phys_mem_location *pret,
unsigned long phys_addr);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* ! __PARISC_PATPDC_H */
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index 2ca74a56415c..3b84ee93edaa 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -39,7 +39,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address)
ptdesc = pagetable_alloc(gfp, PMD_TABLE_ORDER);
if (!ptdesc)
return NULL;
- if (!pagetable_pmd_ctor(ptdesc)) {
+ if (!pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index babf65751e81..80f5e2a28413 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -12,7 +12,7 @@
#include <asm/fixmap.h>
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/*
* we simulate an x86-style page table for the linux mm code
*/
@@ -73,7 +73,7 @@ extern void __update_cache(pte_t pte);
mb(); \
} while(0)
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#define pte_ERROR(e) \
printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
@@ -226,7 +226,7 @@ extern void __update_cache(pte_t pte);
#define PxD_FLAG_SHIFT (4)
#define PxD_VALUE_SHIFT (PFN_PTE_SHIFT-PxD_FLAG_SHIFT)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_USER)
#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | _PAGE_WRITE)
@@ -338,10 +338,6 @@ static inline pte_t pte_mkspecial(pte_t pte) { pte_val(pte) |= _PAGE_SPECIAL; re
#endif
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
#define __mk_pte(addr,pgprot) \
({ \
pte_t __pte; \
@@ -351,8 +347,6 @@ static inline pte_t pte_mkspecial(pte_t pte) { pte_val(pte) |= _PAGE_SPECIAL; re
__pte; \
})
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
{
pte_t pte;
@@ -477,7 +471,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
#define pte_same(A,B) (pte_val(A) == pte_val(B))
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
/* TLB page size encoding - see table 3-1 in parisc20.pdf */
diff --git a/arch/parisc/include/asm/prefetch.h b/arch/parisc/include/asm/prefetch.h
index 6e63f720024d..748eefb27c68 100644
--- a/arch/parisc/include/asm/prefetch.h
+++ b/arch/parisc/include/asm/prefetch.h
@@ -16,7 +16,7 @@
#ifndef __ASM_PARISC_PREFETCH_H
#define __ASM_PARISC_PREFETCH_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_PREFETCH
#define ARCH_HAS_PREFETCH
@@ -40,6 +40,6 @@ static inline void prefetchw(const void *addr)
#endif /* CONFIG_PA20 */
#endif /* CONFIG_PREFETCH */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_PARISC_PROCESSOR_H */
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index 77fac02188e1..4c14bde39aac 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -9,7 +9,7 @@
#ifndef __ASM_PARISC_PROCESSOR_H
#define __ASM_PARISC_PROCESSOR_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <linux/threads.h>
#include <linux/irqreturn.h>
@@ -20,7 +20,7 @@
#include <asm/ptrace.h>
#include <asm/types.h>
#include <asm/percpu.h>
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define HAVE_ARCH_PICK_MMAP_LAYOUT
@@ -45,7 +45,7 @@
#define STACK_TOP TASK_SIZE
#define STACK_TOP_MAX DEFAULT_TASK_SIZE
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
struct rlimit;
unsigned long mmap_upper_limit(struct rlimit *rlim_stack);
@@ -325,6 +325,6 @@ extern void sba_directed_lmmio(struct parisc_device *, struct resource *);
extern void lba_set_iregs(struct parisc_device *lba, u32 ibase, u32 imask);
extern void ccio_cujo20_fixup(struct parisc_device *dev, u32 iovp);
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#endif /* __ASM_PARISC_PROCESSOR_H */
diff --git a/arch/parisc/include/asm/psw.h b/arch/parisc/include/asm/psw.h
index 46921ffcc407..9140e1ab7e63 100644
--- a/arch/parisc/include/asm/psw.h
+++ b/arch/parisc/include/asm/psw.h
@@ -60,7 +60,7 @@
#define USER_PSW_MASK (WIDE_PSW | PSW_T | PSW_N | PSW_X | PSW_B | PSW_V | PSW_CB)
#define USER_PSW (PSW_C | PSW_Q | PSW_P | PSW_D | PSW_I)
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
/* The program status word as bitfields. */
struct pa_psw {
@@ -99,6 +99,6 @@ struct pa_psw {
#define pa_psw(task) ((struct pa_psw *) ((char *) (task) + TASK_PT_PSW))
#endif
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
#endif
diff --git a/arch/parisc/include/asm/signal.h b/arch/parisc/include/asm/signal.h
index e84883c6b4c7..85c3d7409bbc 100644
--- a/arch/parisc/include/asm/signal.h
+++ b/arch/parisc/include/asm/signal.h
@@ -4,12 +4,12 @@
#include <uapi/asm/signal.h>
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
/* Most things should be clean enough to redefine this at will, if care
is taken to make libc match. */
#include <asm/sigcontext.h>
-#endif /* !__ASSEMBLY */
+#endif /* !__ASSEMBLER__ */
#endif /* _ASM_PARISC_SIGNAL_H */
diff --git a/arch/parisc/include/asm/smp.h b/arch/parisc/include/asm/smp.h
index 94d1f21ce99a..0cf1c3a2696a 100644
--- a/arch/parisc/include/asm/smp.h
+++ b/arch/parisc/include/asm/smp.h
@@ -12,7 +12,7 @@ extern int init_per_cpu(int cpuid);
#define PDC_OS_BOOT_RENDEZVOUS 0x10
#define PDC_OS_BOOT_RENDEZVOUS_HI 0x28
-#ifndef ASSEMBLY
+#ifndef __ASSEMBLER__
#include <linux/bitops.h>
#include <linux/threads.h> /* for NR_CPUS */
#include <linux/cpumask.h>
@@ -34,7 +34,7 @@ extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
#define raw_smp_processor_id() (current_thread_info()->cpu)
-#endif /* !ASSEMBLY */
+#endif /* !__ASSEMBLER__ */
#else /* CONFIG_SMP */
diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
index 7b986b09dba8..8e6889bc23cc 100644
--- a/arch/parisc/include/asm/spinlock_types.h
+++ b/arch/parisc/include/asm/spinlock_types.h
@@ -6,7 +6,7 @@
#define SPINLOCK_BREAK_INSN 0x0000c006 /* break 6,6 */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
typedef struct {
volatile unsigned int lock[4];
@@ -26,7 +26,7 @@ typedef struct {
volatile unsigned int counter;
} arch_rwlock_t;
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000
#define __ARCH_RW_LOCK_UNLOCKED { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \
diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h
index 00b127a5e09b..c11222798ab2 100644
--- a/arch/parisc/include/asm/syscall.h
+++ b/arch/parisc/include/asm/syscall.h
@@ -17,6 +17,13 @@ static inline long syscall_get_nr(struct task_struct *tsk,
return regs->gr[20];
}
+static inline void syscall_set_nr(struct task_struct *tsk,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->gr[20] = nr;
+}
+
static inline void syscall_get_arguments(struct task_struct *tsk,
struct pt_regs *regs,
unsigned long *args)
@@ -29,6 +36,18 @@ static inline void syscall_get_arguments(struct task_struct *tsk,
args[0] = regs->gr[26];
}
+static inline void syscall_set_arguments(struct task_struct *tsk,
+ struct pt_regs *regs,
+ unsigned long *args)
+{
+ regs->gr[21] = args[5];
+ regs->gr[22] = args[4];
+ regs->gr[23] = args[3];
+ regs->gr[24] = args[2];
+ regs->gr[25] = args[1];
+ regs->gr[26] = args[0];
+}
+
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h
index 1a58795f785c..b283738bb6da 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -2,7 +2,7 @@
#ifndef _ASM_PARISC_THREAD_INFO_H
#define _ASM_PARISC_THREAD_INFO_H
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#include <asm/processor.h>
#include <asm/special_insns.h>
@@ -20,7 +20,7 @@ struct thread_info {
.preempt_count = INIT_PREEMPT_COUNT, \
}
-#endif /* !__ASSEMBLY */
+#endif /* !__ASSEMBLER__ */
/* thread information allocation */
diff --git a/arch/parisc/include/asm/traps.h b/arch/parisc/include/asm/traps.h
index 0ccdb738a9a3..10c8fb68e404 100644
--- a/arch/parisc/include/asm/traps.h
+++ b/arch/parisc/include/asm/traps.h
@@ -4,7 +4,7 @@
#define PARISC_ITLB_TRAP 6 /* defined by architecture. Do not change. */
-#if !defined(__ASSEMBLY__)
+#if !defined(__ASSEMBLER__)
struct pt_regs;
/* traps.c */
diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h
index a97c0fd55f91..3e46c6ea9df6 100644
--- a/arch/parisc/include/asm/unistd.h
+++ b/arch/parisc/include/asm/unistd.h
@@ -6,7 +6,7 @@
#define __NR_Linux_syscalls __NR_syscalls
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#define SYS_ify(syscall_name) __NR_##syscall_name
@@ -144,7 +144,7 @@
#define __ARCH_WANT_SYS_UTIME
#endif
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
#undef STR
diff --git a/arch/parisc/include/asm/vdso.h b/arch/parisc/include/asm/vdso.h
index 5f581c1d6460..81bc1d42802a 100644
--- a/arch/parisc/include/asm/vdso.h
+++ b/arch/parisc/include/asm/vdso.h
@@ -2,7 +2,7 @@
#ifndef __PARISC_VDSO_H__
#define __PARISC_VDSO_H__
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_64BIT
#include <generated/vdso64-offsets.h>
@@ -12,7 +12,7 @@
#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name))
#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name))
-#endif /* __ASSEMBLY __ */
+#endif /* __ASSEMBLER__ */
/* Default link addresses for the vDSOs */
#define VDSO_LBASE 0
diff --git a/arch/parisc/include/uapi/asm/pdc.h b/arch/parisc/include/uapi/asm/pdc.h
index fef4f2e96160..65031ddf8372 100644
--- a/arch/parisc/include/uapi/asm/pdc.h
+++ b/arch/parisc/include/uapi/asm/pdc.h
@@ -361,7 +361,7 @@
/* size of the pdc_result buffer for firmware.c */
#define NUM_PDC_RESULT 32
-#if !defined(__ASSEMBLY__)
+#if !defined(__ASSEMBLER__)
/* flags for hardware_path */
#define PF_AUTOBOOT 0x80
@@ -741,6 +741,6 @@ struct pdc_firm_test_get_rtn_block { /* PDC_MODEL/PDC_FIRM_TEST_GET */
#define PIRANHA_CPU_ID 0x13
#define MAKO_CPU_ID 0x14
-#endif /* !defined(__ASSEMBLY__) */
+#endif /* !defined(__ASSEMBLER__) */
#endif /* _UAPI_PARISC_PDC_H */
diff --git a/arch/parisc/include/uapi/asm/signal.h b/arch/parisc/include/uapi/asm/signal.h
index 40d7a574c5dd..d99accf37341 100644
--- a/arch/parisc/include/uapi/asm/signal.h
+++ b/arch/parisc/include/uapi/asm/signal.h
@@ -61,7 +61,7 @@
#define _NSIG_BPW (sizeof(unsigned long) * 8)
#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
-# ifndef __ASSEMBLY__
+# ifndef __ASSEMBLER__
# include <linux/types.h>
@@ -80,5 +80,5 @@ typedef struct sigaltstack {
__kernel_size_t ss_size;
} stack_t;
-#endif /* !__ASSEMBLY */
+#endif /* !__ASSEMBLER__ */
#endif /* _UAPI_ASM_PARISC_SIGNAL_H */
diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c
index f4626943633a..00e97204783e 100644
--- a/arch/parisc/kernel/unaligned.c
+++ b/arch/parisc/kernel/unaligned.c
@@ -25,7 +25,7 @@
#define DPRINTF(fmt, args...)
#endif
-#define RFMT "%#08lx"
+#define RFMT "0x%08lx"
/* 1111 1100 0000 0000 0001 0011 1100 0000 */
#define OPCODE1(a,b,c) ((a)<<26|(b)<<12|(c)<<6)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 6d98e6f08d4d..6ed93e290c2f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1096,7 +1096,6 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
-extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
extern pud_t pud_modify(pud_t pud, pgprot_t newprot);
extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 2f72ad885332..93d77ad5a92f 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -53,9 +53,8 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
#define MAX_PTRS_PER_PGD PTRS_PER_PGD
#endif
-/* Keep these as a macros to avoid include dependency mess */
+/* Keep this as a macro to avoid include dependency mess */
#define pte_page(x) pfn_to_page(pte_pfn(x))
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
static inline unsigned long pte_pfn(pte_t pte)
{
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index 3dd36c5e334a..4b3c52ed6e9d 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -39,6 +39,16 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return -1;
}
+static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr)
+{
+ /*
+ * Unlike syscall_get_nr(), syscall_set_nr() can be called only when
+ * the target task is stopped for tracing on entering syscall, so
+ * there is no need to have the same check syscall_get_nr() has.
+ */
+ regs->gpr[0] = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -110,6 +120,16 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ memcpy(&regs->gpr[3], args, 6 * sizeof(args[0]));
+
+ /* Also copy the first argument into orig_gpr3 */
+ regs->orig_gpr3 = args[0];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
if (is_tsk_32bit_task(task))
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 8f7d41ce2ca1..0db01e10a3f8 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -269,11 +269,6 @@ pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot)
return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot));
}
-pmd_t mk_pmd(struct page *page, pgprot_t pgprot)
-{
- return pfn_pmd(page_to_pfn(page), pgprot);
-}
-
pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
unsigned long pmdv;
@@ -422,7 +417,7 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
ptdesc = pagetable_alloc(gfp, 0);
if (!ptdesc)
return NULL;
- if (!pagetable_pmd_ctor(ptdesc)) {
+ if (!pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index 713268ccb1a0..77e55eac16e4 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -56,19 +56,17 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
{
void *ret = NULL;
struct ptdesc *ptdesc;
+ gfp_t gfp = PGALLOC_GFP;
- if (!kernel) {
- ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0);
- if (!ptdesc)
- return NULL;
- if (!pagetable_pte_ctor(ptdesc)) {
- pagetable_free(ptdesc);
- return NULL;
- }
- } else {
- ptdesc = pagetable_alloc(PGALLOC_GFP, 0);
- if (!ptdesc)
- return NULL;
+ if (!kernel)
+ gfp |= __GFP_ACCOUNT;
+
+ ptdesc = pagetable_alloc(gfp, 0);
+ if (!ptdesc)
+ return NULL;
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
+ pagetable_free(ptdesc);
+ return NULL;
}
atomic_set(&ptdesc->pt_frag_refcount, 1);
@@ -124,12 +122,10 @@ void pte_fragment_free(unsigned long *table, int kernel)
BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0);
if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) {
- if (kernel)
- pagetable_free(ptdesc);
- else if (folio_test_clear_active(ptdesc_folio(ptdesc)))
- call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
- else
+ if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc)))
pte_free_now(&ptdesc->pt_rcu_head);
+ else
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
}
}
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index 9dc239967b77..b2358d794855 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -298,6 +298,38 @@ static void populate_markers(void)
#endif
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
static int ptdump_show(struct seq_file *m, void *v)
{
struct pg_state st = {
@@ -305,7 +337,12 @@ static int ptdump_show(struct seq_file *m, void *v)
.marker = address_markers,
.level = -1,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = ptdump_range,
}
};
@@ -338,7 +375,12 @@ bool ptdump_check_wx(void)
.level = -1,
.check_wx = true,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = ptdump_range,
}
};
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 0897dd99ab8d..188fadc1c21f 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -262,8 +262,6 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
return __page_val_to_pfn(pmd_val(pmd));
}
-#define mk_pmd(page, prot) pfn_pmd(page_to_pfn(page), prot)
-
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 428e48e5f57d..f19240fd018e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -343,8 +343,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
return __pte((pfn << _PAGE_PFN_SHIFT) | prot_val);
}
-#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot)
-
#define pte_pgprot pte_pgprot
static inline pgprot_t pte_pgprot(pte_t pte)
{
diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h
index eceabf59ae48..34313387f977 100644
--- a/arch/riscv/include/asm/syscall.h
+++ b/arch/riscv/include/asm/syscall.h
@@ -30,6 +30,13 @@ static inline int syscall_get_nr(struct task_struct *task,
return regs->a7;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->a7 = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -69,6 +76,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[5] = regs->a5;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ regs->orig_a0 = args[0];
+ regs->a1 = args[1];
+ regs->a2 = args[2];
+ regs->a3 = args[3];
+ regs->a4 = args[4];
+ regs->a5 = args[5];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_64BIT
diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c
index 43e472ff3e1a..806c41931cde 100644
--- a/arch/riscv/kvm/aia_device.c
+++ b/arch/riscv/kvm/aia_device.c
@@ -12,36 +12,6 @@
#include <linux/kvm_host.h>
#include <linux/uaccess.h>
-static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
-{
- struct kvm_vcpu *tmp_vcpu;
-
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&tmp_vcpu->mutex);
- }
-}
-
-static void unlock_all_vcpus(struct kvm *kvm)
-{
- unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1);
-}
-
-static bool lock_all_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *tmp_vcpu;
- unsigned long c;
-
- kvm_for_each_vcpu(c, tmp_vcpu, kvm) {
- if (!mutex_trylock(&tmp_vcpu->mutex)) {
- unlock_vcpus(kvm, c - 1);
- return false;
- }
- }
-
- return true;
-}
-
static int aia_create(struct kvm_device *dev, u32 type)
{
int ret;
@@ -53,7 +23,7 @@ static int aia_create(struct kvm_device *dev, u32 type)
return -EEXIST;
ret = -EBUSY;
- if (!lock_all_vcpus(kvm))
+ if (kvm_trylock_all_vcpus(kvm))
return ret;
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -65,7 +35,7 @@ static int aia_create(struct kvm_device *dev, u32 type)
kvm->arch.aia.in_kernel = true;
out_unlock:
- unlock_all_vcpus(kvm);
+ kvm_unlock_all_vcpus(kvm);
return ret;
}
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index b81672729887..b8e96dfff19d 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -172,7 +172,7 @@ static void set_icache_stale_mask(void)
stale_cpu = cpumask_test_cpu(cpu, mask);
cpumask_setall(mask);
- cpumask_assign_cpu(cpu, mask, stale_cpu);
+ __assign_cpu(cpu, mask, stale_cpu);
put_cpu();
}
#endif
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index ab475ec6ca42..8d0374d7ce8e 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -442,7 +442,12 @@ static phys_addr_t __meminit alloc_pte_late(uintptr_t va)
{
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc));
+ /*
+ * We do not know which mm the PTE page is associated to at this point.
+ * Passing NULL to the ctor is the safe option, though it may result
+ * in unnecessary work (e.g. initialising the ptlock for init_mm).
+ */
+ BUG_ON(!ptdesc || !pagetable_pte_ctor(NULL, ptdesc));
return __pa((pte_t *)ptdesc_address(ptdesc));
}
@@ -522,7 +527,8 @@ static phys_addr_t __meminit alloc_pmd_late(uintptr_t va)
{
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc));
+	/* See comment in alloc_pte_late() regarding NULL being passed to the ctor */
+ BUG_ON(!ptdesc || !pagetable_pmd_ctor(NULL, ptdesc));
return __pa((pmd_t *)ptdesc_address(ptdesc));
}
@@ -584,11 +590,11 @@ static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
static phys_addr_t __meminit alloc_pud_late(uintptr_t va)
{
- unsigned long vaddr;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
- vaddr = __get_free_page(GFP_KERNEL);
- BUG_ON(!vaddr);
- return __pa(vaddr);
+ BUG_ON(!ptdesc);
+ pagetable_pud_ctor(ptdesc);
+ return __pa((pud_t *)ptdesc_address(ptdesc));
}
static p4d_t *__init get_p4d_virt_early(phys_addr_t pa)
@@ -622,11 +628,11 @@ static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va)
static phys_addr_t __meminit alloc_p4d_late(uintptr_t va)
{
- unsigned long vaddr;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
- vaddr = __get_free_page(GFP_KERNEL);
- BUG_ON(!vaddr);
- return __pa(vaddr);
+ BUG_ON(!ptdesc);
+ pagetable_p4d_ctor(ptdesc);
+ return __pa((p4d_t *)ptdesc_address(ptdesc));
}
static void __meminit create_pud_mapping(pud_t *pudp, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 9d5f657a251b..32922550a50a 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -318,6 +318,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
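+/* Final callback of the walk: emit a level -1 terminator so note_page() prints the last range. */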
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo)
{
struct pg_state st = {
@@ -325,7 +357,12 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo)
.marker = pinfo->markers,
.level = -1,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{pinfo->base_addr, pinfo->end},
{0, 0}
@@ -347,7 +384,12 @@ bool ptdump_check_wx(void)
.level = -1,
.check_wx = true,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{KERN_VIRT_START, ULONG_MAX},
{0, 0}
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 9f2814d0e1e9..66c5808fd011 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -110,7 +110,6 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
-void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
void __gmap_zap(struct gmap *, unsigned long gaddr);
void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
@@ -134,7 +133,6 @@ int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
-int s390_disable_cow_sharing(void);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
diff --git a/arch/s390/include/asm/gmap_helpers.h b/arch/s390/include/asm/gmap_helpers.h
new file mode 100644
index 000000000000..5356446a61c4
--- /dev/null
+++ b/arch/s390/include/asm/gmap_helpers.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Helper functions for KVM guest address space mapping code
+ *
+ * Copyright IBM Corp. 2025
+ */
+
+#ifndef _ASM_S390_GMAP_HELPERS_H
+#define _ASM_S390_GMAP_HELPERS_H
+
+void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr);
+void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end);
+int gmap_helper_disable_cow_sharing(void);
+
+#endif /* _ASM_S390_GMAP_HELPERS_H */
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 005497ffebda..5345398df653 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -97,7 +97,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
if (!table)
return NULL;
crst_table_init(table, _SEGMENT_ENTRY_EMPTY);
- if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) {
+ if (!pagetable_pmd_ctor(mm, virt_to_ptdesc(table))) {
crst_table_free(mm, table);
return NULL;
}
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index f8a6b54986ec..1c661ac62ce8 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1448,16 +1448,6 @@ static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot)
return pte_mkyoung(__pte);
}
-static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
-{
- unsigned long physpage = page_to_phys(page);
- pte_t __pte = mk_pte_phys(physpage, pgprot);
-
- if (pte_write(__pte) && PageDirty(page))
- __pte = pte_mkdirty(__pte);
- return __pte;
-}
-
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
#define p4d_index(address) (((address) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
@@ -1879,7 +1869,6 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
#define pmdp_collapse_flush pmdp_collapse_flush
#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot))
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
static inline int pmd_trans_huge(pmd_t pmd)
{
diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h
index 0213ec800b57..bd4cb00ccd5e 100644
--- a/arch/s390/include/asm/syscall.h
+++ b/arch/s390/include/asm/syscall.h
@@ -24,6 +24,18 @@ static inline long syscall_get_nr(struct task_struct *task,
(regs->int_code & 0xffff) : -1;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ /*
+ * Unlike syscall_get_nr(), syscall_set_nr() can be called only when
+ * the target task is stopped for tracing on entering syscall, so
+ * there is no need to have the same check syscall_get_nr() has.
+ */
+ regs->int_code = (regs->int_code & ~0xffff) | (nr & 0xffff);
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -76,6 +88,15 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[0] = regs->orig_gpr2 & mask;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
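+ /*
+ * Mirror syscall_get_arguments(): the first argument lives in
+ * orig_gpr2, the remaining ones in gprs 3-7.
+ */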
+ regs->orig_gpr2 = args[0];
+ for (int n = 1; n < 6; n++)
+ regs->gprs[2 + n] = args[n];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
#ifdef CONFIG_COMPAT
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index f20601995bb0..1e50f6f1ad9d 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -36,11 +36,12 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>
+#include <asm/gmap.h>
/*
* Release the page cache reference for a pte removed by
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
- * has already been freed, so just do free_page_and_swap_cache.
+ * has already been freed, so just do free_folio_and_swap_cache.
*
* s390 doesn't delay rmap removal.
*/
@@ -49,7 +50,7 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
{
VM_WARN_ON_ONCE(delay_rmap);
- free_page_and_swap_cache(page);
+ free_folio_and_swap_cache(page_folio(page));
return false;
}
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index b008402ec9aa..8018549a1ad2 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -16,7 +16,6 @@
#include <linux/bug.h>
#include <linux/sched.h>
#include <asm/page.h>
-#include <asm/gmap.h>
#include <asm/asm.h>
#define UVC_CC_OK 0
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 4ab0b6b4866e..b99478e84da4 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -15,6 +15,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/pagewalk.h>
+#include <linux/backing-dev.h>
#include <asm/facility.h>
#include <asm/sections.h>
#include <asm/uv.h>
@@ -135,7 +136,7 @@ int uv_destroy_folio(struct folio *folio)
{
int rc;
- /* See gmap_make_secure(): large folios cannot be secure */
+ /* Large folios cannot be secure */
if (unlikely(folio_test_large(folio)))
return 0;
@@ -184,7 +185,7 @@ int uv_convert_from_secure_folio(struct folio *folio)
{
int rc;
- /* See gmap_make_secure(): large folios cannot be secure */
+ /* Large folios cannot be secure */
if (unlikely(folio_test_large(folio)))
return 0;
@@ -324,32 +325,87 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u
}
/**
- * s390_wiggle_split_folio() - try to drain extra references to a folio and optionally split.
+ * s390_wiggle_split_folio() - try to drain extra references to a folio and
+ * split the folio if it is large.
* @mm: the mm containing the folio to work on
* @folio: the folio
- * @split: whether to split a large folio
*
* Context: Must be called while holding an extra reference to the folio;
* the mm lock should not be held.
- * Return: 0 if the folio was split successfully;
- * -EAGAIN if the folio was not split successfully but another attempt
- * can be made, or if @split was set to false;
- * -EINVAL in case of other errors. See split_folio().
+ * Return: 0 if the operation was successful;
+ * -EAGAIN if splitting the large folio was not successful,
+ * but another attempt can be made;
+ * -EINVAL in case of other folio splitting errors. See split_folio().
*/
-static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio, bool split)
+static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
{
- int rc;
+ int rc, tried_splits;
lockdep_assert_not_held(&mm->mmap_lock);
folio_wait_writeback(folio);
lru_add_drain_all();
- if (split) {
+
+ if (!folio_test_large(folio))
+ return 0;
+
+ for (tried_splits = 0; tried_splits < 2; tried_splits++) {
+ struct address_space *mapping;
+ loff_t lstart, lend;
+ struct inode *inode;
+
folio_lock(folio);
rc = split_folio(folio);
+ if (rc != -EBUSY) {
+ folio_unlock(folio);
+ return rc;
+ }
+
+ /*
+ * Splitting can fail with -EBUSY for various reasons, but we
+ * have to handle one case explicitly for now: some mappings
+ * don't allow for splitting dirty folios; writeback will
+ * mark them clean again, including marking all page table
+ * entries mapping the folio read-only, to catch future write
+ * attempts.
+ *
+ * While the system should be writing back dirty folios in the
+ * background, we obtained this folio by looking up a writable
+ * page table entry. On these problematic mappings, writable
+ * page table entries imply dirty folios, preventing the
+ * split in the first place.
+ *
+ * To prevent a livelock, trigger writeback manually and immediately
+ * try to split again, rather than letting the caller look up the
+ * folio again in the page table (turning it dirty again).
+ *
+ * This is only a problem for some mappings (e.g., XFS);
+ * mappings that do not support writeback (e.g., shmem) are not
+ * affected.
+ */
+ if (!folio_test_dirty(folio) || folio_test_anon(folio) ||
+ !folio->mapping || !mapping_can_writeback(folio->mapping)) {
+ folio_unlock(folio);
+ break;
+ }
+
+ /*
+ * Ideally, we'd only trigger writeback on this exact folio. But
+ * there is no easy way to do that, so we'll stabilize the
+ * mapping while we still hold the folio lock, so we can drop
+ * the folio lock to trigger writeback on the range currently
+ * covered by the folio instead.
+ */
+ mapping = folio->mapping;
+ lstart = folio_pos(folio);
+ lend = lstart + folio_size(folio) - 1;
+ inode = igrab(mapping->host);
folio_unlock(folio);
- if (rc != -EBUSY)
- return rc;
+ if (unlikely(!inode))
+ break;
+
+ filemap_write_and_wait_range(mapping, lstart, lend);
+ iput(mapping->host);
}
return -EAGAIN;
}
@@ -393,8 +449,11 @@ int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header
folio_walk_end(&fw, vma);
mmap_read_unlock(mm);
- if (rc == -E2BIG || rc == -EBUSY)
- rc = s390_wiggle_split_folio(mm, folio, rc == -E2BIG);
+ if (rc == -E2BIG || rc == -EBUSY) {
+ rc = s390_wiggle_split_folio(mm, folio);
+ if (!rc)
+ rc = -EAGAIN;
+ }
folio_put(folio);
return rc;
@@ -403,15 +462,15 @@ EXPORT_SYMBOL_GPL(make_hva_secure);
/*
* To be called with the folio locked or with an extra reference! This will
- * prevent gmap_make_secure from touching the folio concurrently. Having 2
- * parallel arch_make_folio_accessible is fine, as the UV calls will become a
- * no-op if the folio is already exported.
+ * prevent kvm_s390_pv_make_secure() from touching the folio concurrently.
+ * Having 2 parallel arch_make_folio_accessible is fine, as the UV calls will
+ * become a no-op if the folio is already exported.
*/
int arch_make_folio_accessible(struct folio *folio)
{
int rc = 0;
- /* See gmap_make_secure(): large folios cannot be secure */
+ /* Large folios cannot be secure */
if (unlikely(folio_test_large(folio)))
return 0;
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index f0ffe874adc2..9a723c48b05a 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -8,7 +8,7 @@ include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap.o gmap-vsie.o
+kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 74f73141f9b9..53233dec8cad 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -11,12 +11,30 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/gmap.h>
+#include <asm/gmap_helpers.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
#include "trace.h"
#include "trace-s390.h"
#include "gaccess.h"
+static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslot_iter iter;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned long start, end;
+
+ slots = kvm_vcpu_memslots(vcpu);
+
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
+ slot = iter.slot;
+ start = __gfn_to_hva_memslot(slot, max(gfn_start, slot->base_gfn));
+ end = __gfn_to_hva_memslot(slot, min(gfn_end, slot->base_gfn + slot->npages));
+ gmap_helper_discard(vcpu->kvm->mm, start, end);
+ }
+}
+
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
unsigned long start, end;
@@ -32,12 +50,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end);
+ mmap_read_lock(vcpu->kvm->mm);
/*
* We checked for start >= end above, so let's check for the
* fast path (no prefix swap page involved)
*/
if (end <= prefix || start >= prefix + 2 * PAGE_SIZE) {
- gmap_discard(vcpu->arch.gmap, start, end);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end));
} else {
/*
* This is slow path. gmap_discard will check for start
@@ -45,13 +64,14 @@ static int diag_release_pages(struct kvm_vcpu *vcpu)
* prefix and let gmap_discard make some of these calls
* NOPs.
*/
- gmap_discard(vcpu->arch.gmap, start, prefix);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(prefix));
if (start <= prefix)
- gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE);
+ do_discard_gfn_range(vcpu, 0, 1);
if (end > prefix + PAGE_SIZE)
- gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE);
- gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end);
+ do_discard_gfn_range(vcpu, 1, 2);
+ do_discard_gfn_range(vcpu, gpa_to_gfn(prefix) + 2, gpa_to_gfn(end));
}
+ mmap_read_unlock(vcpu->kvm->mm);
return 0;
}
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index f6fded15633a..e23670e1949c 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -16,9 +16,10 @@
#include <asm/gmap.h>
#include <asm/dat-bits.h>
#include "kvm-s390.h"
-#include "gmap.h"
#include "gaccess.h"
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
/*
* vaddress union in order to easily decode a virtual address into its
* region first index, region second index etc. parts.
diff --git a/arch/s390/kvm/gmap-vsie.c b/arch/s390/kvm/gmap-vsie.c
index a6d1dbb04c97..56ef153eb8fe 100644
--- a/arch/s390/kvm/gmap-vsie.c
+++ b/arch/s390/kvm/gmap-vsie.c
@@ -22,7 +22,6 @@
#include <asm/uv.h>
#include "kvm-s390.h"
-#include "gmap.h"
/**
* gmap_find_shadow - find a specific asce in the list of shadow tables
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
deleted file mode 100644
index 6d8944d1b4a0..000000000000
--- a/arch/s390/kvm/gmap.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Guest memory management for KVM/s390
- *
- * Copyright IBM Corp. 2008, 2020, 2024
- *
- * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
- * Martin Schwidefsky <schwidefsky@de.ibm.com>
- * David Hildenbrand <david@redhat.com>
- * Janosch Frank <frankja@linux.vnet.ibm.com>
- */
-
-#include <linux/compiler.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/pgtable.h>
-#include <linux/pagemap.h>
-
-#include <asm/lowcore.h>
-#include <asm/gmap.h>
-#include <asm/uv.h>
-
-#include "gmap.h"
-
-/**
- * gmap_make_secure() - make one guest page secure
- * @gmap: the guest gmap
- * @gaddr: the guest address that needs to be made secure
- * @uvcb: the UVCB specifying which operation needs to be performed
- *
- * Context: needs to be called with kvm->srcu held.
- * Return: 0 on success, < 0 in case of error.
- */
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb)
-{
- struct kvm *kvm = gmap->private;
- unsigned long vmaddr;
-
- lockdep_assert_held(&kvm->srcu);
-
- vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
- if (kvm_is_error_hva(vmaddr))
- return -EFAULT;
- return make_hva_secure(gmap->mm, vmaddr, uvcb);
-}
-
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr)
-{
- struct uv_cb_cts uvcb = {
- .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
- .header.len = sizeof(uvcb),
- .guest_handle = gmap->guest_handle,
- .gaddr = gaddr,
- };
-
- return gmap_make_secure(gmap, gaddr, &uvcb);
-}
-
-/**
- * __gmap_destroy_page() - Destroy a guest page.
- * @gmap: the gmap of the guest
- * @page: the page to destroy
- *
- * An attempt will be made to destroy the given guest page. If the attempt
- * fails, an attempt is made to export the page. If both attempts fail, an
- * appropriate error is returned.
- *
- * Context: must be called holding the mm lock for gmap->mm
- */
-static int __gmap_destroy_page(struct gmap *gmap, struct page *page)
-{
- struct folio *folio = page_folio(page);
- int rc;
-
- /*
- * See gmap_make_secure(): large folios cannot be secure. Small
- * folio implies FW_LEVEL_PTE.
- */
- if (folio_test_large(folio))
- return -EFAULT;
-
- rc = uv_destroy_folio(folio);
- /*
- * Fault handlers can race; it is possible that two CPUs will fault
- * on the same secure page. One CPU can destroy the page, reboot,
- * re-enter secure mode and import it, while the second CPU was
- * stuck at the beginning of the handler. At some point the second
- * CPU will be able to progress, and it will not be able to destroy
- * the page. In that case we do not want to terminate the process,
- * we instead try to export the page.
- */
- if (rc)
- rc = uv_convert_from_secure_folio(folio);
-
- return rc;
-}
-
-/**
- * gmap_destroy_page() - Destroy a guest page.
- * @gmap: the gmap of the guest
- * @gaddr: the guest address to destroy
- *
- * An attempt will be made to destroy the given guest page. If the attempt
- * fails, an attempt is made to export the page. If both attempts fail, an
- * appropriate error is returned.
- *
- * Context: may sleep.
- */
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr)
-{
- struct page *page;
- int rc = 0;
-
- mmap_read_lock(gmap->mm);
- page = gfn_to_page(gmap->private, gpa_to_gfn(gaddr));
- if (page)
- rc = __gmap_destroy_page(gmap, page);
- kvm_release_page_clean(page);
- mmap_read_unlock(gmap->mm);
- return rc;
-}
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
deleted file mode 100644
index c8f031c9ea5f..000000000000
--- a/arch/s390/kvm/gmap.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * KVM guest address space mapping code
- *
- * Copyright IBM Corp. 2007, 2016, 2025
- * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
- * Claudio Imbrenda <imbrenda@linux.ibm.com>
- */
-
-#ifndef ARCH_KVM_S390_GMAP_H
-#define ARCH_KVM_S390_GMAP_H
-
-#define GMAP_SHADOW_FAKE_TABLE 1ULL
-
-int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb);
-int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr);
-int gmap_destroy_page(struct gmap *gmap, unsigned long gaddr);
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
-
-/**
- * gmap_shadow_valid - check if a shadow guest address space matches the
- * given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns 1 if the gmap shadow is still valid and matches the given
- * properties, the caller can continue using it. Returns 0 otherwise, the
- * caller has to request a new shadow gmap in this case.
- *
- */
-static inline int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
-{
- if (sg->removed)
- return 0;
- return sg->orig_asce == asce && sg->edat_level == edat_level;
-}
-
-#endif
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index a06a000f196c..c7908950c1f4 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -21,7 +21,6 @@
#include "gaccess.h"
#include "trace.h"
#include "trace-s390.h"
-#include "gmap.h"
u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
{
@@ -545,7 +544,7 @@ static int handle_pv_uvc(struct kvm_vcpu *vcpu)
guest_uvcb->header.cmd);
return 0;
}
- rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb);
+ rc = kvm_s390_pv_make_secure(vcpu->kvm, uvcb.gaddr, &uvcb);
/*
* If the unpin did not succeed, the guest will exit again for the UVC
* and we will retry the unpin.
@@ -653,10 +652,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
break;
case ICPT_PV_PREF:
rc = 0;
- gmap_convert_to_secure(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu));
- gmap_convert_to_secure(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
+ kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu));
+ kvm_s390_pv_convert_to_secure(vcpu->kvm, kvm_s390_get_prefix(vcpu) + PAGE_SIZE);
break;
default:
return -EOPNOTSUPP;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3f3175193fd7..d5ad10791c25 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -40,6 +40,7 @@
#include <asm/machine.h>
#include <asm/stp.h>
#include <asm/gmap.h>
+#include <asm/gmap_helpers.h>
#include <asm/nmi.h>
#include <asm/isc.h>
#include <asm/sclp.h>
@@ -52,7 +53,6 @@
#include "kvm-s390.h"
#include "gaccess.h"
#include "pci.h"
-#include "gmap.h"
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -2674,7 +2674,9 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
if (r)
break;
- r = s390_disable_cow_sharing();
+ mmap_write_lock(kvm->mm);
+ r = gmap_helper_disable_cow_sharing();
+ mmap_write_unlock(kvm->mm);
if (r)
break;
@@ -4973,7 +4975,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
* previous protected guest. The old pages need to be destroyed
* so the new guest can use them.
*/
- if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) {
+ if (kvm_s390_pv_destroy_page(vcpu->kvm, gaddr)) {
/*
* Either KVM messed up the secure guest mapping or the
* same page is mapped into multiple secure guests.
@@ -4995,7 +4997,7 @@ static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
* guest has not been imported yet. Try to import the page into
* the protected guest.
*/
- rc = gmap_convert_to_secure(vcpu->arch.gmap, gaddr);
+ rc = kvm_s390_pv_convert_to_secure(vcpu->kvm, gaddr);
if (rc == -EINVAL)
send_sig(SIGSEGV, current, 0);
if (rc != -ENXIO)
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8d3bbb2dd8d2..c44fe0c3a097 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -308,6 +308,9 @@ int kvm_s390_pv_dump_stor_state(struct kvm *kvm, void __user *buff_user,
u64 *gaddr, u64 buff_user_len, u16 *rc, u16 *rrc);
int kvm_s390_pv_dump_complete(struct kvm *kvm, void __user *buff_user,
u16 *rc, u16 *rrc);
+int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr);
+int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr);
+int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb);
static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm)
{
@@ -319,6 +322,41 @@ static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu)
return vcpu->arch.pv.handle;
}
+/**
+ * __kvm_s390_pv_destroy_page() - Destroy a guest page.
+ * @page: the page to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: must be called while holding the mm lock for kvm->mm
+ */
+static inline int __kvm_s390_pv_destroy_page(struct page *page)
+{
+ struct folio *folio = page_folio(page);
+ int rc;
+
+ /* Large folios cannot be secure. Small folio implies FW_LEVEL_PTE. */
+ if (folio_test_large(folio))
+ return -EFAULT;
+
+ rc = uv_destroy_folio(folio);
+ /*
+ * Fault handlers can race; it is possible that two CPUs will fault
+ * on the same secure page. One CPU can destroy the page, reboot,
+ * re-enter secure mode and import it, while the second CPU was
+ * stuck at the beginning of the handler. At some point the second
+ * CPU will be able to progress, and it will not be able to destroy
+ * the page. In that case we do not want to terminate the process,
+ * we instead try to export the page.
+ */
+ if (rc)
+ rc = uv_convert_from_secure_folio(folio);
+
+ return rc;
+}
+
/* implemented in interrupt.c */
int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
@@ -398,6 +436,10 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
void kvm_s390_vsie_init(struct kvm *kvm);
void kvm_s390_vsie_destroy(struct kvm *kvm);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+
+/* implemented in gmap-vsie.c */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 1a49b89706f8..9253c70897a8 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -1248,6 +1248,8 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
static int handle_essa(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_held(&vcpu->kvm->srcu);
+
/* entries expected to be 1FF */
int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
unsigned long *cbrlo;
@@ -1297,12 +1299,8 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* Retry the ESSA instruction */
kvm_s390_retry_instr(vcpu);
} else {
- int srcu_idx;
-
mmap_read_lock(vcpu->kvm->mm);
- srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
i = __do_essa(vcpu, orc);
- srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
mmap_read_unlock(vcpu->kvm->mm);
if (i < 0)
return i;
diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index 22c012aa5206..14c330ec8ceb 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -17,7 +17,6 @@
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
-#include "gmap.h"
bool kvm_s390_pv_is_protected(struct kvm *kvm)
{
@@ -34,6 +33,64 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
/**
+ * kvm_s390_pv_make_secure() - make one guest page secure
+ * @kvm: the guest
+ * @gaddr: the guest address that needs to be made secure
+ * @uvcb: the UVCB specifying which operation needs to be performed
+ *
+ * Context: needs to be called with kvm->srcu held.
+ * Return: 0 on success, < 0 in case of error.
+ */
+int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb)
+{
+ unsigned long vmaddr;
+
+ lockdep_assert_held(&kvm->srcu);
+
+ vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
+ if (kvm_is_error_hva(vmaddr))
+ return -EFAULT;
+ return make_hva_secure(kvm->mm, vmaddr, uvcb);
+}
+
+int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr)
+{
+ struct uv_cb_cts uvcb = {
+ .header.cmd = UVC_CMD_CONV_TO_SEC_STOR,
+ .header.len = sizeof(uvcb),
+ .guest_handle = kvm_s390_pv_get_handle(kvm),
+ .gaddr = gaddr,
+ };
+
+ return kvm_s390_pv_make_secure(kvm, gaddr, &uvcb);
+}
+
+/**
+ * kvm_s390_pv_destroy_page() - Destroy a guest page.
+ * @kvm: the guest
+ * @gaddr: the guest address to destroy
+ *
+ * An attempt will be made to destroy the given guest page. If the attempt
+ * fails, an attempt is made to export the page. If both attempts fail, an
+ * appropriate error is returned.
+ *
+ * Context: may sleep.
+ */
+int kvm_s390_pv_destroy_page(struct kvm *kvm, unsigned long gaddr)
+{
+ struct page *page;
+ int rc = 0;
+
+ mmap_read_lock(kvm->mm);
+ page = gfn_to_page(kvm, gpa_to_gfn(gaddr));
+ if (page)
+ rc = __kvm_s390_pv_destroy_page(page);
+ kvm_release_page_clean(page);
+ mmap_read_unlock(kvm->mm);
+ return rc;
+}
+
+/**
* struct pv_vm_to_be_destroyed - Represents a protected VM that needs to
* be destroyed
*
@@ -638,7 +695,7 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
.tweak[0] = tweak,
.tweak[1] = offset,
};
- int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb);
+ int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb);
unsigned long vmaddr;
bool unlocked;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index a78df3a4f353..13a9661d2b28 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -23,7 +23,6 @@
#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
-#include "gmap.h"
enum vsie_page_flags {
VSIE_PAGE_IN_USE = 0,
@@ -68,6 +67,24 @@ struct vsie_page {
__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
};
+/**
+ * gmap_shadow_valid() - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties; the caller can continue using it. Returns 0 otherwise; the
+ * caller has to request a new shadow gmap in this case.
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+
/* trigger a validity icpt for the given scb */
static int set_validity_icpt(struct kvm_s390_sie_block *scb,
__u16 reason_code)
diff --git a/arch/s390/lib/crypto/Makefile b/arch/s390/lib/crypto/Makefile
index 920197967f46..5df30f1e7930 100644
--- a/arch/s390/lib/crypto/Makefile
+++ b/arch/s390/lib/crypto/Makefile
@@ -3,4 +3,5 @@
obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
chacha_s390-y := chacha-glue.o chacha-s390.o
-obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256.o
+obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o
+sha256-s390-y := sha256.o
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 9726b91fe7e4..bd0401cc7ca5 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -12,3 +12,5 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
obj-$(CONFIG_PFAULT) += pfault.o
+
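+# gmap_helpers.o is used by built-in gmap code as well, so it must be
+# built-in even when KVM itself is modular.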
+obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index d3e943752fa0..ac604b176660 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -147,11 +147,48 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
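+/* Final callback of the walk: emit a level -1 terminator so note_page() prints the last range. */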
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
bool ptdump_check_wx(void)
{
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{.start = 0, .end = max_addr},
{.start = 0, .end = 0},
@@ -190,7 +227,12 @@ static int ptdump_show(struct seq_file *m, void *v)
{
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{.start = 0, .end = max_addr},
{.start = 0, .end = 0},
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index da84ff6770de..3829521450dd 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -40,7 +40,6 @@
#include <asm/ptrace.h>
#include <asm/fault.h>
#include <asm/diag.h>
-#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/facility.h>
#include <asm/uv.h>
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index a94bd4870c65..012a4366a2ad 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -22,9 +22,9 @@
#include <asm/page-states.h>
#include <asm/pgalloc.h>
#include <asm/machine.h>
+#include <asm/gmap_helpers.h>
#include <asm/gmap.h>
#include <asm/page.h>
-#include <asm/tlb.h>
/*
* The address is saved in a radix tree directly; NULL would be ambiguous,
@@ -620,63 +620,20 @@ EXPORT_SYMBOL(__gmap_link);
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
- struct vm_area_struct *vma;
unsigned long vmaddr;
- spinlock_t *ptl;
- pte_t *ptep;
+
+ mmap_assert_locked(gmap->mm);
/* Find the vm address for the guest address */
vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
gaddr >> PMD_SHIFT);
if (vmaddr) {
vmaddr |= gaddr & ~PMD_MASK;
-
- vma = vma_lookup(gmap->mm, vmaddr);
- if (!vma || is_vm_hugetlb_page(vma))
- return;
-
- /* Get pointer to the page table entry */
- ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep)) {
- ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
- }
+ gmap_helper_zap_one_page(gmap->mm, vmaddr);
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
-void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
-{
- unsigned long gaddr, vmaddr, size;
- struct vm_area_struct *vma;
-
- mmap_read_lock(gmap->mm);
- for (gaddr = from; gaddr < to;
- gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
- /* Find the vm address for the guest address */
- vmaddr = (unsigned long)
- radix_tree_lookup(&gmap->guest_to_host,
- gaddr >> PMD_SHIFT);
- if (!vmaddr)
- continue;
- vmaddr |= gaddr & ~PMD_MASK;
- /* Find vma in the parent mm */
- vma = find_vma(gmap->mm, vmaddr);
- if (!vma)
- continue;
- /*
- * We do not discard pages that are backed by
- * hugetlbfs, so we don't have to refault them.
- */
- if (is_vm_hugetlb_page(vma))
- continue;
- size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
- zap_page_range_single(vma, vmaddr, size, NULL);
- }
- mmap_read_unlock(gmap->mm);
-}
-EXPORT_SYMBOL_GPL(gmap_discard);
-
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
@@ -2269,138 +2226,6 @@ int s390_enable_sie(void)
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
-static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
-{
- unsigned long *found_addr = walk->private;
-
- /* Return 1 of the page is a zeropage. */
- if (is_zero_pfn(pte_pfn(*pte))) {
- /*
- * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
- * right thing and likely don't care: FAULT_FLAG_UNSHARE
- * currently only works in COW mappings, which is also where
- * mm_forbids_zeropage() is checked.
- */
- if (!is_cow_mapping(walk->vma->vm_flags))
- return -EFAULT;
-
- *found_addr = addr;
- return 1;
- }
- return 0;
-}
-
-static const struct mm_walk_ops find_zeropage_ops = {
- .pte_entry = find_zeropage_pte_entry,
- .walk_lock = PGWALK_WRLOCK,
-};
-
-/*
- * Unshare all shared zeropages, replacing them by anonymous pages. Note that
- * we cannot simply zap all shared zeropages, because this could later
- * trigger unexpected userfaultfd missing events.
- *
- * This must be called after mm->context.allow_cow_sharing was
- * set to 0, to avoid future mappings of shared zeropages.
- *
- * mm contracts with s390, that even if mm were to remove a page table,
- * and racing with walk_page_range_vma() calling pte_offset_map_lock()
- * would fail, it will never insert a page table containing empty zero
- * pages once mm_forbids_zeropage(mm) i.e.
- * mm->context.allow_cow_sharing is set to 0.
- */
-static int __s390_unshare_zeropages(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
- unsigned long addr;
- vm_fault_t fault;
- int rc;
-
- for_each_vma(vmi, vma) {
- /*
- * We could only look at COW mappings, but it's more future
- * proof to catch unexpected zeropages in other mappings and
- * fail.
- */
- if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
- continue;
- addr = vma->vm_start;
-
-retry:
- rc = walk_page_range_vma(vma, addr, vma->vm_end,
- &find_zeropage_ops, &addr);
- if (rc < 0)
- return rc;
- else if (!rc)
- continue;
-
- /* addr was updated by find_zeropage_pte_entry() */
- fault = handle_mm_fault(vma, addr,
- FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
- NULL);
- if (fault & VM_FAULT_OOM)
- return -ENOMEM;
- /*
- * See break_ksm(): even after handle_mm_fault() returned 0, we
- * must start the lookup from the current address, because
- * handle_mm_fault() may back out if there's any difficulty.
- *
- * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
- * maybe they could trigger in the future on concurrent
- * truncation. In that case, the shared zeropage would be gone
- * and we can simply retry and make progress.
- */
- cond_resched();
- goto retry;
- }
-
- return 0;
-}
-
-static int __s390_disable_cow_sharing(struct mm_struct *mm)
-{
- int rc;
-
- if (!mm->context.allow_cow_sharing)
- return 0;
-
- mm->context.allow_cow_sharing = 0;
-
- /* Replace all shared zeropages by anonymous pages. */
- rc = __s390_unshare_zeropages(mm);
- /*
- * Make sure to disable KSM (if enabled for the whole process or
- * individual VMAs). Note that nothing currently hinders user space
- * from re-enabling it.
- */
- if (!rc)
- rc = ksm_disable(mm);
- if (rc)
- mm->context.allow_cow_sharing = 1;
- return rc;
-}
-
-/*
- * Disable most COW-sharing of memory pages for the whole process:
- * (1) Disable KSM and unmerge/unshare any KSM pages.
- * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
- *
- * Not that we currently don't bother with COW-shared pages that are shared
- * with parent/child processes due to fork().
- */
-int s390_disable_cow_sharing(void)
-{
- int rc;
-
- mmap_write_lock(current->mm);
- rc = __s390_disable_cow_sharing(current->mm);
- mmap_write_unlock(current->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
-
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2468,7 +2293,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
- rc = __s390_disable_cow_sharing(mm);
+ rc = gmap_helper_disable_cow_sharing();
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
new file mode 100644
index 000000000000..a45d417ad951
--- /dev/null
+++ b/arch/s390/mm/gmap_helpers.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helper functions for KVM guest address space mapping code
+ *
+ * Copyright IBM Corp. 2007, 2025
+ */
+#include <linux/mm_types.h>
+#include <linux/mmap_lock.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagewalk.h>
+#include <linux/ksm.h>
+#include <asm/gmap_helpers.h>
+
+/**
+ * ptep_zap_swap_entry() - discard a swap entry.
+ * @mm: the mm
+ * @entry: the swap entry that needs to be zapped
+ *
+ * Discards the given swap entry. If the swap entry was an actual swap
+ * entry (and not a migration entry, for example), the actual swapped
+ * page is also discarded from swap.
+ */
+static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
+{
+ if (!non_swap_entry(entry))
+ dec_mm_counter(mm, MM_SWAPENTS);
+ else if (is_migration_entry(entry))
+ dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
+ free_swap_and_cache(entry);
+}
+
+/**
+ * gmap_helper_zap_one_page() - discard a page if it was swapped.
+ * @mm: the mm
+ * @vmaddr: the userspace virtual address that needs to be discarded
+ *
+ * If the given address maps to a swap entry, discard it.
+ *
+ * Context: needs to be called while holding the mmap lock.
+ */
+void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ mmap_assert_locked(mm);
+
+ /* Find the VMA for the given userspace address */
+ vma = vma_lookup(mm, vmaddr);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return;
+
+ /* Get pointer to the page table entry */
+ ptep = get_locked_pte(mm, vmaddr, &ptl);
+ if (unlikely(!ptep))
+ return;
+ if (pte_swap(*ptep))
+ ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
+ pte_unmap_unlock(ptep, ptl);
+}
+EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);
+
+/**
+ * gmap_helper_discard() - discard user pages in the given range
+ * @mm: the mm
+ * @vmaddr: starting userspace address
+ * @end: end address (first address outside the range)
+ *
+ * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
+ *
+ * Context: needs to be called while holding the mmap lock.
+ */
+void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+
+ while (vmaddr < end) {
+ vma = find_vma_intersection(mm, vmaddr, end);
+ if (!vma)
+ return;
+ if (!is_vm_hugetlb_page(vma))
+ zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
+ vmaddr = vma->vm_end;
+ }
+}
+EXPORT_SYMBOL_GPL(gmap_helper_discard);
+
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ unsigned long *found_addr = walk->private;
+
+ /* Return 1 if the page is a zeropage. */
+ if (is_zero_pfn(pte_pfn(*pte))) {
+ /*
+ * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+ * right thing and likely don't care: FAULT_FLAG_UNSHARE
+ * currently only works in COW mappings, which is also where
+ * mm_forbids_zeropage() is checked.
+ */
+ if (!is_cow_mapping(walk->vma->vm_flags))
+ return -EFAULT;
+
+ *found_addr = addr;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+ .pte_entry = find_zeropage_pte_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
+/**
+ * __gmap_helper_unshare_zeropages() - unshare all shared zeropages
+ * @mm: the mm whose zero pages are to be unshared
+ *
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * The core mm contracts with s390 that, even if mm were to remove a
+ * page table and a racing walk_page_range_vma() calling
+ * pte_offset_map_lock() would fail, it will never insert a page table
+ * containing empty zero pages once mm_forbids_zeropage(mm) is true,
+ * i.e. once mm->context.allow_cow_sharing is set to 0.
+ */
+static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+ unsigned long addr;
+ vm_fault_t fault;
+ int rc;
+
+ for_each_vma(vmi, vma) {
+ /*
+ * We could only look at COW mappings, but it's more future
+ * proof to catch unexpected zeropages in other mappings and
+ * fail.
+ */
+ if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+ continue;
+ addr = vma->vm_start;
+
+retry:
+ rc = walk_page_range_vma(vma, addr, vma->vm_end,
+ &find_zeropage_ops, &addr);
+ if (rc < 0)
+ return rc;
+ else if (!rc)
+ continue;
+
+ /* addr was updated by find_zeropage_pte_entry() */
+ fault = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ if (fault & VM_FAULT_OOM)
+ return -ENOMEM;
+ /*
+ * See break_ksm(): even after handle_mm_fault() returned 0, we
+ * must start the lookup from the current address, because
+ * handle_mm_fault() may back out if there's any difficulty.
+ *
+ * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+ * maybe they could trigger in the future on concurrent
+ * truncation. In that case, the shared zeropage would be gone
+ * and we can simply retry and make progress.
+ */
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+/**
+ * gmap_helper_disable_cow_sharing() - disable all COW sharing
+ *
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
+ *
+ * Note that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int gmap_helper_disable_cow_sharing(void)
+{
+ struct mm_struct *mm = current->mm;
+ int rc;
+
+ mmap_assert_write_locked(mm);
+
+ if (!mm->context.allow_cow_sharing)
+ return 0;
+
+ mm->context.allow_cow_sharing = 0;
+
+ /* Replace all shared zeropages by anonymous pages. */
+ rc = __gmap_helper_unshare_zeropages(mm);
+ /*
+ * Make sure to disable KSM (if enabled for the whole process or
+ * individual VMAs). Note that nothing currently hinders user space
+ * from re-enabling it.
+ */
+ if (!rc)
+ rc = ksm_disable(mm);
+ if (rc)
+ mm->context.allow_cow_sharing = 1;
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index afa085e8186c..074bf4fb4ce2 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -40,7 +40,6 @@
#include <asm/kfence.h>
#include <asm/dma.h>
#include <asm/abs_lowcore.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/sclp.h>
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index d177bea0bd73..b449fd2605b0 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -12,8 +12,6 @@
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
-#include <asm/gmap.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
unsigned long *crst_table_alloc(struct mm_struct *mm)
@@ -144,7 +142,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
ptdesc = pagetable_alloc(GFP_KERNEL, 0);
if (!ptdesc)
return NULL;
- if (!pagetable_pte_ctor(ptdesc)) {
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 9901934284ec..7df70cd8f739 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -20,7 +20,6 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h
index f939f1215232..db2e48366e0d 100644
--- a/arch/sh/include/asm/pgtable_32.h
+++ b/arch/sh/include/asm/pgtable_32.h
@@ -380,14 +380,6 @@ PTE_BIT_FUNC(low, mkspecial, |= _PAGE_SPECIAL);
#define pgprot_noncached pgprot_writecombine
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * extern pte_t mk_pte(struct page *page, pgprot_t pgprot)
- */
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
-
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pte.pte_low &= _PAGE_CHG_MASK;
diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h
index d87738eebe30..7027d87d901d 100644
--- a/arch/sh/include/asm/syscall_32.h
+++ b/arch/sh/include/asm/syscall_32.h
@@ -15,6 +15,18 @@ static inline long syscall_get_nr(struct task_struct *task,
return (regs->tra >= 0) ? regs->regs[3] : -1L;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ /*
+ * Unlike syscall_get_nr(), syscall_set_nr() can be called only when
+ * the target task is stopped for tracing on entering syscall, so
+ * there is no need to have the same check syscall_get_nr() has.
+ */
+ regs->regs[3] = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -57,6 +69,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[0] = regs->regs[4];
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
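+ /* Mirror syscall_get_arguments(): arguments live in r4-r7, r0 and r1. */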
+ regs->regs[1] = args[5];
+ regs->regs[0] = args[4];
+ regs->regs[7] = args[3];
+ regs->regs[6] = args[2];
+ regs->regs[5] = args[1];
+ regs->regs[4] = args[0];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
int arch = AUDIT_ARCH_SH;
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 62bcafe38b1f..1454ebe91539 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -255,7 +255,11 @@ static inline pte_t pte_mkyoung(pte_t pte)
}
#define PFN_PTE_SHIFT (PAGE_SHIFT - 4)
-#define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot)
+
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+ return __pte((pfn << PFN_PTE_SHIFT) | pgprot_val(pgprot));
+}
static inline unsigned long pte_pfn(pte_t pte)
{
@@ -272,15 +276,6 @@ static inline unsigned long pte_pfn(pte_t pte)
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
-{
- return __pte((page_to_pfn(page) << (PAGE_SHIFT-4)) | pgprot_val(pgprot));
-}
-
static inline pte_t mk_pte_phys(unsigned long page, pgprot_t pgprot)
{
return __pte(((page) >> 4) | pgprot_val(pgprot));
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index dc28f2c4eee3..4af03e3c161b 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -225,7 +225,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot)
BUILD_BUG_ON(_PAGE_SZBITS_4U != 0UL || _PAGE_SZBITS_4V != 0UL);
return __pte(paddr | pgprot_val(prot));
}
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
@@ -234,7 +233,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
return __pmd(pte_val(pte));
}
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
#endif
/* This one can be done with two shifts. */
diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h
index 20c109ac8cc9..b0233924d323 100644
--- a/arch/sparc/include/asm/syscall.h
+++ b/arch/sparc/include/asm/syscall.h
@@ -25,6 +25,18 @@ static inline long syscall_get_nr(struct task_struct *task,
return (syscall_p ? regs->u_regs[UREG_G1] : -1L);
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ /*
+ * Unlike syscall_get_nr(), syscall_set_nr() can be called only when
+ * the target task is stopped for tracing on entering syscall, so
+ * there is no need to have the same check syscall_get_nr() has.
+ */
+ regs->u_regs[UREG_G1] = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -117,6 +129,16 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ unsigned int i;
+
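+ /* Mirror syscall_get_arguments(): all six arguments live in u_regs[UREG_I0..UREG_I5]. */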
+ for (i = 0; i < 6; i++)
+ regs->u_regs[UREG_I0 + i] = args[i];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
#if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT)
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 760818950464..25ae4c897aae 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2878,33 +2878,27 @@ void __flush_tlb_all(void)
: : "r" (pstate));
}
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
- struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- pte_t *pte = NULL;
-
- if (page)
- pte = (pte_t *) page_address(page);
-
- return pte;
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm)
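+/* Shared allocation path for kernel and user PTE pages. */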
+static pte_t *__pte_alloc_one(struct mm_struct *mm)
{
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);
if (!ptdesc)
return NULL;
- if (!pagetable_pte_ctor(ptdesc)) {
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
return ptdesc_address(ptdesc);
}
-void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
- free_page((unsigned long)pte);
+ return __pte_alloc_one(mm);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm)
+{
+ return __pte_alloc_one(mm);
}
static void __pte_free(pgtable_t pte)
@@ -2915,6 +2909,11 @@ static void __pte_free(pgtable_t pte)
pagetable_free(ptdesc);
}
+void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ __pte_free(pte);
+}
+
void pte_free(struct mm_struct *mm, pgtable_t pte)
{
__pte_free(pte);
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index dd32711022f5..f8fb4911d360 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -350,7 +350,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm)
page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT);
spin_lock(&mm->page_table_lock);
if (page_ref_inc_return(page) == 2 &&
- !pagetable_pte_ctor(page_ptdesc(page))) {
+ !pagetable_pte_ctor(mm, page_ptdesc(page))) {
page_ref_dec(page);
ptep = NULL;
}
diff --git a/arch/um/include/asm/pgtable-2level.h b/arch/um/include/asm/pgtable-2level.h
index ab0c8dd86564..14ec16f92ce4 100644
--- a/arch/um/include/asm/pgtable-2level.h
+++ b/arch/um/include/asm/pgtable-2level.h
@@ -37,7 +37,6 @@ static inline void pgd_mkuptodate(pgd_t pgd) { }
#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
#define pte_pfn(x) phys_to_pfn(pte_val(x))
-#define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot))
#define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot))
#endif
diff --git a/arch/um/include/asm/pgtable-4level.h b/arch/um/include/asm/pgtable-4level.h
index 0d279caee93c..7a271b7b83d2 100644
--- a/arch/um/include/asm/pgtable-4level.h
+++ b/arch/um/include/asm/pgtable-4level.h
@@ -102,15 +102,6 @@ static inline unsigned long pte_pfn(pte_t pte)
return phys_to_pfn(pte_val(pte));
}
-static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
-{
- pte_t pte;
- phys_t phys = pfn_to_phys(page_nr);
-
- pte_set_val(pte, phys, pgprot);
- return pte;
-}
-
static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
{
return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 5601ca98e8a6..ca2a519d53ab 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -260,19 +260,17 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b)
return !((pte_val(pte_a) ^ pte_val(pte_b)) & ~_PAGE_NEEDSYNC);
}
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-
#define __virt_to_page(virt) phys_to_page(__pa(virt))
#define virt_to_page(addr) __virt_to_page((const unsigned long) addr)
-#define mk_pte(page, pgprot) \
- ({ pte_t pte; \
- \
- pte_set_val(pte, page_to_phys(page), (pgprot)); \
- pte;})
+static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
+{
+ pte_t pte;
+
+ pte_set_val(pte, pfn_to_phys(pfn), pgprot);
+
+ return pte;
+}
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h
index 172b74143c4b..bcd73bcfe577 100644
--- a/arch/um/include/asm/syscall-generic.h
+++ b/arch/um/include/asm/syscall-generic.h
@@ -21,6 +21,11 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return PT_REGS_SYSCALL_NR(regs);
}
+static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr)
+{
+ PT_REGS_SYSCALL_NR(regs) = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -62,6 +67,20 @@ static inline void syscall_get_arguments(struct task_struct *task,
*args = UPT_SYSCALL_ARG6(r);
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ struct uml_pt_regs *r = &regs->regs;
+
+ UPT_SYSCALL_ARG1(r) = *args++;
+ UPT_SYSCALL_ARG2(r) = *args++;
+ UPT_SYSCALL_ARG3(r) = *args++;
+ UPT_SYSCALL_ARG4(r) = *args++;
+ UPT_SYSCALL_ARG5(r) = *args++;
+ UPT_SYSCALL_ARG6(r) = *args;
+}
+
/* See arch/x86/um/asm/syscall.h for syscall_get_arch() definition. */
#endif /* __UM_SYSCALL_GENERIC_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ae1654280c40..340e5468980e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2005,6 +2005,9 @@ config ARCH_SUPPORTS_KEXEC_BZIMAGE_VERIFY_SIG
config ARCH_SUPPORTS_KEXEC_JUMP
def_bool y
+config ARCH_SUPPORTS_KEXEC_HANDOVER
+ def_bool X86_64
+
config ARCH_SUPPORTS_CRASH_DUMP
def_bool X86_64 || (X86_32 && HIGHMEM)
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index f03d59ea6e40..3b0948ad449f 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -760,6 +760,49 @@ static void process_e820_entries(unsigned long minimum,
}
}
+/*
+ * If KHO is active, only process its scratch areas to ensure we are not
+ * stepping onto preserved memory.
+ */
+static bool process_kho_entries(unsigned long minimum, unsigned long image_size)
+{
+ struct kho_scratch *kho_scratch;
+ struct setup_data *ptr;
+ struct kho_data *kho;
+ int i, nr_areas = 0;
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return false;
+
+ ptr = (struct setup_data *)(unsigned long)boot_params_ptr->hdr.setup_data;
+ while (ptr) {
+ if (ptr->type == SETUP_KEXEC_KHO) {
+ kho = (struct kho_data *)(unsigned long)ptr->data;
+ kho_scratch = (void *)(unsigned long)kho->scratch_addr;
+ nr_areas = kho->scratch_size / sizeof(*kho_scratch);
+ break;
+ }
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
+ if (!nr_areas)
+ return false;
+
+ for (i = 0; i < nr_areas; i++) {
+ struct kho_scratch *area = &kho_scratch[i];
+ struct mem_vector region = {
+ .start = area->addr,
+ .size = area->size,
+ };
+
+ if (process_mem_region(&region, minimum, image_size))
+ break;
+ }
+
+ return true;
+}
+
static unsigned long find_random_phys_addr(unsigned long minimum,
unsigned long image_size)
{
@@ -775,7 +818,12 @@ static unsigned long find_random_phys_addr(unsigned long minimum,
return 0;
}
- if (!process_efi_entries(minimum, image_size))
+ /*
+ * During kexec handover only process KHO scratch areas that are known
+ * not to contain any data that must be preserved.
+ */
+ if (!process_kho_entries(minimum, image_size) &&
+ !process_efi_entries(minimum, image_size))
process_e820_entries(minimum, image_size);
phys_addr = slots_fetch_random();
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index fbc1215d2746..b6db4e0b936b 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -869,12 +869,12 @@ static void *snp_alloc_vmsa_page(int cpu)
return page_address(p + 1);
}
-static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
+static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip, unsigned int cpu)
{
struct sev_es_save_area *cur_vmsa, *vmsa;
struct svsm_ca *caa;
u8 sipi_vector;
- int cpu, ret;
+ int ret;
u64 cr4;
/*
@@ -895,15 +895,6 @@ static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
/* Override start_ip with known protected guest start IP */
start_ip = real_mode_header->sev_es_trampoline_start;
-
- /* Find the logical CPU for the APIC ID */
- for_each_present_cpu(cpu) {
- if (arch_match_cpu_phys_id(cpu, apic_id))
- break;
- }
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
-
cur_vmsa = per_cpu(sev_vmsa, cpu);
/*
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 5d27194a2efa..3d1d3547095a 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -391,40 +391,6 @@ static void __init hv_stimer_setup_percpu_clockev(void)
old_setup_percpu_clockev();
}
-#if IS_ENABLED(CONFIG_HYPERV_VTL_MODE)
-static u8 __init get_vtl(void)
-{
- u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
- struct hv_input_get_vp_registers *input;
- struct hv_output_get_vp_registers *output;
- unsigned long flags;
- u64 ret;
-
- local_irq_save(flags);
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- output = *this_cpu_ptr(hyperv_pcpu_output_arg);
-
- memset(input, 0, struct_size(input, names, 1));
- input->partition_id = HV_PARTITION_ID_SELF;
- input->vp_index = HV_VP_INDEX_SELF;
- input->input_vtl.as_uint8 = 0;
- input->names[0] = HV_REGISTER_VSM_VP_STATUS;
-
- ret = hv_do_hypercall(control, input, output);
- if (hv_result_success(ret)) {
- ret = output->values[0].reg8 & HV_X64_VTL_MASK;
- } else {
- pr_err("Failed to get VTL(error: %lld) exiting...\n", ret);
- BUG();
- }
-
- local_irq_restore(flags);
- return ret;
-}
-#else
-static inline u8 get_vtl(void) { return 0; }
-#endif
-
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -707,3 +673,36 @@ bool hv_is_hyperv_initialized(void)
return hypercall_msr.enable;
}
EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
+
+int hv_apicid_to_vp_index(u32 apic_id)
+{
+ u64 control;
+ u64 status;
+ unsigned long irq_flags;
+ struct hv_get_vp_from_apic_id_in *input;
+ u32 *output, ret;
+
+ local_irq_save(irq_flags);
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->partition_id = HV_PARTITION_ID_SELF;
+ input->apic_ids[0] = apic_id;
+
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_INDEX_FROM_APIC_ID;
+ status = hv_do_hypercall(control, input, output);
+ ret = output[0];
+
+ local_irq_restore(irq_flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("failed to get vp index from apic id %d, status %#llx\n",
+ apic_id, status);
+ return -EINVAL;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index);
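A hypothetical caller sketch for the newly exported helper, mirroring how the VTL and SEV-SNP AP bringup paths below use it (hv_start_ap() is an invented name for illustration):

/* Translate an APIC ID to a Hyper-V VP index and validate it before issuing a
 * per-VP hypercall; a negative return means the translation hypercall failed. */
static int hv_start_ap(u32 apic_id)
{
	int vp_index = hv_apicid_to_vp_index(apic_id);

	if (vp_index < 0)
		return vp_index;
	if (vp_index > ms_hyperv.max_vp_index)
		return -EINVAL;

	/* ... program the start-VP hypercall with vp_index ... */
	return 0;
}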
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 4580936dcb03..042e8712d8de 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -56,7 +56,12 @@ static void __noreturn hv_vtl_restart(char __maybe_unused *cmd)
void __init hv_vtl_init_platform(void)
{
- pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
+ /*
+ * This function is a no-op if the VTL mode is not enabled.
+ * If it is, this function runs if and only if the kernel boots in
+ * VTL2, which the x86 hv initialization path makes sure of.
+ */
+ pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl);
x86_platform.realmode_reserve = x86_init_noop;
x86_platform.realmode_init = x86_init_noop;
@@ -207,63 +212,23 @@ free_lock:
return ret;
}
-static int hv_vtl_apicid_to_vp_id(u32 apic_id)
-{
- u64 control;
- u64 status;
- unsigned long irq_flags;
- struct hv_get_vp_from_apic_id_in *input;
- u32 *output, ret;
-
- local_irq_save(irq_flags);
-
- input = *this_cpu_ptr(hyperv_pcpu_input_arg);
- memset(input, 0, sizeof(*input));
- input->partition_id = HV_PARTITION_ID_SELF;
- input->apic_ids[0] = apic_id;
-
- output = *this_cpu_ptr(hyperv_pcpu_output_arg);
-
- control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
- status = hv_do_hypercall(control, input, output);
- ret = output[0];
-
- local_irq_restore(irq_flags);
-
- if (!hv_result_success(status)) {
- pr_err("failed to get vp id from apic id %d, status %#llx\n",
- apic_id, status);
- return -EINVAL;
- }
-
- return ret;
-}
-
-static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip)
+static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu)
{
- int vp_id, cpu;
-
- /* Find the logical CPU for the APIC ID */
- for_each_present_cpu(cpu) {
- if (arch_match_cpu_phys_id(cpu, apicid))
- break;
- }
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
+ int vp_index;
pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
- vp_id = hv_vtl_apicid_to_vp_id(apicid);
+ vp_index = hv_apicid_to_vp_index(apicid);
- if (vp_id < 0) {
+ if (vp_index < 0) {
pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
return -EINVAL;
}
- if (vp_id > ms_hyperv.max_vp_index) {
- pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
+ if (vp_index > ms_hyperv.max_vp_index) {
+ pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid);
return -EINVAL;
}
- return hv_vtl_bringup_vcpu(vp_id, cpu, start_eip);
+ return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip);
}
int __init hv_vtl_early_init(void)
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 09a165a3c41e..e93a2f488ff7 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -9,6 +9,7 @@
#include <linux/bitfield.h>
#include <linux/types.h>
#include <linux/slab.h>
+#include <linux/cpu.h>
#include <asm/svm.h>
#include <asm/sev.h>
#include <asm/io.h>
@@ -289,7 +290,7 @@ static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
free_page((unsigned long)vmsa);
}
-int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu)
{
struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
__get_free_page(GFP_KERNEL | __GFP_ZERO);
@@ -298,10 +299,16 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
u64 ret, retry = 5;
struct hv_enable_vp_vtl *start_vp_input;
unsigned long flags;
+ int vp_index;
if (!vmsa)
return -ENOMEM;
+ /* Find the Hyper-V VP index, which might not be the same as the APIC ID */
+ vp_index = hv_apicid_to_vp_index(apic_id);
+ if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index)
+ return -EINVAL;
+
native_store_gdt(&gdtr);
vmsa->gdtr.base = gdtr.address;
@@ -349,7 +356,7 @@ int hv_snp_boot_ap(u32 cpu, unsigned long start_ip)
start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
memset(start_vp_input, 0, sizeof(*start_vp_input));
start_vp_input->partition_id = -1;
- start_vp_input->vp_index = cpu;
+ start_vp_input->vp_index = vp_index;
start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
*(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 68e10e30fe9b..23d86c9750b9 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,9 +313,9 @@ struct apic {
u32 (*get_apic_id)(u32 id);
/* wakeup_secondary_cpu */
- int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu);
/* wakeup secondary CPU using 64-bit wakeup point */
- int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
char *name;
};
@@ -333,8 +333,8 @@ struct apic_override {
void (*send_IPI_self)(int vector);
u64 (*icr_read)(void);
void (*icr_write)(u32 low, u32 high);
- int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip);
- int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip);
+ int (*wakeup_secondary_cpu)(u32 apicid, unsigned long start_eip, unsigned int cpu);
+ int (*wakeup_secondary_cpu_64)(u32 apicid, unsigned long start_eip, unsigned int cpu);
};
/*
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 5b50e0e35129..ee176236c2be 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -336,6 +336,7 @@
#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
#define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */
+#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection */
#define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */
#define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */
@@ -378,6 +379,7 @@
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */
+#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */
#define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
@@ -446,6 +448,7 @@
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
+#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
#define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */
@@ -457,6 +460,7 @@
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
+#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */
#define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
#define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
#define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 79406bf07a1c..8d50e3e0a19b 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -127,7 +127,7 @@ KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP_OPTIONAL(dev_get_attr)
-KVM_X86_OP(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 67b464651c8d..b4a391929cdb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -126,7 +126,8 @@
KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_TLB_FLUSH \
KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
-#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE KVM_ARCH_REQ(34)
+#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE \
+ KVM_ARCH_REQ_FLAGS(34, KVM_REQUEST_WAIT)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -412,7 +413,6 @@ struct kvm_rmap_head {
};
struct kvm_pio_request {
- unsigned long linear_rip;
unsigned long count;
int in;
int port;
@@ -918,6 +918,7 @@ struct kvm_vcpu_arch {
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
+ unsigned long cui_linear_rip;
gpa_t time;
s8 pvclock_tsc_shift;
@@ -1035,6 +1036,7 @@ struct kvm_vcpu_arch {
int pending_ioapic_eoi;
int pending_external_vector;
+ int highest_stale_pending_ioapic_eoi;
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
@@ -1942,6 +1944,7 @@ struct kvm_arch_async_pf {
extern u32 __read_mostly kvm_nr_uret_msrs;
extern bool __read_mostly allow_smaller_maxphyaddr;
extern bool __read_mostly enable_apicv;
+extern bool __read_mostly enable_device_posted_irqs;
extern struct kvm_x86_ops kvm_x86_ops;
#define kvm_x86_call(func) static_call(kvm_x86_##func)
@@ -2445,7 +2448,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
static inline bool kvm_arch_has_irq_bypass(void)
{
- return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
+ return enable_device_posted_irqs;
}
#endif /* _ASM_X86_KVM_HOST_H */
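The new enable_device_posted_irqs flag caches the condition that kvm_arch_has_irq_bypass() used to evaluate on every call. It is expected to be computed once during KVM's x86 setup, roughly as follows (a sketch; the exact assignment site is not part of this hunk):

/* Sketch: fold the old open-coded check into a single module-level flag so
 * kvm_arch_has_irq_bypass() no longer queries the IOMMU each time. */
enable_device_posted_irqs = enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);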
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 778444310cfb..e1752ba47e67 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -269,11 +269,12 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-int hv_snp_boot_ap(u32 cpu, unsigned long start_ip);
+int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu);
#else
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline int hv_snp_boot_ap(u32 cpu, unsigned long start_ip) { return 0; }
+static inline int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip,
+ unsigned int cpu) { return 0; }
#endif
#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
@@ -307,6 +308,7 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
{
return native_rdmsrq(reg);
}
+int hv_apicid_to_vp_index(u32 apic_id);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
@@ -328,6 +330,7 @@ static inline void hv_set_msr(unsigned int reg, u64 value) { }
static inline u64 hv_get_msr(unsigned int reg) { return 0; }
static inline void hv_set_non_nested_msr(unsigned int reg, u64 value) { }
static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
+static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 4096b8af4ba7..9c2ea29e12a9 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -228,7 +228,7 @@ static __always_inline u64 rdpmc(int counter)
#endif /* !CONFIG_PARAVIRT_XXL */
/* Instruction opcode for WRMSRNS supported in binutils >= 2.40 */
-#define WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
+#define ASM_WRMSRNS _ASM_BYTES(0x0f,0x01,0xc6)
/* Non-serializing WRMSR, when available. Falls back to a serializing WRMSR. */
static __always_inline void wrmsrns(u32 msr, u64 val)
@@ -237,7 +237,7 @@ static __always_inline void wrmsrns(u32 msr, u64 val)
* WRMSR is 2 bytes. WRMSRNS is 3 bytes. Pad WRMSR with a redundant
* DS prefix to avoid a trailing NOP.
*/
- asm volatile("1: " ALTERNATIVE("ds wrmsr", WRMSRNS, X86_FEATURE_WRMSRNS)
+ asm volatile("1: " ALTERNATIVE("ds wrmsr", ASM_WRMSRNS, X86_FEATURE_WRMSRNS)
"2: " _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR)
: : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32)));
}
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..774430c3abff 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -777,6 +777,9 @@ static inline pgprotval_t check_pgprot(pgprot_t pgprot)
static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
+ /* This bit combination is used to mark shadow stacks */
+ WARN_ON_ONCE((pgprot_val(pgprot) & (_PAGE_DIRTY | _PAGE_RW)) ==
+ _PAGE_DIRTY);
pfn ^= protnone_mask(pgprot_val(pgprot));
pfn &= PTE_PFN_MASK;
return __pte(pfn | check_pgprot(pgprot));
@@ -1073,22 +1076,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
*/
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- *
- * (Currently stuck as a macro because of indirect forward reference
- * to linux/mm.h:page_to_nid())
- */
-#define mk_pte(page, pgprot) \
-({ \
- pgprot_t __pgprot = pgprot; \
- \
- WARN_ON_ONCE((pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
- _PAGE_DIRTY); \
- pfn_pte(page_to_pfn(page), __pgprot); \
-})
-
static inline int pmd_bad(pmd_t pmd)
{
return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
@@ -1353,8 +1340,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
-#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
-
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
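The mk_pte()/mk_pmd() macros removed above are covered by generic fallbacks that reduce to pfn_pte()/pfn_pmd(), which is why the shadow-stack WARN moves into x86's pfn_pte(). Roughly, the generic definition callers now get is (a sketch):

/* Generic fallback (sketch): mk_pte() becomes pfn_pte() on the page's PFN,
 * making the arch-specific copies of the macro redundant. */
static inline pte_t mk_pte(struct page *page, pgprot_t pgprot)
{
	return pfn_pte(page_to_pfn(page), pgprot);
}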
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
index bb107ebbe713..a5f761fbf45b 100644
--- a/arch/x86/include/asm/posted_intr.h
+++ b/arch/x86/include/asm/posted_intr.h
@@ -1,19 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_POSTED_INTR_H
#define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
#include <asm/irq_vectors.h>
+#include <linux/bitmap.h>
+
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
#define PID_TABLE_ENTRY_VALID 1
+#define NR_PIR_VECTORS 256
+#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG)
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
- union {
- u32 pir[8]; /* Posted interrupt requested */
- u64 pir64[4];
- };
+ unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */
union {
struct {
u16 notifications; /* Suppress and outstanding bits */
@@ -26,6 +31,65 @@ struct pi_desc {
u32 rsvd[6];
} __aligned(64);
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ * natural width read and xchg for checking and clearing posted interrupt
+ * request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line an order of magnitude faster than the
+ *   IOMMU.
+ * 5.Each time the IOMMU posts an interrupt to the PIR, it evicts the PID
+ *   cache line. The cache line states after each operation are as follows,
+ *   assuming a 64-bit kernel:
+ *   CPU              IOMMU               PID Cache line state
+ *   ---------------------------------------------------------------
+ *   read64                               exclusive
+ *   lock xchg64                          modified
+ *                    post/atomic swap    invalid
+ *   ---------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+ unsigned long *pir_vals)
+{
+ unsigned long pending = 0;
+ int i;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ pir_vals[i] = READ_ONCE(pir[i]);
+ pending |= pir_vals[i];
+ }
+
+ if (!pending)
+ return false;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ if (!pir_vals[i])
+ continue;
+
+ pir_vals[i] = arch_xchg(&pir[i], 0);
+ }
+
+ return true;
+}
+
static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+ return test_and_set_bit(vector, pi_desc->pir);
}
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+ return bitmap_empty(pi_desc->pir, NR_VECTORS);
}
static inline void pi_set_sn(struct pi_desc *pi_desc)
@@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
return false;
- return test_bit(vector, (unsigned long *)pid->pir);
+ return test_bit(vector, pid->pir);
}
extern void intel_posted_msi_init(void);
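A minimal consumer sketch of pi_harvest_pir(), following the read-everything-then-xchg pattern the comment above recommends (the dispatch callback is hypothetical; the real users are the posted-MSI demux in irq.c and KVM's IRR update, both converted later in this series):

/* Harvest the PIR into a local copy in one pass, then walk the copy so the hot
 * PID cache line is not touched again while dispatching. */
static void demux_posted_vectors(struct pi_desc *pid,
				 void (*dispatch)(unsigned int vector))
{
	unsigned long pir_copy[NR_PIR_WORDS];
	unsigned int vec;

	if (!pi_harvest_pir(pid->pir, pir_copy))
		return;		/* nothing was pending */

	for_each_set_bit(vec, pir_copy, NR_PIR_VECTORS)
		dispatch(vec);
}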
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 8d9f1c9aaa4c..61f56cdaccb5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -4,6 +4,7 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
+#include <asm/pgtable.h>
#define set_memory_rox set_memory_rox
int set_memory_rox(unsigned long addr, int numpages);
@@ -37,6 +38,7 @@ int set_memory_rox(unsigned long addr, int numpages);
* The caller is required to take care of these.
*/
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 6324f4c6c545..692af46603a1 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -68,6 +68,8 @@ extern void x86_ce4100_early_setup(void);
static inline void x86_ce4100_early_setup(void) { }
#endif
+#include <linux/kexec_handover.h>
+
#ifndef _SETUP
#include <asm/espfix.h>
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 9b7fa99ae951..ad954a1a6656 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -116,6 +116,7 @@ enum {
INTERCEPT_INVPCID,
INTERCEPT_MCOMMIT,
INTERCEPT_TLBSYNC,
+ INTERCEPT_BUSLOCK,
INTERCEPT_IDLE_HLT = 166,
};
@@ -159,7 +160,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u64 avic_physical_id; /* Offset 0xf8 */
u8 reserved_7[8];
u64 vmsa_pa; /* Used for an SEV-ES guest */
- u8 reserved_8[720];
+ u8 reserved_8[16];
+ u16 bus_lock_counter; /* Offset 0x120 */
+ u8 reserved_9[22];
+ u64 allowed_sev_features; /* Offset 0x138 */
+ u64 guest_sev_features; /* Offset 0x140 */
+ u8 reserved_10[664];
/*
* Offset 0x3e0, 32 bytes reserved
* for use by hypervisor/software.
@@ -291,6 +297,8 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
#define SVM_SEV_FEAT_ALTERNATE_INJECTION BIT(4)
#define SVM_SEV_FEAT_DEBUG_SWAP BIT(5)
+#define VMCB_ALLOWED_SEV_FEATURES_VALID BIT_ULL(63)
+
struct vmcb_seg {
u16 selector;
u16 attrib;
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7c488ff0c764..c10dbb74cd00 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs)
return regs->orig_ax;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->orig_ax = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -90,6 +97,18 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[5] = regs->bp;
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ regs->bx = args[0];
+ regs->cx = args[1];
+ regs->dx = args[2];
+ regs->si = args[3];
+ regs->di = args[4];
+ regs->bp = args[5];
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
return AUDIT_ARCH_I386;
@@ -121,6 +140,30 @@ static inline void syscall_get_arguments(struct task_struct *task,
}
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+# ifdef CONFIG_IA32_EMULATION
+ if (task->thread_info.status & TS_COMPAT) {
+ regs->bx = *args++;
+ regs->cx = *args++;
+ regs->dx = *args++;
+ regs->si = *args++;
+ regs->di = *args++;
+ regs->bp = *args;
+ } else
+# endif
+ {
+ regs->di = *args++;
+ regs->si = *args++;
+ regs->dx = *args++;
+ regs->r10 = *args++;
+ regs->r8 = *args++;
+ regs->r9 = *args;
+ }
+}
+
static inline int syscall_get_arch(struct task_struct *task)
{
/* x32 tasks should be considered AUDIT_ARCH_X86_64. */
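For comparison, a user-space tracer today needs per-arch register knowledge to do what syscall_set_nr()/syscall_set_arguments() express generically inside the kernel. A hedged x86-64 illustration using the long-standing PTRACE_GETREGS/PTRACE_SETREGS interface (tracee assumed stopped at syscall entry):

#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>

/* Rewrite the tracee's pending syscall: orig_rax mirrors syscall_set_nr(),
 * the argument registers mirror the 64-bit branch of syscall_set_arguments(). */
static void rewrite_syscall(pid_t pid, long nr, const unsigned long args[6])
{
	struct user_regs_struct regs;

	ptrace(PTRACE_GETREGS, pid, 0, &regs);
	regs.orig_rax = nr;
	regs.rdi = args[0];
	regs.rsi = args[1];
	regs.rdx = args[2];
	regs.r10 = args[3];
	regs.r8  = args[4];
	regs.r9  = args[5];
	ptrace(PTRACE_SETREGS, pid, 0, &regs);
}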
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 225a12e0d5d6..6f3499507c5e 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -845,6 +845,7 @@ struct kvm_sev_snp_launch_start {
};
/* Kept in sync with firmware values for simplicity. */
+#define KVM_SEV_PAGE_TYPE_INVALID 0x0
#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1
#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3
#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4
diff --git a/arch/x86/include/uapi/asm/setup_data.h b/arch/x86/include/uapi/asm/setup_data.h
index 50c45ead4e7c..2671c4e1b3a0 100644
--- a/arch/x86/include/uapi/asm/setup_data.h
+++ b/arch/x86/include/uapi/asm/setup_data.h
@@ -13,7 +13,8 @@
#define SETUP_CC_BLOB 7
#define SETUP_IMA 8
#define SETUP_RNG_SEED 9
-#define SETUP_ENUM_MAX SETUP_RNG_SEED
+#define SETUP_KEXEC_KHO 10
+#define SETUP_ENUM_MAX SETUP_KEXEC_KHO
#define SETUP_INDIRECT (1<<31)
#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
@@ -78,6 +79,16 @@ struct ima_setup_data {
__u64 size;
} __attribute__((packed));
+/*
+ * Locations of kexec handover metadata
+ */
+struct kho_data {
+ __u64 fdt_addr;
+ __u64 fdt_size;
+ __u64 scratch_addr;
+ __u64 scratch_size;
+} __attribute__((packed));
+
#endif /* __ASSEMBLER__ */
#endif /* _UAPI_ASM_X86_SETUP_DATA_H */
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index ec1321248dac..9c640a521a67 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -95,6 +95,7 @@
#define SVM_EXIT_CR14_WRITE_TRAP 0x09e
#define SVM_EXIT_CR15_WRITE_TRAP 0x09f
#define SVM_EXIT_INVPCID 0x0a2
+#define SVM_EXIT_BUS_LOCK 0x0a5
#define SVM_EXIT_IDLE_HLT 0x0a6
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
@@ -225,6 +226,7 @@
{ SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \
{ SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \
{ SVM_EXIT_INVPCID, "invpcid" }, \
+ { SVM_EXIT_BUS_LOCK, "buslock" }, \
{ SVM_EXIT_IDLE_HLT, "idle-halt" }, \
{ SVM_EXIT_NPF, "npf" }, \
{ SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
index f36f28405dcc..6d7603511f52 100644
--- a/arch/x86/kernel/acpi/madt_wakeup.c
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -126,7 +126,7 @@ static int __init acpi_mp_setup_reset(u64 reset_vector)
return 0;
}
-static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu)
{
if (!acpi_mp_wake_mailbox_paddr) {
pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b5bb7a2e8340..58abb941c45b 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -27,7 +27,13 @@ static void noop_send_IPI_allbutself(int vector) { }
static void noop_send_IPI_all(int vector) { }
static void noop_send_IPI_self(int vector) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
-static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; }
+
+static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip,
+ unsigned int cpu)
+{
+ return -1;
+}
+
static u64 noop_apic_icr_read(void) { return 0; }
static u32 noop_get_apic_id(u32 apicid) { return 0; }
static void noop_apic_eoi(void) { }
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index e272bc7fdc8e..5c5be2d58242 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -57,7 +57,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val)
numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val);
}
-static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
+static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
numachip_apic_icr_write(phys_apicid, APIC_DM_INIT);
numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP |
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 7fef504ca508..15209f220e1f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -667,7 +667,7 @@ static __init void build_uv_gr_table(void)
}
}
-static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip)
+static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu)
{
unsigned long val;
int pnode;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 0be61c45400c..bcb534688dfe 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -278,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
unsigned long long mend)
{
unsigned long start, end;
+ int ret;
cmem->ranges[0].start = mstart;
cmem->ranges[0].end = mend;
@@ -286,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
/* Exclude elf header region */
start = image->elf_load_addr;
end = start + image->elf_headers_sz - 1;
- return crash_exclude_mem_range(cmem, start, end);
+ ret = crash_exclude_mem_range(cmem, start, end);
+
+ if (ret)
+ return ret;
+
+ /* Exclude dm crypt keys region */
+ if (image->dm_crypt_keys_addr) {
+ start = image->dm_crypt_keys_addr;
+ end = start + image->dm_crypt_keys_sz - 1;
+ return crash_exclude_mem_range(cmem, start, end);
+ }
+
+ return ret;
}
/* Prepare memory map for crash dump kernel */
int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
{
+ unsigned int nr_ranges = 0;
int i, ret = 0;
unsigned long flags;
struct e820_entry ei;
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(struct_size(cmem, ranges, 1));
+ /*
+ * Using random kexec_buf for passing dm crypt keys may cause a range
+ * split. So use two slots here.
+ */
+ nr_ranges = 2;
+ cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
if (!cmem)
return -ENOMEM;
+ cmem->max_nr_ranges = nr_ranges;
+ cmem->nr_ranges = 0;
+
memset(&cmd, 0, sizeof(struct crash_memmap_data));
cmd.params = params;
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9920122018a0..c3acbd26408b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1300,6 +1300,24 @@ void __init e820__memblock_setup(void)
}
/*
+ * At this point memblock is only allowed to allocate from memory
+ * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set
+ * up in init_mem_mapping().
+ *
+ * KHO kernels are special and use only scratch memory for memblock
+ * allocations, but memory below 1M is ignored by the kernel after early
+ * boot and cannot be naturally marked as scratch.
+ *
+ * To allow allocation of the real-mode trampoline and a few (if any)
+ * other very early allocations from below 1M, forcibly mark the memory
+ * below 1M as scratch.
+ *
+ * After real mode trampoline is allocated, we clear that scratch
+ * marking.
+ */
+ memblock_mark_kho_scratch(0, SZ_1M);
+
+ /*
* 32-bit systems are limited to 4GB of memory even with HIGHMEM and
* to even less without it.
* Discard memory after max_pfn - the actual limit detected at runtime.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 81f9b78e0f7b..9ed29ff10e59 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -380,61 +380,18 @@ void intel_posted_msi_init(void)
this_cpu_write(posted_msi_pi_desc.ndst, destination);
}
-/*
- * De-multiplexing posted interrupts is on the performance path, the code
- * below is written to optimize the cache performance based on the following
- * considerations:
- * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
- * accessed by both CPU and IOMMU.
- * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
- * for checking and clearing posted interrupt request (PIR), a 256 bit field
- * within the PID.
- * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
- * line when posting interrupts and setting control bits.
- * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
- * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
- * cache line. The cache line states after each operation are as follows:
- * CPU IOMMU PID Cache line state
- * ---------------------------------------------------------------
- *     read64                              exclusive
- *     lock xchg64                         modified
- *                     post/atomic swap    invalid
- *     -------------------------------------------------------------
- *
- * To reduce L1 data cache miss, it is important to avoid contention with
- * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
- * to dispatch interrupt handlers.
- *
- * In addition, the code is trying to keep the cache line state consistent
- * as much as possible. e.g. when making a copy and clearing the PIR
- * (assuming non-zero PIR bits are present in the entire PIR), it does:
- * read, read, read, read, xchg, xchg, xchg, xchg
- * instead of:
- * read, xchg, read, xchg, read, xchg, read, xchg
- */
-static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
{
- int i, vec = FIRST_EXTERNAL_VECTOR;
- unsigned long pir_copy[4];
- bool handled = false;
+ unsigned long pir_copy[NR_PIR_WORDS];
+ int vec = FIRST_EXTERNAL_VECTOR;
- for (i = 0; i < 4; i++)
- pir_copy[i] = pir[i];
-
- for (i = 0; i < 4; i++) {
- if (!pir_copy[i])
- continue;
+ if (!pi_harvest_pir(pir, pir_copy))
+ return false;
- pir_copy[i] = arch_xchg(&pir[i], 0);
- handled = true;
- }
-
- if (handled) {
- for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
- call_irq_handler(vec, regs);
- }
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
- return handled;
+ return true;
}
/*
@@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
* MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
*/
while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
- if (!handle_pending_pir(pid->pir64, regs))
+ if (!handle_pending_pir(pid->pir, regs))
break;
}
@@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
* process PIR bits one last time such that handling the new interrupts
* are not delayed until the next IRQ.
*/
- handle_pending_pir(pid->pir64, regs);
+ handle_pending_pir(pid->pir, regs);
apic_eoi();
irq_exit();
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 68530fad05f7..24a41f0e0cf1 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -27,6 +27,8 @@
#include <asm/kexec-bzimage64.h>
#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */
+
/*
* Defines lowest physical address for various segments. Not sure where
@@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
if (image->type == KEXEC_TYPE_CRASH) {
len = sprintf(cmdline_ptr,
"elfcorehdr=0x%lx ", image->elf_load_addr);
+
+ if (image->dm_crypt_keys_addr != 0)
+ len += sprintf(cmdline_ptr + len,
+ "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr);
}
memcpy(cmdline_ptr + len, cmdline, cmdline_len);
cmdline_len += len;
@@ -233,6 +239,32 @@ setup_ima_state(const struct kimage *image, struct boot_params *params,
#endif /* CONFIG_IMA_KEXEC */
}
+static void setup_kho(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + setup_data_offset;
+ struct kho_data *kho = (void *)sd + sizeof(*sd);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ return;
+
+ sd->type = SETUP_KEXEC_KHO;
+ sd->len = sizeof(struct kho_data);
+
+ /* Only add if we have all KHO images in place */
+ if (!image->kho.fdt || !image->kho.scratch)
+ return;
+
+ /* Add setup data */
+ kho->fdt_addr = image->kho.fdt;
+ kho->fdt_size = PAGE_SIZE;
+ kho->scratch_addr = image->kho.scratch->mem;
+ kho->scratch_size = image->kho.scratch->bufsz;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = params_load_addr + setup_data_offset;
+}
+
static int
setup_boot_parameters(struct kimage *image, struct boot_params *params,
unsigned long params_load_addr,
@@ -312,6 +344,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
sizeof(struct ima_setup_data);
}
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ /* Setup space to store preservation metadata */
+ setup_kho(image, params, params_load_addr, setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+ }
+
/* Setup RNG seed */
setup_rng_seed(params, params_load_addr, setup_data_offset);
@@ -441,6 +480,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
ret = crash_load_segments(image);
if (ret)
return ERR_PTR(ret);
+ ret = crash_load_dm_crypt_keys(image);
+ if (ret == -ENOENT) {
+ kexec_dprintk("No dm crypt key to load\n");
+ } else if (ret) {
+ pr_err("Failed to load dm crypt keys\n");
+ return ERR_PTR(ret);
+ }
+ if (image->dm_crypt_keys_addr &&
+ cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN >
+ header->cmdline_size) {
+ pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
}
#endif
@@ -468,6 +520,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
efi_map_sz = efi_get_runtime_map_size();
params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
MAX_ELFCOREHDR_STR_LEN;
+ if (image->dm_crypt_keys_addr)
+ params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN;
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
sizeof(struct setup_data) +
@@ -479,6 +533,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz += sizeof(struct setup_data) +
sizeof(struct ima_setup_data);
+ if (IS_ENABLED(CONFIG_KEXEC_HANDOVER))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct kho_data);
+
params = kzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
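With both the elfcorehdr and the dm-crypt key segments loaded, the crash kernel command line assembled by setup_cmdline() carries entries of this shape (addresses are placeholders, not real values):

    elfcorehdr=0x250000000 dmcryptkeys=0x24f000000 <original command line>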
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 949c9e4bfad2..697fb99406e6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -630,13 +630,35 @@ static void kexec_mark_crashkres(bool protect)
kexec_mark_range(control, crashk_res.end, protect);
}
+/* make the memory storing dm crypt keys in/accessible */
+static void kexec_mark_dm_crypt_keys(bool protect)
+{
+ unsigned long start_paddr, end_paddr;
+ unsigned int nr_pages;
+
+ if (kexec_crash_image->dm_crypt_keys_addr) {
+ start_paddr = kexec_crash_image->dm_crypt_keys_addr;
+ end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1;
+ nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE;
+ if (protect)
+ set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages);
+ else
+ __set_memory_prot(
+ (unsigned long)phys_to_virt(start_paddr),
+ nr_pages,
+ __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW));
+ }
+}
+
void arch_kexec_protect_crashkres(void)
{
kexec_mark_crashkres(true);
+ kexec_mark_dm_crypt_keys(true);
}
void arch_kexec_unprotect_crashkres(void)
{
+ kexec_mark_dm_crypt_keys(false);
kexec_mark_crashkres(false);
}
#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 7d9ed79a93c0..fb27be697128 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -282,8 +282,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_reserve(__pa_symbol(_brk_start),
- _brk_end - _brk_start);
+ memblock_reserve_kern(__pa_symbol(_brk_start),
+ _brk_end - _brk_start);
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -356,7 +356,7 @@ static void __init early_reserve_initrd(void)
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
- memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+ memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image);
}
static void __init reserve_initrd(void)
@@ -409,7 +409,7 @@ static void __init add_early_ima_buffer(u64 phys_addr)
}
if (data->size) {
- memblock_reserve(data->addr, data->size);
+ memblock_reserve_kern(data->addr, data->size);
ima_kexec_buffer_phys = data->addr;
ima_kexec_buffer_size = data->size;
}
@@ -447,6 +447,29 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size)
}
#endif
+static void __init add_kho(u64 phys_addr, u32 data_len)
+{
+ struct kho_data *kho;
+ u64 addr = phys_addr + sizeof(struct setup_data);
+ u64 size = data_len - sizeof(struct setup_data);
+
+ if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) {
+ pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n");
+ return;
+ }
+
+ kho = early_memremap(addr, size);
+ if (!kho) {
+ pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n",
+ addr, size);
+ return;
+ }
+
+ kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size);
+
+ early_memunmap(kho, size);
+}
+
static void __init parse_setup_data(void)
{
struct setup_data *data;
@@ -475,6 +498,9 @@ static void __init parse_setup_data(void)
case SETUP_IMA:
add_early_ima_buffer(pa_data);
break;
+ case SETUP_KEXEC_KHO:
+ add_kho(pa_data, data_len);
+ break;
case SETUP_RNG_SEED:
data = early_memremap(pa_data, data_len);
add_bootloader_randomness(data->data, data->len);
@@ -549,7 +575,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
len = sizeof(*data);
pa_next = data->next;
- memblock_reserve(pa_data, sizeof(*data) + data->len);
+ memblock_reserve_kern(pa_data, sizeof(*data) + data->len);
if (data->type == SETUP_INDIRECT) {
len += data->len;
@@ -563,7 +589,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
indirect = (struct setup_indirect *)data->data;
if (indirect->type != SETUP_INDIRECT)
- memblock_reserve(indirect->addr, indirect->len);
+ memblock_reserve_kern(indirect->addr, indirect->len);
}
pa_data = pa_next;
@@ -766,8 +792,8 @@ static void __init early_reserve_memory(void)
* __end_of_kernel_reserve symbol must be explicitly reserved with a
* separate memblock_reserve() or they will be discarded.
*/
- memblock_reserve(__pa_symbol(_text),
- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+ memblock_reserve_kern(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
/*
* The first 4Kb of memory is a BIOS owned area, but generally it is
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 1ba92ac9441d..fc78c2325fd2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -695,7 +695,7 @@ static void send_init_sequence(u32 phys_apicid)
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
-static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip)
+static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu)
{
unsigned long send_status = 0, accept_status = 0;
int num_starts, j, maxlvt;
@@ -842,7 +842,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
* Returns zero if startup was successfully sent, else error code from
* ->wakeup_secondary_cpu.
*/
-static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
+static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle)
{
unsigned long start_ip = real_mode_header->trampoline_start;
int ret;
@@ -896,11 +896,11 @@ static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle)
* - Use an INIT boot APIC message
*/
if (apic->wakeup_secondary_cpu_64)
- ret = apic->wakeup_secondary_cpu_64(apicid, start_ip);
+ ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu);
else if (apic->wakeup_secondary_cpu)
- ret = apic->wakeup_secondary_cpu(apicid, start_ip);
+ ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu);
else
- ret = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu);
/* If the wakeup mechanism failed, cleanup the warm reset vector */
if (ret)
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6569b453546b..b2d006756e02 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -978,6 +978,7 @@ void kvm_set_cpu_caps(void)
F(FZRM),
F(FSRS),
F(FSRC),
+ F(WRMSRNS),
F(AMX_FP16),
F(AVX_IFMA),
F(LAM),
@@ -1093,6 +1094,7 @@ void kvm_set_cpu_caps(void)
F(AMD_SSB_NO),
F(AMD_STIBP),
F(AMD_STIBP_ALWAYS_ON),
+ F(AMD_IBRS_SAME_MODE),
F(AMD_PSFD),
F(AMD_IBPB_RET),
);
@@ -1150,6 +1152,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_init(CPUID_8000_0021_EAX,
F(NO_NESTED_DATA_BP),
+ F(WRMSR_XX_BASE_NS),
/*
* Synthesize "LFENCE is serializing" into the AMD-defined entry
* in KVM's supported CPUID, i.e. if the feature is reported as
@@ -1163,10 +1166,13 @@ void kvm_set_cpu_caps(void)
SYNTHESIZED_F(LFENCE_RDTSC),
/* SmmPgCfgLock */
F(NULL_SEL_CLR_BASE),
+ /* UpperAddressIgnore */
F(AUTOIBRS),
+ F(PREFETCHI),
EMULATED_F(NO_SMM_CTL_MSR),
/* PrefetchCtlMsr */
- F(WRMSR_XX_BASE_NS),
+ /* GpOnUserCpuid */
+ /* EPSF */
SYNTHESIZED_F(SBPB),
SYNTHESIZED_F(IBPB_BRTYPE),
SYNTHESIZED_F(SRSO_NO),
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 995eb5054360..45dae2d5d2f1 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -296,11 +296,8 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
index == RTC_GSI) {
u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
- if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- e->fields.dest_id, dm) ||
- kvm_apic_pending_eoi(vcpu, e->fields.vector))
- __set_bit(e->fields.vector,
- ioapic_handled_vectors);
+ kvm_scan_ioapic_irq(vcpu, e->fields.dest_id, dm,
+ e->fields.vector, ioapic_handled_vectors);
}
}
spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 539333ac4b38..aa8cb4ac0479 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -120,4 +120,6 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors);
#endif
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8136695f7b96..d6d792b5d1bd 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -402,6 +402,33 @@ void kvm_arch_post_irq_routing_update(struct kvm *kvm)
kvm_make_scan_ioapic_request(kvm);
}
+void kvm_scan_ioapic_irq(struct kvm_vcpu *vcpu, u32 dest_id, u16 dest_mode,
+ u8 vector, unsigned long *ioapic_handled_vectors)
+{
+ /*
+ * Intercept EOI if the vCPU is the target of the new IRQ routing, or
+ * the vCPU has a pending IRQ from the old routing, i.e. if the vCPU
+ * may receive a level-triggered IRQ in the future, or has already received
+ * a level-triggered IRQ. The EOI needs to be intercepted and forwarded
+ * to I/O APIC emulation so that the IRQ can be de-asserted.
+ */
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, dest_id, dest_mode)) {
+ __set_bit(vector, ioapic_handled_vectors);
+ } else if (kvm_apic_pending_eoi(vcpu, vector)) {
+ __set_bit(vector, ioapic_handled_vectors);
+
+ /*
+ * Track the highest pending EOI for which the vCPU is NOT the
+ * target in the new routing. Only the EOI for the IRQ that is
+ * in-flight (for the old routing) needs to be intercepted; any
+ * future IRQs that arrive on this vCPU will be coincidental to
+ * the level-triggered routing and don't need to be intercepted.
+ */
+ if ((int)vector > vcpu->arch.highest_stale_pending_ioapic_eoi)
+ vcpu->arch.highest_stale_pending_ioapic_eoi = vector;
+ }
+}
+
void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
ulong *ioapic_handled_vectors)
{
@@ -424,11 +451,11 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
- if (irq.trig_mode &&
- (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
- irq.dest_id, irq.dest_mode) ||
- kvm_apic_pending_eoi(vcpu, irq.vector)))
- __set_bit(irq.vector, ioapic_handled_vectors);
+ if (!irq.trig_mode)
+ continue;
+
+ kvm_scan_ioapic_irq(vcpu, irq.dest_id, irq.dest_mode,
+ irq.vector, ioapic_handled_vectors);
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
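The highest_stale_pending_ioapic_eoi field written here is expected to be reset each time a vCPU rescans its I/O APIC routes, before kvm_scan_ioapic_irq() repopulates it; a sketch of that flow (the actual reset site in x86.c is not part of this hunk):

/* Sketch: forget stale-EOI tracking before a rescan; kvm_scan_ioapic_irq()
 * then records the highest vector that is still pending but no longer routed
 * to this vCPU, so its eventual EOI can trigger one more rescan. */
vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
kvm_scan_ioapic_routes(vcpu, ioapic_handled_vectors);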
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c9de81cc27e1..73418dc0ebb2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap)
return count;
}
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
{
+ unsigned long pir_vals[NR_PIR_WORDS];
+ u32 *__pir = (void *)pir_vals;
u32 i, vec;
- u32 pir_val, irr_val, prev_irr_val;
+ u32 irr_val, prev_irr_val;
int max_updated_irr;
max_updated_irr = -1;
*max_irr = -1;
+ if (!pi_harvest_pir(pir, pir_vals))
+ return false;
+
for (i = vec = 0; i <= 7; i++, vec += 32) {
u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
- irr_val = *p_irr;
- pir_val = READ_ONCE(pir[i]);
-
- if (pir_val) {
- pir_val = xchg(&pir[i], 0);
+ irr_val = READ_ONCE(*p_irr);
+ if (__pir[i]) {
prev_irr_val = irr_val;
do {
- irr_val = prev_irr_val | pir_val;
+ irr_val = prev_irr_val | __pir[i];
} while (prev_irr_val != irr_val &&
!try_cmpxchg(p_irr, &prev_irr_val, irr_val));
@@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
@@ -1459,6 +1461,14 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
if (!kvm_ioapic_handles_vector(apic, vector))
return;
+ /*
+ * If the intercepted EOI is for an IRQ that was pending from previous
+ * routing, then re-scan the I/O APIC routes as EOIs for the IRQ likely
+ * no longer need to be intercepted.
+ */
+ if (apic->vcpu->arch.highest_stale_pending_ioapic_eoi == vector)
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, apic->vcpu);
+
/* Request a KVM exit to inform the userspace IOAPIC. */
if (irqchip_split(apic->vcpu->kvm)) {
apic->vcpu->arch.pending_ioapic_eoi = vector;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e33c969439f7..4ce30db65828 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7b3f1783ab3c..cbc84c6abc2e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3020,7 +3020,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (is_shadow_present_pte(*sptep)) {
- if (prefetch)
+ if (prefetch && is_last_spte(*sptep, level) &&
+ pfn == spte_to_pfn(*sptep))
return RET_PF_SPURIOUS;
/*
@@ -3034,7 +3035,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
child = spte_to_child_sp(pte);
drop_parent_pte(vcpu->kvm, child, sptep);
flush = true;
- } else if (pfn != spte_to_pfn(*sptep)) {
+ } else if (WARN_ON_ONCE(pfn != spte_to_pfn(*sptep))) {
drop_spte(vcpu->kvm, sptep);
flush = true;
} else
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 405874f4d088..7f3d7229b2c1 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -378,7 +378,7 @@ static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
/* Zapping leaf spte is allowed only when write lock is held. */
lockdep_assert_held_write(&kvm->mmu_lock);
/* Because the write lock is held, the operation should succeed. */
- ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
+ ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
KVM_BUG_ON(ret, kvm);
}
@@ -485,8 +485,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
}
if (is_mirror_sp(sp) &&
- WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
- sp->external_spt))) {
+ WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
+ sp->external_spt))) {
/*
* Failed to free page table page in mirror page table and
* there is nothing to do further.
@@ -538,12 +538,12 @@ static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sp
* external page table, or leaf.
*/
if (is_leaf) {
- ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
+ ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
} else {
void *external_spt = get_external_spt(gfn, new_spte, level);
KVM_BUG_ON(!external_spt, kvm);
- ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
+ ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
}
if (ret)
__kvm_tdp_mmu_write_spte(sptep, old_spte);
@@ -1153,13 +1153,12 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
return RET_PF_RETRY;
- if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
- return RET_PF_SPURIOUS;
-
if (is_shadow_present_pte(iter->old_spte) &&
- is_access_allowed(fault, iter->old_spte) &&
- is_last_spte(iter->old_spte, iter->level))
+ (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+ is_last_spte(iter->old_spte, iter->level)) {
+ WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
return RET_PF_SPURIOUS;
+ }
if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 834b67672d50..8427a48b8b7a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -678,6 +678,33 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
+ /*
+ * Stash vmcb02's counter if the guest hasn't moved past the guilty
+ * instruction; otherwise, reset the counter to '0'.
+ *
+ * To detect whether L2 has made forward progress, track the RIP at
+ * which a bus lock occurred on a per-vmcb12 basis. If RIP has changed,
+ * the guest has clearly made forward progress even though
+ * bus_lock_counter remained '1', so reset bus_lock_counter to '0'.
+ * E.g. in the scenario where a bus lock happened in L1 before VMRUN,
+ * the bus lock firmly happened on an instruction in the past. Even if
+ * vmcb01's counter is still '1' (because the guilty instruction got
+ * patched), the vCPU has clearly made forward progress and so KVM
+ * should reset vmcb02's counter to '0'.
+ *
+ * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
+ * to prevent the same guilty instruction from triggering a VM-Exit.
+ * E.g. if userspace rate-limits the vCPU, then it's entirely possible
+ * that L1's tick interrupt is pending by the time userspace re-runs the
+ * vCPU. If KVM unconditionally clears the counter on VMRUN, then when
+ * L1 re-enters L2, the same instruction will trigger a VM-Exit and the
+ * entire cycle starts over.
+ */
+ if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
+ vmcb02->control.bus_lock_counter = 1;
+ else
+ vmcb02->control.bus_lock_counter = 0;
+
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
@@ -1039,8 +1066,17 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
}
+ /*
+ * Invalidate bus_lock_rip unless KVM is still waiting for the guest
+ * to make forward progress before re-enabling bus lock detection.
+ */
+ if (!vmcb02->control.bus_lock_counter)
+ svm->nested.ctl.bus_lock_rip = INVALID_GPA;
+
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
+ kvm_nested_vmexit_handle_ibrs(vcpu);
+
svm_switch_vmcb(svm, &svm->vmcb01);
/*
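A rough timeline of the bus-lock tracking added above (illustrative summary, not part of the patch): (1) L2 hits a bus lock and bus_lock_exit() stashes the linear RIP in svm->nested.ctl.bus_lock_rip before exiting to userspace; (2) userspace re-runs the vCPU without changing RIP, so the counter is primed to '1' and the guest steps past the guilty instruction; (3) on a later nested VMRUN, vmcb02's counter stays '1' only while vmcb02->save.rip still matches the stashed RIP, otherwise it is reset to '0', and bus_lock_rip is invalidated at the next nested VM-Exit once the counter has been cleared.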
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 1aa0f07d3a63..5a69b657dae9 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -561,6 +561,8 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
+ sev->policy = params.policy;
+
memset(&start, 0, sizeof(start));
dh_blob = NULL;
@@ -1593,11 +1595,11 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* allocate memory for header and transport buffer */
ret = -ENOMEM;
- hdr = kzalloc(params.hdr_len, GFP_KERNEL_ACCOUNT);
+ hdr = kzalloc(params.hdr_len, GFP_KERNEL);
if (!hdr)
goto e_unpin;
- trans_data = kzalloc(params.trans_len, GFP_KERNEL_ACCOUNT);
+ trans_data = kzalloc(params.trans_len, GFP_KERNEL);
if (!trans_data)
goto e_free_hdr;
@@ -1883,70 +1885,6 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
atomic_set_release(&src_sev->migration_in_progress, 0);
}
-/* vCPU mutex subclasses. */
-enum sev_migration_role {
- SEV_MIGRATION_SOURCE = 0,
- SEV_MIGRATION_TARGET,
- SEV_NR_MIGRATION_ROLES,
-};
-
-static int sev_lock_vcpus_for_migration(struct kvm *kvm,
- enum sev_migration_role role)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i, j;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (mutex_lock_killable_nested(&vcpu->mutex, role))
- goto out_unlock;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (!i)
- /*
- * Reset the role to one that avoids colliding with
- * the role used for the first vcpu mutex.
- */
- role = SEV_NR_MIGRATION_ROLES;
- else
- mutex_release(&vcpu->mutex.dep_map, _THIS_IP_);
-#endif
- }
-
- return 0;
-
-out_unlock:
-
- kvm_for_each_vcpu(j, vcpu, kvm) {
- if (i == j)
- break;
-
-#ifdef CONFIG_PROVE_LOCKING
- if (j)
- mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_);
-#endif
-
- mutex_unlock(&vcpu->mutex);
- }
- return -EINTR;
-}
-
-static void sev_unlock_vcpus_for_migration(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- unsigned long i;
- bool first = true;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (first)
- first = false;
- else
- mutex_acquire(&vcpu->mutex.dep_map,
- SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_);
-
- mutex_unlock(&vcpu->mutex);
- }
-}
-
static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *dst = to_kvm_sev_info(dst_kvm);
@@ -2084,10 +2022,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
charged = true;
}
- ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE);
+ ret = kvm_lock_all_vcpus(kvm);
if (ret)
goto out_dst_cgroup;
- ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET);
+ ret = kvm_lock_all_vcpus(source_kvm);
if (ret)
goto out_dst_vcpu;
@@ -2101,9 +2039,9 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
ret = 0;
out_source_vcpu:
- sev_unlock_vcpus_for_migration(source_kvm);
+ kvm_unlock_all_vcpus(source_kvm);
out_dst_vcpu:
- sev_unlock_vcpus_for_migration(kvm);
+ kvm_unlock_all_vcpus(kvm);
out_dst_cgroup:
/* Operates on the source on success, on the destination on failure. */
if (charged)
@@ -2200,6 +2138,8 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
return -EINVAL;
+ sev->policy = params.policy;
+
sev->snp_context = snp_context_create(kvm, argp);
if (!sev->snp_context)
return -ENOTTY;
@@ -4007,10 +3947,8 @@ static int sev_snp_ap_creation(struct vcpu_svm *svm)
* Unless Creation is deferred until INIT, signal the vCPU to update
* its state.
*/
- if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT) {
- kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
- kvm_vcpu_kick(target_vcpu);
- }
+ if (request != SVM_VMGEXIT_AP_CREATE_ON_INIT)
+ kvm_make_request_and_kick(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);
return 0;
}
@@ -4468,6 +4406,7 @@ void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm)
static void sev_es_init_vmcb(struct vcpu_svm *svm)
{
+ struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm);
struct vmcb *vmcb = svm->vmcb01.ptr;
struct kvm_vcpu *vcpu = &svm->vcpu;
@@ -4483,6 +4422,10 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
if (svm->sev_es.vmsa && !svm->sev_es.snp_has_guest_vmsa)
svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa);
+ if (cpu_feature_enabled(X86_FEATURE_ALLOWED_SEV_FEATURES))
+ svm->vmcb->control.allowed_sev_features = sev->vmsa_features |
+ VMCB_ALLOWED_SEV_FEATURES_VALID;
+
/* Can't intercept CR register access, HV can't modify CR registers */
svm_clr_intercept(svm, INTERCEPT_CR0_READ);
svm_clr_intercept(svm, INTERCEPT_CR4_READ);
@@ -4943,3 +4886,97 @@ int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return level;
}
+
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_save_area *vmsa;
+ struct kvm_sev_info *sev;
+ int error = 0;
+ int ret;
+
+ if (!sev_es_guest(vcpu->kvm))
+ return NULL;
+
+ /*
+ * If the VMSA has not yet been encrypted, return a pointer to the
+ * current un-encrypted VMSA.
+ */
+ if (!vcpu->arch.guest_state_protected)
+ return (struct vmcb_save_area *)svm->sev_es.vmsa;
+
+ sev = to_kvm_sev_info(vcpu->kvm);
+
+ /* Check if the SEV policy allows debugging */
+ if (sev_snp_guest(vcpu->kvm)) {
+ if (!(sev->policy & SNP_POLICY_DEBUG))
+ return NULL;
+ } else {
+ if (sev->policy & SEV_POLICY_NODBG)
+ return NULL;
+ }
+
+ if (sev_snp_guest(vcpu->kvm)) {
+ struct sev_data_snp_dbg dbg = {0};
+
+ vmsa = snp_alloc_firmware_page(__GFP_ZERO);
+ if (!vmsa)
+ return NULL;
+
+ dbg.gctx_paddr = __psp_pa(sev->snp_context);
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+
+ ret = sev_do_cmd(SEV_CMD_SNP_DBG_DECRYPT, &dbg, &error);
+
+ /*
+ * Transition the target page back to a hypervisor page no matter
+ * what. If this fails, the page can't be used or safely freed, so
+ * leak it.
+ */
+ if (snp_page_reclaim(vcpu->kvm, PHYS_PFN(__pa(vmsa))))
+ return NULL;
+
+ if (ret) {
+ pr_err("SEV: SNP_DBG_DECRYPT failed ret=%d, fw_error=%d (%#x)\n",
+ ret, error, error);
+ free_page((unsigned long)vmsa);
+
+ return NULL;
+ }
+ } else {
+ struct sev_data_dbg dbg = {0};
+ struct page *vmsa_page;
+
+ vmsa_page = alloc_page(GFP_KERNEL);
+ if (!vmsa_page)
+ return NULL;
+
+ vmsa = page_address(vmsa_page);
+
+ dbg.handle = sev->handle;
+ dbg.src_addr = svm->vmcb->control.vmsa_pa;
+ dbg.dst_addr = __psp_pa(vmsa);
+ dbg.len = PAGE_SIZE;
+
+ ret = sev_do_cmd(SEV_CMD_DBG_DECRYPT, &dbg, &error);
+ if (ret) {
+ pr_err("SEV: SEV_CMD_DBG_DECRYPT failed ret=%d, fw_error=%d (0x%x)\n",
+ ret, error, error);
+ __free_page(vmsa_page);
+
+ return NULL;
+ }
+ }
+
+ return vmsa;
+}
+
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa)
+{
+ /* If the VMSA has not yet been encrypted, nothing was allocated */
+ if (!vcpu->arch.guest_state_protected || !vmsa)
+ return;
+
+ free_page((unsigned long)vmsa);
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ffb34dadff1c..ab9b947dbf4f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -29,6 +29,7 @@
#include <linux/cc_platform.h>
#include <linux/smp.h>
#include <linux/string_choices.h>
+#include <linux/mutex.h>
#include <asm/apic.h>
#include <asm/msr.h>
@@ -232,6 +233,8 @@ module_param(tsc_scaling, int, 0444);
static bool avic;
module_param(avic, bool, 0444);
+module_param(enable_device_posted_irqs, bool, 0444);
+
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -250,6 +253,8 @@ static unsigned long iopm_base;
DEFINE_PER_CPU(struct svm_cpu_data, svm_data);
+static DEFINE_MUTEX(vmcb_dump_mutex);
+
/*
* Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
* the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
@@ -1369,6 +1374,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
}
+ if (vcpu->kvm->arch.bus_lock_detection_enabled)
+ svm_set_intercept(svm, INTERCEPT_BUSLOCK);
+
if (sev_guest(vcpu->kvm))
sev_init_vmcb(svm);
@@ -1478,25 +1486,10 @@ out:
return err;
}
-static void svm_clear_current_vmcb(struct vmcb *vmcb)
-{
- int i;
-
- for_each_online_cpu(i)
- cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
-}
-
static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * The vmcb page can be recycled, causing a false negative in
- * svm_vcpu_load(). So, ensure that no logical CPU has this
- * vmcb page recorded as its current vmcb.
- */
- svm_clear_current_vmcb(svm->vmcb);
-
svm_leave_nested(vcpu);
svm_free_nested(svm);
@@ -1610,19 +1603,9 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
-
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- if (sd->current_vmcb != svm->vmcb) {
- sd->current_vmcb = svm->vmcb;
-
- if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT) &&
- static_branch_likely(&switch_vcpu_ibpb))
- indirect_branch_prediction_barrier();
- }
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
}
@@ -3221,17 +3204,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
}
/*
- * AMD changed the architectural behavior of bits 5:2. On CPUs
- * without BusLockTrap, bits 5:2 control "external pins", but
- * on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
- * and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
- * the guest to set bits 5:2 despite not actually virtualizing
- * Performance-Monitoring/Breakpoint external pins. Drop bits
- * 5:2 for backwards compatibility.
- */
- data &= ~GENMASK(5, 2);
-
- /*
* Suppress BTF as KVM doesn't virtualize BTF, but there's no
* way to communicate lack of support to the guest.
*/
@@ -3361,6 +3333,37 @@ static int invpcid_interception(struct kvm_vcpu *vcpu)
return kvm_handle_invpcid(vcpu, type, gva);
}
+static inline int complete_userspace_buslock(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * If userspace has NOT changed RIP, then KVM's ABI is to let the guest
+ * execute the bus-locking instruction. Set the bus lock counter to '1'
+ * to effectively step past the bus lock.
+ */
+ if (kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))
+ svm->vmcb->control.bus_lock_counter = 1;
+
+ return 1;
+}
+
+static int bus_lock_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
+ vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
+
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.complete_userspace_io = complete_userspace_buslock;
+
+ if (is_guest_mode(vcpu))
+ svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip;
+
+ return 0;
+}
+
static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_READ_CR0] = cr_interception,
[SVM_EXIT_READ_CR3] = cr_interception,
@@ -3430,6 +3433,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_INVPCID] = invpcid_interception,
[SVM_EXIT_IDLE_HLT] = kvm_emulate_halt,
[SVM_EXIT_NPF] = npf_interception,
+ [SVM_EXIT_BUS_LOCK] = bus_lock_exit,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -3444,14 +3448,21 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save;
+ char *vm_type;
if (!dump_invalid_vmcb) {
pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
return;
}
- pr_err("VMCB %p, last attempted VMRUN on CPU %d\n",
- svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
+ guard(mutex)(&vmcb_dump_mutex);
+
+ vm_type = sev_snp_guest(vcpu->kvm) ? "SEV-SNP" :
+ sev_es_guest(vcpu->kvm) ? "SEV-ES" :
+ sev_guest(vcpu->kvm) ? "SEV" : "SVM";
+
+ pr_err("%s vCPU%u VMCB %p, last attempted VMRUN on CPU %d\n",
+ vm_type, vcpu->vcpu_id, svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu);
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16);
@@ -3489,6 +3500,17 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa);
+ pr_err("%-20s%016llx\n", "allowed_sev_features:", control->allowed_sev_features);
+ pr_err("%-20s%016llx\n", "guest_sev_features:", control->guest_sev_features);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ save = sev_decrypt_vmsa(vcpu);
+ if (!save)
+ goto no_vmsa;
+
+ save01 = save;
+ }
+
pr_err("VMCB State Save Area:\n");
pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
"es:",
@@ -3559,6 +3581,63 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
pr_err("%-15s %016llx %-13s %016llx\n",
"excp_from:", save->last_excp_from,
"excp_to:", save->last_excp_to);
+
+ if (sev_es_guest(vcpu->kvm)) {
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)save;
+
+ pr_err("%-15s %016llx\n",
+ "sev_features", vmsa->sev_features);
+
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rax:", vmsa->rax, "rbx:", vmsa->rbx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rcx:", vmsa->rcx, "rdx:", vmsa->rdx);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rsi:", vmsa->rsi, "rdi:", vmsa->rdi);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "rbp:", vmsa->rbp, "rsp:", vmsa->rsp);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r8:", vmsa->r8, "r9:", vmsa->r9);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r10:", vmsa->r10, "r11:", vmsa->r11);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r12:", vmsa->r12, "r13:", vmsa->r13);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "r14:", vmsa->r14, "r15:", vmsa->r15);
+ pr_err("%-15s %016llx %-13s %016llx\n",
+ "xcr0:", vmsa->xcr0, "xss:", vmsa->xss);
+ } else {
+ pr_err("%-15s %016llx %-13s %016lx\n",
+ "rax:", save->rax, "rbx:",
+ vcpu->arch.regs[VCPU_REGS_RBX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rcx:", vcpu->arch.regs[VCPU_REGS_RCX],
+ "rdx:", vcpu->arch.regs[VCPU_REGS_RDX]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "rsi:", vcpu->arch.regs[VCPU_REGS_RSI],
+ "rdi:", vcpu->arch.regs[VCPU_REGS_RDI]);
+ pr_err("%-15s %016lx %-13s %016llx\n",
+ "rbp:", vcpu->arch.regs[VCPU_REGS_RBP],
+ "rsp:", save->rsp);
+#ifdef CONFIG_X86_64
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r8:", vcpu->arch.regs[VCPU_REGS_R8],
+ "r9:", vcpu->arch.regs[VCPU_REGS_R9]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r10:", vcpu->arch.regs[VCPU_REGS_R10],
+ "r11:", vcpu->arch.regs[VCPU_REGS_R11]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r12:", vcpu->arch.regs[VCPU_REGS_R12],
+ "r13:", vcpu->arch.regs[VCPU_REGS_R13]);
+ pr_err("%-15s %016lx %-13s %016lx\n",
+ "r14:", vcpu->arch.regs[VCPU_REGS_R14],
+ "r15:", vcpu->arch.regs[VCPU_REGS_R15]);
+#endif
+ }
+
+no_vmsa:
+ if (sev_es_guest(vcpu->kvm))
+ sev_free_decrypted_vmsa(vcpu, save);
}
static bool svm_check_exit_valid(u64 exit_code)
@@ -3595,6 +3674,10 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return kvm_emulate_halt(vcpu);
else if (exit_code == SVM_EXIT_NPF)
return npf_interception(vcpu);
+#ifdef CONFIG_KVM_AMD_SEV
+ else if (exit_code == SVM_EXIT_VMGEXIT)
+ return sev_handle_vmgexit(vcpu);
+#endif
#endif
return svm_exit_handlers[exit_code](vcpu);
}
@@ -5356,6 +5439,9 @@ static __init void svm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
+ if (cpu_feature_enabled(X86_FEATURE_BUS_LOCK_THRESHOLD))
+ kvm_caps.has_bus_lock_exit = true;
+
/* CPUID 0x80000008 */
if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
boot_cpu_has(X86_FEATURE_AMD_SSBD))
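For completeness, a sketch of how a VMM might consume the new exit. KVM_EXIT_X86_BUS_LOCK and KVM_RUN_X86_BUS_LOCK are existing UAPI; the throttling policy and the bus_lock_throttle_us variable below are purely hypothetical.

/* Hypothetical userspace handling of a bus-lock exit (illustrative only). */
static void handle_bus_lock(struct kvm_run *run)
{
	if (run->exit_reason == KVM_EXIT_X86_BUS_LOCK ||
	    (run->flags & KVM_RUN_X86_BUS_LOCK)) {
		/* Policy decision: e.g. sleep to rate-limit the offending vCPU. */
		usleep(bus_lock_throttle_us);
	}

	/*
	 * Re-entering KVM_RUN without modifying RIP lets KVM step past the
	 * guilty instruction: complete_userspace_buslock() primes
	 * bus_lock_counter to '1' so the same instruction doesn't re-exit.
	 */
}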
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f16b068c4228..e6f3c6a153a0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -98,6 +98,7 @@ struct kvm_sev_info {
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
+ unsigned long policy;
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
@@ -114,6 +115,9 @@ struct kvm_sev_info {
struct mutex guest_req_mutex; /* Must acquire before using bounce buffers */
};
+#define SEV_POLICY_NODBG BIT_ULL(0)
+#define SNP_POLICY_DEBUG BIT_ULL(19)
+
struct kvm_svm {
struct kvm kvm;
@@ -169,6 +173,7 @@ struct vmcb_ctrl_area_cached {
u64 nested_cr3;
u64 virt_ext;
u32 clean;
+ u64 bus_lock_rip;
union {
#if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV)
struct hv_vmcb_enlightenments hv_enlightenments;
@@ -340,8 +345,6 @@ struct svm_cpu_data {
struct vmcb *save_area;
unsigned long save_area_pa;
- struct vmcb *current_vmcb;
-
/* index = sev_asid, value = vmcb pointer */
struct vmcb **sev_vmcbs;
};
@@ -785,6 +788,8 @@ void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
int sev_gmem_prepare(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
void sev_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu);
+void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa);
#else
static inline struct page *snp_safe_alloc_page_node(int node, gfp_t gfp)
{
@@ -816,6 +821,11 @@ static inline int sev_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return 0;
}
+static inline struct vmcb_save_area *sev_decrypt_vmsa(struct kvm_vcpu *vcpu)
+{
+ return NULL;
+}
+static inline void sev_free_decrypted_vmsa(struct kvm_vcpu *vcpu, struct vmcb_save_area *vmsa) {}
#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 8f46a06e2c44..a0c5e8781c33 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -71,8 +71,8 @@ static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
#else
-static inline bool is_td(struct kvm *kvm) { return false; }
-static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
+static __always_inline bool is_td(struct kvm *kvm) { return false; }
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
#endif
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 94d5d907d37b..d1e02e567b57 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -12,7 +12,6 @@
#ifdef CONFIG_KVM_INTEL_TDX
static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt));
-#endif
static void vt_disable_virtualization_cpu(void)
{
@@ -240,7 +239,7 @@ static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
if (is_td_vcpu(vcpu))
return tdx_complete_emulated_msr(vcpu, err);
- return kvm_complete_insn_gp(vcpu, err);
+ return vmx_complete_emulated_msr(vcpu, err);
}
#ifdef CONFIG_KVM_SMM
@@ -315,14 +314,6 @@ static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
return vmx_set_virtual_apic_mode(vcpu);
}
-static void vt_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
-
- pi_clear_on(pi);
- memset(pi->pir, 0, sizeof(pi->pir));
-}
-
static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
if (is_td_vcpu(vcpu))
@@ -888,6 +879,13 @@ static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
return 0;
}
+#define vt_op(name) vt_##name
+#define vt_op_tdx_only(name) vt_##name
+#else /* CONFIG_KVM_INTEL_TDX */
+#define vt_op(name) vmx_##name
+#define vt_op_tdx_only(name) NULL
+#endif /* CONFIG_KVM_INTEL_TDX */
+
#define VMX_REQUIRED_APICV_INHIBITS \
(BIT(APICV_INHIBIT_REASON_DISABLED) | \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
@@ -905,113 +903,113 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.hardware_unsetup = vmx_hardware_unsetup,
.enable_virtualization_cpu = vmx_enable_virtualization_cpu,
- .disable_virtualization_cpu = vt_disable_virtualization_cpu,
+ .disable_virtualization_cpu = vt_op(disable_virtualization_cpu),
.emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
- .has_emulated_msr = vt_has_emulated_msr,
+ .has_emulated_msr = vt_op(has_emulated_msr),
.vm_size = sizeof(struct kvm_vmx),
- .vm_init = vt_vm_init,
- .vm_pre_destroy = vt_vm_pre_destroy,
- .vm_destroy = vt_vm_destroy,
+ .vm_init = vt_op(vm_init),
+ .vm_destroy = vt_op(vm_destroy),
+ .vm_pre_destroy = vt_op_tdx_only(vm_pre_destroy),
- .vcpu_precreate = vt_vcpu_precreate,
- .vcpu_create = vt_vcpu_create,
- .vcpu_free = vt_vcpu_free,
- .vcpu_reset = vt_vcpu_reset,
+ .vcpu_precreate = vt_op(vcpu_precreate),
+ .vcpu_create = vt_op(vcpu_create),
+ .vcpu_free = vt_op(vcpu_free),
+ .vcpu_reset = vt_op(vcpu_reset),
- .prepare_switch_to_guest = vt_prepare_switch_to_guest,
- .vcpu_load = vt_vcpu_load,
- .vcpu_put = vt_vcpu_put,
+ .prepare_switch_to_guest = vt_op(prepare_switch_to_guest),
+ .vcpu_load = vt_op(vcpu_load),
+ .vcpu_put = vt_op(vcpu_put),
- .update_exception_bitmap = vt_update_exception_bitmap,
+ .update_exception_bitmap = vt_op(update_exception_bitmap),
.get_feature_msr = vmx_get_feature_msr,
- .get_msr = vt_get_msr,
- .set_msr = vt_set_msr,
-
- .get_segment_base = vt_get_segment_base,
- .get_segment = vt_get_segment,
- .set_segment = vt_set_segment,
- .get_cpl = vt_get_cpl,
- .get_cpl_no_cache = vt_get_cpl_no_cache,
- .get_cs_db_l_bits = vt_get_cs_db_l_bits,
- .is_valid_cr0 = vt_is_valid_cr0,
- .set_cr0 = vt_set_cr0,
- .is_valid_cr4 = vt_is_valid_cr4,
- .set_cr4 = vt_set_cr4,
- .set_efer = vt_set_efer,
- .get_idt = vt_get_idt,
- .set_idt = vt_set_idt,
- .get_gdt = vt_get_gdt,
- .set_gdt = vt_set_gdt,
- .set_dr6 = vt_set_dr6,
- .set_dr7 = vt_set_dr7,
- .sync_dirty_debug_regs = vt_sync_dirty_debug_regs,
- .cache_reg = vt_cache_reg,
- .get_rflags = vt_get_rflags,
- .set_rflags = vt_set_rflags,
- .get_if_flag = vt_get_if_flag,
-
- .flush_tlb_all = vt_flush_tlb_all,
- .flush_tlb_current = vt_flush_tlb_current,
- .flush_tlb_gva = vt_flush_tlb_gva,
- .flush_tlb_guest = vt_flush_tlb_guest,
-
- .vcpu_pre_run = vt_vcpu_pre_run,
- .vcpu_run = vt_vcpu_run,
- .handle_exit = vt_handle_exit,
+ .get_msr = vt_op(get_msr),
+ .set_msr = vt_op(set_msr),
+
+ .get_segment_base = vt_op(get_segment_base),
+ .get_segment = vt_op(get_segment),
+ .set_segment = vt_op(set_segment),
+ .get_cpl = vt_op(get_cpl),
+ .get_cpl_no_cache = vt_op(get_cpl_no_cache),
+ .get_cs_db_l_bits = vt_op(get_cs_db_l_bits),
+ .is_valid_cr0 = vt_op(is_valid_cr0),
+ .set_cr0 = vt_op(set_cr0),
+ .is_valid_cr4 = vt_op(is_valid_cr4),
+ .set_cr4 = vt_op(set_cr4),
+ .set_efer = vt_op(set_efer),
+ .get_idt = vt_op(get_idt),
+ .set_idt = vt_op(set_idt),
+ .get_gdt = vt_op(get_gdt),
+ .set_gdt = vt_op(set_gdt),
+ .set_dr6 = vt_op(set_dr6),
+ .set_dr7 = vt_op(set_dr7),
+ .sync_dirty_debug_regs = vt_op(sync_dirty_debug_regs),
+ .cache_reg = vt_op(cache_reg),
+ .get_rflags = vt_op(get_rflags),
+ .set_rflags = vt_op(set_rflags),
+ .get_if_flag = vt_op(get_if_flag),
+
+ .flush_tlb_all = vt_op(flush_tlb_all),
+ .flush_tlb_current = vt_op(flush_tlb_current),
+ .flush_tlb_gva = vt_op(flush_tlb_gva),
+ .flush_tlb_guest = vt_op(flush_tlb_guest),
+
+ .vcpu_pre_run = vt_op(vcpu_pre_run),
+ .vcpu_run = vt_op(vcpu_run),
+ .handle_exit = vt_op(handle_exit),
.skip_emulated_instruction = vmx_skip_emulated_instruction,
.update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vt_set_interrupt_shadow,
- .get_interrupt_shadow = vt_get_interrupt_shadow,
- .patch_hypercall = vt_patch_hypercall,
- .inject_irq = vt_inject_irq,
- .inject_nmi = vt_inject_nmi,
- .inject_exception = vt_inject_exception,
- .cancel_injection = vt_cancel_injection,
- .interrupt_allowed = vt_interrupt_allowed,
- .nmi_allowed = vt_nmi_allowed,
- .get_nmi_mask = vt_get_nmi_mask,
- .set_nmi_mask = vt_set_nmi_mask,
- .enable_nmi_window = vt_enable_nmi_window,
- .enable_irq_window = vt_enable_irq_window,
- .update_cr8_intercept = vt_update_cr8_intercept,
+ .set_interrupt_shadow = vt_op(set_interrupt_shadow),
+ .get_interrupt_shadow = vt_op(get_interrupt_shadow),
+ .patch_hypercall = vt_op(patch_hypercall),
+ .inject_irq = vt_op(inject_irq),
+ .inject_nmi = vt_op(inject_nmi),
+ .inject_exception = vt_op(inject_exception),
+ .cancel_injection = vt_op(cancel_injection),
+ .interrupt_allowed = vt_op(interrupt_allowed),
+ .nmi_allowed = vt_op(nmi_allowed),
+ .get_nmi_mask = vt_op(get_nmi_mask),
+ .set_nmi_mask = vt_op(set_nmi_mask),
+ .enable_nmi_window = vt_op(enable_nmi_window),
+ .enable_irq_window = vt_op(enable_irq_window),
+ .update_cr8_intercept = vt_op(update_cr8_intercept),
.x2apic_icr_is_split = false,
- .set_virtual_apic_mode = vt_set_virtual_apic_mode,
- .set_apic_access_page_addr = vt_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vt_load_eoi_exitmap,
- .apicv_pre_state_restore = vt_apicv_pre_state_restore,
+ .set_virtual_apic_mode = vt_op(set_virtual_apic_mode),
+ .set_apic_access_page_addr = vt_op(set_apic_access_page_addr),
+ .refresh_apicv_exec_ctrl = vt_op(refresh_apicv_exec_ctrl),
+ .load_eoi_exitmap = vt_op(load_eoi_exitmap),
+ .apicv_pre_state_restore = pi_apicv_pre_state_restore,
.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_isr_update = vt_hwapic_isr_update,
- .sync_pir_to_irr = vt_sync_pir_to_irr,
- .deliver_interrupt = vt_deliver_interrupt,
+ .hwapic_isr_update = vt_op(hwapic_isr_update),
+ .sync_pir_to_irr = vt_op(sync_pir_to_irr),
+ .deliver_interrupt = vt_op(deliver_interrupt),
.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
- .set_tss_addr = vt_set_tss_addr,
- .set_identity_map_addr = vt_set_identity_map_addr,
+ .set_tss_addr = vt_op(set_tss_addr),
+ .set_identity_map_addr = vt_op(set_identity_map_addr),
.get_mt_mask = vmx_get_mt_mask,
- .get_exit_info = vt_get_exit_info,
- .get_entry_info = vt_get_entry_info,
+ .get_exit_info = vt_op(get_exit_info),
+ .get_entry_info = vt_op(get_entry_info),
- .vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid,
+ .vcpu_after_set_cpuid = vt_op(vcpu_after_set_cpuid),
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .get_l2_tsc_offset = vt_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier,
- .write_tsc_offset = vt_write_tsc_offset,
- .write_tsc_multiplier = vt_write_tsc_multiplier,
+ .get_l2_tsc_offset = vt_op(get_l2_tsc_offset),
+ .get_l2_tsc_multiplier = vt_op(get_l2_tsc_multiplier),
+ .write_tsc_offset = vt_op(write_tsc_offset),
+ .write_tsc_multiplier = vt_op(write_tsc_multiplier),
- .load_mmu_pgd = vt_load_mmu_pgd,
+ .load_mmu_pgd = vt_op(load_mmu_pgd),
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .update_cpu_dirty_logging = vt_update_cpu_dirty_logging,
+ .update_cpu_dirty_logging = vt_op(update_cpu_dirty_logging),
.nested_ops = &vmx_nested_ops,
@@ -1019,38 +1017,38 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.pi_start_assignment = vmx_pi_start_assignment,
#ifdef CONFIG_X86_64
- .set_hv_timer = vt_set_hv_timer,
- .cancel_hv_timer = vt_cancel_hv_timer,
+ .set_hv_timer = vt_op(set_hv_timer),
+ .cancel_hv_timer = vt_op(cancel_hv_timer),
#endif
- .setup_mce = vt_setup_mce,
+ .setup_mce = vt_op(setup_mce),
#ifdef CONFIG_KVM_SMM
- .smi_allowed = vt_smi_allowed,
- .enter_smm = vt_enter_smm,
- .leave_smm = vt_leave_smm,
- .enable_smi_window = vt_enable_smi_window,
+ .smi_allowed = vt_op(smi_allowed),
+ .enter_smm = vt_op(enter_smm),
+ .leave_smm = vt_op(leave_smm),
+ .enable_smi_window = vt_op(enable_smi_window),
#endif
- .check_emulate_instruction = vt_check_emulate_instruction,
- .apic_init_signal_blocked = vt_apic_init_signal_blocked,
+ .check_emulate_instruction = vt_op(check_emulate_instruction),
+ .apic_init_signal_blocked = vt_op(apic_init_signal_blocked),
.migrate_timers = vmx_migrate_timers,
- .msr_filter_changed = vt_msr_filter_changed,
- .complete_emulated_msr = vt_complete_emulated_msr,
+ .msr_filter_changed = vt_op(msr_filter_changed),
+ .complete_emulated_msr = vt_op(complete_emulated_msr),
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
.get_untagged_addr = vmx_get_untagged_addr,
- .mem_enc_ioctl = vt_mem_enc_ioctl,
- .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
+ .mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl),
+ .vcpu_mem_enc_ioctl = vt_op_tdx_only(vcpu_mem_enc_ioctl),
- .private_max_mapping_level = vt_gmem_private_max_mapping_level
+ .private_max_mapping_level = vt_op_tdx_only(gmem_private_max_mapping_level)
};
struct kvm_x86_init_ops vt_init_ops __initdata = {
- .hardware_setup = vt_hardware_setup,
+ .hardware_setup = vt_op(hardware_setup),
.handle_intel_pt_intr = NULL,
.runtime_ops = &vt_x86_ops,
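The vt_op()/vt_op_tdx_only() helpers above are plain token-pasting macros; their effect on the ops table is easiest to see by expansion (illustrative):

/* CONFIG_KVM_INTEL_TDX=y */
.vcpu_load     = vt_op(vcpu_load),              /* -> vt_vcpu_load (TDX-aware wrapper) */
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), /* -> vt_mem_enc_ioctl */

/* CONFIG_KVM_INTEL_TDX=n */
.vcpu_load     = vt_op(vcpu_load),              /* -> vmx_vcpu_load (plain VMX) */
.mem_enc_ioctl = vt_op_tdx_only(mem_enc_ioctl), /* -> NULL; the ioctl is rejected with -ENOTTY in x86.c */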
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 71701e2414a4..7211c71d4241 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -302,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
cpu = get_cpu();
prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu, prev);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
@@ -825,12 +825,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
return 0;
}
+static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
+ vmx->nested.msrs.misc_high);
+
+ return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
+}
+
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
u32 count, u64 addr)
{
if (count == 0)
return 0;
+ /*
+ * Exceeding the limit results in architecturally _undefined_ behavior,
+ * i.e. KVM is allowed to do literally anything in response to a bad
+ * limit. Immediately generate a consistency check so that code that
+ * consumes the count doesn't need to worry about extreme edge cases.
+ */
+ if (count > nested_vmx_max_atomic_switch_msrs(vcpu))
+ return -EINVAL;
+
if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
!kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
return -EINVAL;
@@ -941,15 +959,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
return 0;
}
-static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
- vmx->nested.msrs.misc_high);
-
- return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
-}
-
/*
* Load guest's/host's msr at nested entry/exit.
* return 0 for success, entry index for failure.
@@ -966,7 +975,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- if (unlikely(i >= max_msr_list_size))
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
goto fail;
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
@@ -1054,7 +1063,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
for (i = 0; i < count; i++) {
- if (unlikely(i >= max_msr_list_size))
+ if (WARN_ON_ONCE(i >= max_msr_list_size))
return -EINVAL;
if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
@@ -4521,12 +4530,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
cpu = get_cpu();
vmx->loaded_vmcs = &vmx->nested.vmcs02;
- vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
put_cpu();
}
@@ -5021,16 +5030,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- /*
- * If IBRS is advertised to the vCPU, KVM must flush the indirect
- * branch predictors when transitioning from L2 to L1, as L1 expects
- * hardware (KVM in this case) to provide separate predictor modes.
- * Bare metal isolates VMX root (host) from VMX non-root (guest), but
- * doesn't isolate different VMCSs, i.e. in this case, doesn't provide
- * separate modes for L2 vs L1.
- */
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL))
- indirect_branch_prediction_barrier();
+ kvm_nested_vmexit_handle_ibrs(vcpu);
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
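For reference, the limit enforced by the new consistency check: IA32_VMX_MISC bits 27:25 advertise the recommended maximum MSR-list size as 512 * (N + 1), and VMX_MISC_MSR_LIST_MULTIPLIER is 512, so with the common N = 0 nested_vmx_max_atomic_switch_msrs() evaluates to (0 + 1) * 512 = 512 entries. A vmcs12 count above that now fails nested_vmx_check_msr_switch() up front, which is why the in-loop checks can be demoted to WARN_ON_ONCE().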
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 99d1d599ff8c..5c615e5845bf 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -34,7 +34,7 @@ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
-struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
return &(to_vt(vcpu)->pi_desc);
}
@@ -148,9 +148,8 @@ after_clear_sn:
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
- return irqchip_in_kernel(kvm) && enable_apicv &&
- kvm_arch_has_assigned_device(kvm) &&
- irq_remapping_cap(IRQ_POSTING_CAP);
+ return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() &&
+ kvm_arch_has_assigned_device(kvm);
}
/*
@@ -264,6 +263,14 @@ void __init pi_init_cpu(int cpu)
raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
+
+ pi_clear_on(pi);
+ memset(pi->pir, 0, sizeof(pi->pir));
+}
+
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -281,7 +288,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
*/
void vmx_pi_start_assignment(struct kvm *kvm)
{
- if (!irq_remapping_cap(IRQ_POSTING_CAP))
+ if (!kvm_arch_has_irq_bypass())
return;
kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK);
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 68605ca7ef68..80499ea0e674 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -5,12 +5,11 @@
#include <linux/bitmap.h>
#include <asm/posted_intr.h>
-struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu);
-
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
+void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
@@ -20,7 +19,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
int vec;
- vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ vec = find_last_bit(pi_desc->pir, 256);
return vec < 256 ? vec : -1;
}
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index f6986dee6f8c..0a6cf5bff2aa 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -59,8 +59,7 @@
* without the explicit restore, thinks the stack is getting walloped.
* Using an unwind hint is problematic due to x86-64's dynamic alignment.
*/
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
+ leave
RET
.endm
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b12414108cbf..4953846cb30d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -117,6 +117,8 @@ module_param(enable_apicv, bool, 0444);
bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);
+module_param(enable_device_posted_irqs, bool, 0444);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -772,8 +774,11 @@ void vmx_emergency_disable_virtualization_cpu(void)
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
+ loaded_vmcss_on_cpu_link) {
vmcs_clear(v->vmcs);
+ if (v->shadow_vmcs)
+ vmcs_clear(v->shadow_vmcs);
+ }
kvm_cpu_vmxoff();
}
@@ -1445,8 +1450,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
@@ -1473,17 +1477,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
-
- /*
- * No indirect branch prediction barrier needed when switching
- * the active VMCS within a vCPU, unless IBRS is advertised to
- * the vCPU. To minimize the number of IBPBs executed, KVM
- * performs IBPB on nested VM-Exit (a single nested transition
- * may switch the active VMCS multiple times).
- */
- if (static_branch_likely(&switch_vcpu_ibpb) &&
- (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)))
- indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -1522,7 +1515,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
+ vmx_vcpu_load_vmcs(vcpu, cpu);
vmx_vcpu_pi_load(vcpu, cpu);
}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 6d1e40ecc024..b5758c33c60f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -354,8 +354,7 @@ static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
return vt->exit_intr_info;
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
- struct loaded_vmcs *buddy);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 6bf8be570b2e..b4596f651232 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -57,6 +57,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
int vmx_get_feature_msr(u32 msr, u64 *data);
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#define vmx_complete_emulated_msr kvm_complete_insn_gp
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -163,71 +164,6 @@ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
-#else
-static inline void tdx_disable_virtualization_cpu(void) {}
-static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
-static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
-static inline void tdx_vm_destroy(struct kvm *kvm) {}
-static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
-
-static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
-static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
-static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
-static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
-static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
-static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
-{
- return EXIT_FASTPATH_NONE;
-}
-static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
-static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
-static inline bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) { return false; }
-static inline int tdx_handle_exit(struct kvm_vcpu *vcpu,
- enum exit_fastpath_completion fastpath) { return 0; }
-
-static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector) {}
-static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {}
-static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1,
- u64 *info2, u32 *intr_info, u32 *error_code) {}
-static inline bool tdx_has_emulated_msr(u32 index) { return false; }
-static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
-static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
-
-static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
-
-static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- void *private_spt)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- void *private_spt)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- kvm_pfn_t pfn)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
- enum pg_level level,
- kvm_pfn_t pfn)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {}
-static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {}
-static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
-static inline int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; }
#endif
#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 570e7f8cbf64..b58a74c1722d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -226,6 +226,9 @@ EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);
+bool __read_mostly enable_device_posted_irqs = true;
+EXPORT_SYMBOL_GPL(enable_device_posted_irqs);
+
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
@@ -4990,6 +4993,8 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -5012,6 +5017,19 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_call(vcpu_load)(vcpu, cpu);
+ if (vcpu != per_cpu(last_vcpu, cpu)) {
+ /*
+ * Flush the branch predictor when switching vCPUs on the same
+ * physical CPU, as each vCPU needs its own branch prediction
+ * domain. No IBPB is needed when switching between L1 and L2
+ * on the same vCPU unless IBRS is advertised to the vCPU; that
+ * is handled on the nested VM-Exit path.
+ */
+ if (static_branch_likely(&switch_vcpu_ibpb))
+ indirect_branch_prediction_barrier();
+ per_cpu(last_vcpu, cpu) = vcpu;
+ }
+
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -7326,10 +7344,13 @@ set_pit2_out:
r = READ_ONCE(kvm->arch.default_tsc_khz);
goto out;
}
- case KVM_MEMORY_ENCRYPT_OP: {
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.mem_enc_ioctl)
+ goto out;
+
r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_REG_REGION: {
struct kvm_enc_region region;
@@ -8023,7 +8044,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
return rc;
if (!vcpu->mmio_nr_fragments)
- return rc;
+ return X86EMUL_CONTINUE;
gpa = vcpu->mmio_fragments[0].gpa;
@@ -9361,7 +9382,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
return 1;
return kvm_skip_emulated_instruction(vcpu);
@@ -9386,7 +9407,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
@@ -9399,7 +9420,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
vcpu->arch.pio.count = 0;
return 1;
}
@@ -9428,7 +9449,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -9811,6 +9832,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r != 0)
goto out_mmu_exit;
+ enable_device_posted_irqs &= enable_apicv &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+
kvm_ops_update(ops);
for_each_online_cpu(cpu) {
@@ -10694,6 +10718,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
return;
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
kvm_x86_call(sync_pir_to_irr)(vcpu);
@@ -12419,13 +12444,16 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- int idx;
+ int idx, cpu;
kvm_clear_async_pf_completion_queue(vcpu);
kvm_mmu_unload(vcpu);
kvmclock_reset(vcpu);
+ for_each_possible_cpu(cpu)
+ cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL);
+
kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 88a9475899c8..832f0faf4779 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -121,6 +121,24 @@ static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->leave_nested(vcpu);
}
+/*
+ * If IBRS is advertised to the vCPU, KVM must flush the indirect branch
+ * predictors when transitioning from L2 to L1, as L1 expects hardware (KVM in
+ * this case) to provide separate predictor modes. Bare metal isolates the host
+ * from the guest, but doesn't isolate different guests from one another (in
+ * this case L1 and L2). The exception is if bare metal supports same mode IBRS,
+ * which offers protection within the same mode, and hence protects L1 from L2.
+ */
+static inline void kvm_nested_vmexit_handle_ibrs(struct kvm_vcpu *vcpu)
+{
+ if (cpu_feature_enabled(X86_FEATURE_AMD_IBRS_SAME_MODE))
+ return;
+
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBRS))
+ indirect_branch_prediction_barrier();
+}
+
static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
{
return vcpu->arch.last_vmentry_cpu != -1;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 89079ea73e65..a4700ef6eb64 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -266,6 +266,32 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
st->prot_levels[level] = effective;
}
+static void effective_prot_pte(struct ptdump_state *st, pte_t pte)
+{
+ effective_prot(st, 4, pte_val(pte));
+}
+
+static void effective_prot_pmd(struct ptdump_state *st, pmd_t pmd)
+{
+ effective_prot(st, 3, pmd_val(pmd));
+}
+
+static void effective_prot_pud(struct ptdump_state *st, pud_t pud)
+{
+ effective_prot(st, 2, pud_val(pud));
+}
+
+static void effective_prot_p4d(struct ptdump_state *st, p4d_t p4d)
+{
+ effective_prot(st, 1, p4d_val(p4d));
+}
+
+static void effective_prot_pgd(struct ptdump_state *st, pgd_t pgd)
+{
+ effective_prot(st, 0, pgd_val(pgd));
+}
+
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
@@ -362,6 +388,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
bool ptdump_walk_pgd_level_core(struct seq_file *m,
struct mm_struct *mm, pgd_t *pgd,
bool checkwx, bool dmesg)
@@ -378,8 +436,17 @@ bool ptdump_walk_pgd_level_core(struct seq_file *m,
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
- .effective_prot = effective_prot,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
+ .effective_prot_pte = effective_prot_pte,
+ .effective_prot_pmd = effective_prot_pmd,
+ .effective_prot_pud = effective_prot_pud,
+ .effective_prot_p4d = effective_prot_p4d,
+ .effective_prot_pgd = effective_prot_pgd,
.range = ptdump_ranges
},
.level = -1,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 66330fe4e18c..ee66fae9ebcc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1467,16 +1467,21 @@ static unsigned long probe_memory_block_size(void)
}
/*
- * Use max block size to minimize overhead on bare metal, where
- * alignment for memory hotplug isn't a concern.
+ * When hotplug alignment is not a concern, maximize the block size
+ * to minimize overhead. Otherwise, align to the lesser of the advised
+ * block size and the end-of-memory alignment.
*/
- if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
+ bz = memory_block_advised_max_size();
+ if (!bz) {
bz = MAX_BLOCK_SIZE;
- goto done;
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ goto done;
+ } else {
+ bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE);
}
/* Find the largest allowed block size that aligns to memory end */
- for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
+ for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
if (IS_ALIGNED(boot_mem_end, bz))
break;
}
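A quick worked example of the new selection logic (numbers are illustrative): if memory_block_advised_max_size() reports 1G and boot_mem_end is 5.5G, bz starts at 1G after being clamped between MIN_MEMORY_BLOCK_SIZE and MAX_BLOCK_SIZE; 5.5G is not 1G-aligned, so the loop halves bz to 512M, which is aligned, and the memory block size becomes 512M. With no advice on bare metal the old behavior is preserved: bz = MAX_BLOCK_SIZE and the alignment loop is skipped entirely.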
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 331e101bf801..12c8180ca1ba 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -71,7 +71,7 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
static unsigned int __ioremap_check_ram(struct resource *res)
{
unsigned long start_pfn, stop_pfn;
- unsigned long i;
+ unsigned long pfn;
if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
return 0;
@@ -79,9 +79,8 @@ static unsigned int __ioremap_check_ram(struct resource *res)
start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
stop_pfn = (res->end + 1) >> PAGE_SHIFT;
if (stop_pfn > start_pfn) {
- for (i = 0; i < (stop_pfn - start_pfn); ++i)
- if (pfn_valid(start_pfn + i) &&
- !PageReserved(pfn_to_page(start_pfn + i)))
+ for_each_valid_pfn(pfn, start_pfn, stop_pfn)
+ if (!PageReserved(pfn_to_page(pfn)))
return IORES_MAP_SYSTEM_RAM;
}
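
The __ioremap_check_ram() hunk swaps the open-coded pfn_valid() loop for the for_each_valid_pfn() iterator, which only visits pfns that actually have a memmap. Below is a toy model of the same shape; the validity predicate and the macro name are invented for the example and say nothing about how the real iterator is implemented.

#include <stdbool.h>
#include <stdio.h>

static bool toy_pfn_valid(unsigned long pfn)
{
	return pfn >= 0x100 && pfn < 0x200;	/* pretend only this range has a memmap */
}

/* Iterate [start, end) but hand the body only pfns the predicate accepts. */
#define for_each_toy_valid_pfn(pfn, start, end)			\
	for ((pfn) = (start); (pfn) < (end); (pfn)++)		\
		if (toy_pfn_valid(pfn))

int main(void)
{
	unsigned long pfn, count = 0;

	for_each_toy_valid_pfn(pfn, 0x0f0, 0x110)
		count++;
	printf("valid pfns: %lu\n", count);	/* 0x100..0x10f => 16 */
	return 0;
}
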
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index c97b527c66fe..2e7923844afe 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -775,6 +775,12 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
return vma_prot;
}
+static inline void pgprot_set_cachemode(pgprot_t *prot, enum page_cache_mode pcm)
+{
+ *prot = __pgprot((pgprot_val(*prot) & ~_PAGE_CACHE_MASK) |
+ cachemode2protval(pcm));
+}
+
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
{
@@ -789,8 +795,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
if (file->f_flags & O_DSYNC)
pcm = _PAGE_CACHE_MODE_UC_MINUS;
- *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
return 1;
}
@@ -831,8 +836,7 @@ int memtype_kernel_map_sync(u64 base, unsigned long size,
* Reserved non RAM regions only and after successful memtype_reserve,
* this func also keeps identity mapping (if any) in sync with this new prot.
*/
-static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
- int strict_prot)
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot)
{
int is_ram = 0;
int ret;
@@ -858,9 +862,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
cattr_name(pcm));
- *vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
}
return 0;
}
@@ -870,8 +872,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
return ret;
if (pcm != want_pcm) {
- if (strict_prot ||
- !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+ if (!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
memtype_free(paddr, paddr + size);
pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
@@ -881,13 +882,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
cattr_name(pcm));
return -EINVAL;
}
- /*
- * We allow returning different type than the one requested in
- * non strict case.
- */
- *vma_prot = __pgprot((pgprot_val(*vma_prot) &
- (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ pgprot_set_cachemode(vma_prot, pcm);
}
if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
@@ -910,124 +905,14 @@ static void free_pfn_range(u64 paddr, unsigned long size)
memtype_free(paddr, paddr + size);
}
-static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
- resource_size_t *phys)
-{
- struct follow_pfnmap_args args = { .vma = vma, .address = vma->vm_start };
-
- if (follow_pfnmap_start(&args))
- return -EINVAL;
-
- /* Never return PFNs of anon folios in COW mappings. */
- if (!args.special) {
- follow_pfnmap_end(&args);
- return -EINVAL;
- }
-
- *prot = pgprot_val(args.pgprot);
- *phys = (resource_size_t)args.pfn << PAGE_SHIFT;
- follow_pfnmap_end(&args);
- return 0;
-}
-
-static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
- pgprot_t *pgprot)
-{
- unsigned long prot;
-
- VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));
-
- /*
- * We need the starting PFN and cachemode used for track_pfn_remap()
- * that covered the whole VMA. For most mappings, we can obtain that
- * information from the page tables. For COW mappings, we might now
- * suddenly have anon folios mapped and follow_phys() will fail.
- *
- * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
- * detect the PFN. If we need the cachemode as well, we're out of luck
- * for now and have to fail fork().
- */
- if (!follow_phys(vma, &prot, paddr)) {
- if (pgprot)
- *pgprot = __pgprot(prot);
- return 0;
- }
- if (is_cow_mapping(vma->vm_flags)) {
- if (pgprot)
- return -EINVAL;
- *paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
- return 0;
- }
- WARN_ON_ONCE(1);
- return -EINVAL;
-}
-
-int track_pfn_copy(struct vm_area_struct *dst_vma,
- struct vm_area_struct *src_vma, unsigned long *pfn)
-{
- const unsigned long vma_size = src_vma->vm_end - src_vma->vm_start;
- resource_size_t paddr;
- pgprot_t pgprot;
- int rc;
-
- if (!(src_vma->vm_flags & VM_PAT))
- return 0;
-
- /*
- * Duplicate the PAT information for the dst VMA based on the src
- * VMA.
- */
- if (get_pat_info(src_vma, &paddr, &pgprot))
- return -EINVAL;
- rc = reserve_pfn_range(paddr, vma_size, &pgprot, 1);
- if (rc)
- return rc;
-
- /* Reservation for the destination VMA succeeded. */
- vm_flags_set(dst_vma, VM_PAT);
- *pfn = PHYS_PFN(paddr);
- return 0;
-}
-
-void untrack_pfn_copy(struct vm_area_struct *dst_vma, unsigned long pfn)
-{
- untrack_pfn(dst_vma, pfn, dst_vma->vm_end - dst_vma->vm_start, true);
- /*
- * Reservation was freed, any copied page tables will get cleaned
- * up later, but without getting PAT involved again.
- */
-}
-
-/*
- * prot is passed in as a parameter for the new mapping. If the vma has
- * a linear pfn mapping for the entire range, or no vma is provided,
- * reserve the entire pfn + size range with single reserve_pfn_range
- * call.
- */
-int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn, unsigned long addr, unsigned long size)
+int pfnmap_setup_cachemode(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
enum page_cache_mode pcm;
- /* reserve the whole chunk starting from paddr */
- if (!vma || (addr == vma->vm_start
- && size == (vma->vm_end - vma->vm_start))) {
- int ret;
-
- ret = reserve_pfn_range(paddr, size, prot, 0);
- if (ret == 0 && vma)
- vm_flags_set(vma, VM_PAT);
- return ret;
- }
-
if (!pat_enabled())
return 0;
- /*
- * For anything smaller than the vma size we set prot based on the
- * lookup.
- */
pcm = lookup_memtype(paddr);
/* Check memtype for the remaining pages */
@@ -1038,70 +923,35 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
return -EINVAL;
}
- *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
-
+ pgprot_set_cachemode(prot, pcm);
return 0;
}
-void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
+int pfnmap_track(unsigned long pfn, unsigned long size, pgprot_t *prot)
{
- enum page_cache_mode pcm;
+ const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
- if (!pat_enabled())
- return;
-
- /* Set prot based on lookup */
- pcm = lookup_memtype(pfn_t_to_phys(pfn));
- *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
- cachemode2protval(pcm));
+ return reserve_pfn_range(paddr, size, prot);
}
-/*
- * untrack_pfn is called while unmapping a pfnmap for a region.
- * untrack can be called for a specific region indicated by pfn and size or
- * can be for the entire vma (in which case pfn, size are zero).
- */
-void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
- unsigned long size, bool mm_wr_locked)
+void pfnmap_untrack(unsigned long pfn, unsigned long size)
{
- resource_size_t paddr;
-
- if (vma && !(vma->vm_flags & VM_PAT))
- return;
+ const resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
- /* free the chunk starting from pfn or the whole chunk */
- paddr = (resource_size_t)pfn << PAGE_SHIFT;
- if (!paddr && !size) {
- if (get_pat_info(vma, &paddr, NULL))
- return;
- size = vma->vm_end - vma->vm_start;
- }
free_pfn_range(paddr, size);
- if (vma) {
- if (mm_wr_locked)
- vm_flags_clear(vma, VM_PAT);
- else
- __vm_flags_mod(vma, 0, VM_PAT);
- }
-}
-
-void untrack_pfn_clear(struct vm_area_struct *vma)
-{
- vm_flags_clear(vma, VM_PAT);
}
pgprot_t pgprot_writecombine(pgprot_t prot)
{
- return __pgprot(pgprot_val(prot) |
- cachemode2protval(_PAGE_CACHE_MODE_WC));
+ pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WC);
+ return prot;
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
pgprot_t pgprot_writethrough(pgprot_t prot)
{
- return __pgprot(pgprot_val(prot) |
- cachemode2protval(_PAGE_CACHE_MODE_WT));
+ pgprot_set_cachemode(&prot, _PAGE_CACHE_MODE_WT);
+ return prot;
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);
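
Two things happen in the memtype.c hunks: the repeated open-coded cache-bit splice becomes pgprot_set_cachemode(), and the VMA-aware track_pfn_*()/untrack_pfn() entry points are replaced by the narrower pfnmap_setup_cachemode(), pfnmap_track() and pfnmap_untrack(), which operate on pfn/size/prot only. Note also that pgprot_writecombine()/pgprot_writethrough() now clear the existing cache-mode field before setting WC/WT instead of OR-ing on top of it. A toy model of the helper's masking follows; the mask and mode values are made up for illustration and stand in for _PAGE_CACHE_MASK and cachemode2protval().

#include <stdio.h>

#define CACHE_MASK	0x18UL		/* invented cache-mode field */
#define MODE_WC		0x08UL
#define MODE_WT		0x10UL

/* Replace only the cache-mode bits of a protection value, leaving every
 * other bit untouched. */
static void set_cachemode(unsigned long *prot, unsigned long mode)
{
	*prot = (*prot & ~CACHE_MASK) | mode;
}

int main(void)
{
	unsigned long prot = 0x67UL;	/* some pre-existing protection bits */

	set_cachemode(&prot, MODE_WC);
	printf("%#lx\n", prot);		/* other bits preserved, cache field = WC */

	set_cachemode(&prot, MODE_WT);
	printf("%#lx\n", prot);		/* WC cleanly replaced by WT, not OR-ed */
	return 0;
}
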
diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
index 645613d59942..e5844ed1311e 100644
--- a/arch/x86/mm/pat/memtype_interval.c
+++ b/arch/x86/mm/pat/memtype_interval.c
@@ -49,32 +49,6 @@ INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
-enum {
- MEMTYPE_EXACT_MATCH = 0,
- MEMTYPE_END_MATCH = 1
-};
-
-static struct memtype *memtype_match(u64 start, u64 end, int match_type)
-{
- struct memtype *entry_match;
-
- entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
-
- while (entry_match != NULL && entry_match->start < end) {
- if ((match_type == MEMTYPE_EXACT_MATCH) &&
- (entry_match->start == start) && (entry_match->end == end))
- return entry_match;
-
- if ((match_type == MEMTYPE_END_MATCH) &&
- (entry_match->start < start) && (entry_match->end == end))
- return entry_match;
-
- entry_match = interval_iter_next(entry_match, start, end-1);
- }
-
- return NULL; /* Returns NULL if there is no match */
-}
-
static int memtype_check_conflict(u64 start, u64 end,
enum page_cache_mode reqtype,
enum page_cache_mode *newtype)
@@ -130,35 +104,16 @@ int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_ty
struct memtype *memtype_erase(u64 start, u64 end)
{
- struct memtype *entry_old;
-
- /*
- * Since the memtype_rbroot tree allows overlapping ranges,
- * memtype_erase() checks with EXACT_MATCH first, i.e. free
- * a whole node for the munmap case. If no such entry is found,
- * it then checks with END_MATCH, i.e. shrink the size of a node
- * from the end for the mremap case.
- */
- entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
- if (!entry_old) {
- entry_old = memtype_match(start, end, MEMTYPE_END_MATCH);
- if (!entry_old)
- return ERR_PTR(-EINVAL);
+ struct memtype *entry = interval_iter_first(&memtype_rbroot, start, end - 1);
+
+ while (entry && entry->start < end) {
+ if (entry->start == start && entry->end == end) {
+ interval_remove(entry, &memtype_rbroot);
+ return entry;
+ }
+ entry = interval_iter_next(entry, start, end - 1);
}
-
- if (entry_old->start == start) {
- /* munmap: erase this node */
- interval_remove(entry_old, &memtype_rbroot);
- } else {
- /* mremap: update the end value of this node */
- interval_remove(entry_old, &memtype_rbroot);
- entry_old->end = start;
- interval_insert(entry_old, &memtype_rbroot);
-
- return NULL;
- }
-
- return entry_old;
+ return ERR_PTR(-EINVAL);
}
struct memtype *memtype_lookup(u64 addr)
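
memtype_erase() above drops the separate MEMTYPE_EXACT_MATCH/MEMTYPE_END_MATCH lookup and now removes an entry only when its range matches the request exactly; the old mremap-style "shrink the node from the end" case is gone. A toy, array-backed version of that exact-match walk is sketched below (the real code iterates an interval tree, and the names here are invented).

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; int live; };

static struct range *erase_exact(struct range *tbl, int n, uint64_t start, uint64_t end)
{
	for (int i = 0; i < n; i++) {
		if (!tbl[i].live || tbl[i].start >= end || tbl[i].end <= start)
			continue;		/* dead entry or no overlap */
		if (tbl[i].start == start && tbl[i].end == end) {
			tbl[i].live = 0;	/* stand-in for interval_remove() */
			return &tbl[i];
		}
	}
	return NULL;				/* -EINVAL in the real code */
}

int main(void)
{
	struct range tbl[] = { { 0x1000, 0x3000, 1 }, { 0x3000, 0x5000, 1 } };

	printf("%p\n", (void *)erase_exact(tbl, 2, 0x1000, 0x3000));	/* found */
	printf("%p\n", (void *)erase_exact(tbl, 2, 0x3000, 0x4000));	/* NULL: partial match */
	return 0;
}
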
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 30ab4aced761..46edc11726b7 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2148,6 +2148,19 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages,
CPA_PAGES_ARRAY, pages);
}
+/*
+ * __set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers, where a reservation has already been taken.
+ * If you want to set the pgprot to a specific page protection, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+ return change_page_attr_set_clr(&addr, numpages, prot,
+ __pgprot(~pgprot_val(prot)), 0, 0,
+ NULL);
+}
+
int _set_memory_uc(unsigned long addr, int numpages)
{
/*
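
The new __set_memory_prot() applies a complete protection value by passing prot as the set mask and ~prot as the clear mask to change_page_attr_set_clr(), so every bit of the result lands exactly as requested regardless of what was there before. A toy model of that set/clear arithmetic:

#include <stdio.h>

/* new = (old & ~clr) | set; with set = prot and clr = ~prot the result is
 * always exactly prot. */
static unsigned long apply_prot(unsigned long old, unsigned long set, unsigned long clr)
{
	return (old & ~clr) | set;
}

int main(void)
{
	unsigned long prot = 0x8000000000000063UL;	/* arbitrary example value */
	unsigned long old  = 0x0000000000000163UL;

	printf("%#lx\n", apply_prot(old, prot, ~prot));	/* prints prot */
	return 0;
}
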
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 62777ba4de1a..ddf248c3ee7d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -189,7 +189,7 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
if (!ptdesc)
failed = true;
- if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
+ if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
ptdesc = NULL;
failed = true;
@@ -751,14 +751,13 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
for (i = 0; i < PTRS_PER_PMD; i++) {
if (!pmd_none(pmd_sv[i])) {
pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
- free_page((unsigned long)pte);
+ pte_free_kernel(&init_mm, pte);
}
}
free_page((unsigned long)pmd_sv);
- pagetable_dtor(virt_to_ptdesc(pmd));
- free_page((unsigned long)pmd);
+ pmd_free(&init_mm, pmd);
return 1;
}
@@ -781,7 +780,7 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
/* INVLPG to clear all paging-structure caches */
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
- free_page((unsigned long)pte);
+ pte_free_kernel(&init_mm, pte);
return 1;
}
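
In pgtable.c the raw free_page() calls on page-table pages become pte_free_kernel()/pmd_free(), matching the constructor side (pagetable_pmd_ctor() now also takes the mm), so the page-table destructor and its accounting always run. A toy illustration of why freeing through the paired helper matters; the counter and helpers are invented stand-ins for the real ctor/dtor bookkeeping.

#include <stdio.h>
#include <stdlib.h>

static long nr_page_tables;

static void *table_alloc(void)
{
	void *p = calloc(1, 4096);

	if (p)
		nr_page_tables++;	/* stand-in for pagetable_*_ctor() accounting */
	return p;
}

static void table_free(void *p)
{
	nr_page_tables--;		/* stand-in for pagetable_dtor() */
	free(p);
}

int main(void)
{
	void *pt = table_alloc();

	table_free(pt);			/* a bare free(pt) would leak the accounting */
	printf("outstanding tables: %ld\n", nr_page_tables);
	return 0;
}
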
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index ed5c63c0b4e5..88be32026768 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -66,6 +66,8 @@ void __init reserve_real_mode(void)
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
+
+ memblock_clear_kho_scratch(0, SZ_1M);
}
static void __init sme_sev_setup_real_mode(struct trampoline_header *th)
diff --git a/arch/xtensa/Kbuild b/arch/xtensa/Kbuild
index fd12f61745ba..015baeb765b9 100644
--- a/arch/xtensa/Kbuild
+++ b/arch/xtensa/Kbuild
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += kernel/ mm/ platforms/ boot/dts/
+obj-y += kernel/ mm/ platforms/
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index d3db28f2f811..f2f9cd9cde50 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -20,6 +20,7 @@ config XTENSA
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_IPC_PARSE_VERSION
select BUILDTIME_TABLE_SORT
+ select GENERIC_BUILTIN_DTB
select CLONE_BACKWARDS
select COMMON_CLK
select DMA_NONCOHERENT_MMAP if MMU
@@ -462,7 +463,7 @@ config USE_OF
help
Include support for flattened device tree machine descriptions.
-config BUILTIN_DTB_SOURCE
+config BUILTIN_DTB_NAME
string "DTB to build into the kernel image"
depends on OF
diff --git a/arch/xtensa/boot/dts/Makefile b/arch/xtensa/boot/dts/Makefile
index d6408c16d74e..7271294ce523 100644
--- a/arch/xtensa/boot/dts/Makefile
+++ b/arch/xtensa/boot/dts/Makefile
@@ -7,7 +7,7 @@
#
#
-obj-$(CONFIG_OF) += $(addsuffix .dtb.o, $(CONFIG_BUILTIN_DTB_SOURCE))
+dtb-$(CONFIG_OF) += $(addsuffix .dtb, $(CONFIG_BUILTIN_DTB_NAME))
# for CONFIG_OF_ALL_DTBS test
dtb- := $(patsubst $(src)/%.dts,%.dtb, $(wildcard $(src)/*.dts))
diff --git a/arch/xtensa/configs/audio_kc705_defconfig b/arch/xtensa/configs/audio_kc705_defconfig
index 436b7cac9694..f2af1a32c9c7 100644
--- a/arch/xtensa/configs/audio_kc705_defconfig
+++ b/arch/xtensa/configs/audio_kc705_defconfig
@@ -30,7 +30,7 @@ CONFIG_XTENSA_PLATFORM_XTFPGA=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=0x38000000@0"
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="kc705"
+CONFIG_BUILTIN_DTB_NAME="kc705"
# CONFIG_COMPACTION is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_PM=y
diff --git a/arch/xtensa/configs/cadence_csp_defconfig b/arch/xtensa/configs/cadence_csp_defconfig
index 49f50d1bd724..88ed5284e21c 100644
--- a/arch/xtensa/configs/cadence_csp_defconfig
+++ b/arch/xtensa/configs/cadence_csp_defconfig
@@ -34,7 +34,7 @@ CONFIG_HIGHMEM=y
# CONFIG_PCI is not set
CONFIG_XTENSA_PLATFORM_XTFPGA=y
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="csp"
+CONFIG_BUILTIN_DTB_NAME="csp"
# CONFIG_COMPACTION is not set
CONFIG_XTFPGA_LCD=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
diff --git a/arch/xtensa/configs/common_defconfig b/arch/xtensa/configs/common_defconfig
index fa9389869154..09e4a1d9d1f3 100644
--- a/arch/xtensa/configs/common_defconfig
+++ b/arch/xtensa/configs/common_defconfig
@@ -32,7 +32,6 @@ CONFIG_NET_SCH_TEQL=m
CONFIG_NET_SCH_TBF=m
CONFIG_NET_SCH_GRED=m
CONFIG_NET_SCH_DSMARK=m
-CONFIG_NET_CLS_TCINDEX=m
CONFIG_NET_CLS_ROUTE4=m
CONFIG_NET_CLS_FW=m
CONFIG_NET_CLS_U32=m
diff --git a/arch/xtensa/configs/generic_kc705_defconfig b/arch/xtensa/configs/generic_kc705_defconfig
index e376238bc5ca..4427907becca 100644
--- a/arch/xtensa/configs/generic_kc705_defconfig
+++ b/arch/xtensa/configs/generic_kc705_defconfig
@@ -29,7 +29,7 @@ CONFIG_XTENSA_PLATFORM_XTFPGA=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=0x38000000@0"
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="kc705"
+CONFIG_BUILTIN_DTB_NAME="kc705"
# CONFIG_COMPACTION is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_NET=y
diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig
index c2ab4306ee20..5828228522ba 100644
--- a/arch/xtensa/configs/nommu_kc705_defconfig
+++ b/arch/xtensa/configs/nommu_kc705_defconfig
@@ -36,7 +36,7 @@ CONFIG_XTENSA_PLATFORM_XTFPGA=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0x9d050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=256M@0x60000000"
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="kc705_nommu"
+CONFIG_BUILTIN_DTB_NAME="kc705_nommu"
CONFIG_BINFMT_FLAT=y
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/xtensa/configs/smp_lx200_defconfig b/arch/xtensa/configs/smp_lx200_defconfig
index 63b56ce79f83..326966ca7831 100644
--- a/arch/xtensa/configs/smp_lx200_defconfig
+++ b/arch/xtensa/configs/smp_lx200_defconfig
@@ -33,7 +33,7 @@ CONFIG_XTENSA_PLATFORM_XTFPGA=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=96M@0"
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="lx200mx"
+CONFIG_BUILTIN_DTB_NAME="lx200mx"
# CONFIG_COMPACTION is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_NET=y
diff --git a/arch/xtensa/configs/virt_defconfig b/arch/xtensa/configs/virt_defconfig
index 98acb7191cb7..e37048985b47 100644
--- a/arch/xtensa/configs/virt_defconfig
+++ b/arch/xtensa/configs/virt_defconfig
@@ -24,7 +24,7 @@ CONFIG_HIGHMEM=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=0x80000000@0"
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="virt"
+CONFIG_BUILTIN_DTB_NAME="virt"
# CONFIG_PARSE_BOOTPARAM is not set
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig
index 165652c45b85..ee47438f9b51 100644
--- a/arch/xtensa/configs/xip_kc705_defconfig
+++ b/arch/xtensa/configs/xip_kc705_defconfig
@@ -29,7 +29,7 @@ CONFIG_XTENSA_PLATFORM_XTFPGA=y
CONFIG_CMDLINE_BOOL=y
CONFIG_CMDLINE="earlycon=uart8250,mmio32native,0xfd050020,115200n8 console=ttyS0,115200n8 ip=dhcp root=/dev/nfs rw debug memmap=0x38000000@0"
CONFIG_USE_OF=y
-CONFIG_BUILTIN_DTB_SOURCE="kc705"
+CONFIG_BUILTIN_DTB_NAME="kc705"
# CONFIG_PARSE_BOOTPARAM is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_COMPACTION is not set
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index 1647a7cc3fbf..cb1725c40e36 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -269,17 +269,11 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
((__pgprot((pgprot_val(prot) & ~_PAGE_CA_MASK) | \
_PAGE_CA_BYPASS)))
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-
#define PFN_PTE_SHIFT PAGE_SHIFT
#define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT)
#define pte_same(a,b) (pte_val(a) == pte_val(b))
#define pte_page(x) pfn_to_page(pte_pfn(x))
#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot)
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h
index 86c70117371b..4871e5a4d6fb 100644
--- a/arch/xtensa/include/asm/ptrace.h
+++ b/arch/xtensa/include/asm/ptrace.h
@@ -72,13 +72,10 @@ struct pt_regs {
/* Additional configurable registers that are used by the compiler. */
xtregs_opt_t xtregs_opt;
- /* Make sure the areg field is 16 bytes aligned. */
- int align[0] __attribute__ ((aligned(16)));
-
/* current register frame.
* Note: The ESF for kernel exceptions ends after 16 registers!
*/
- unsigned long areg[XCHAL_NUM_AREGS];
+ unsigned long areg[XCHAL_NUM_AREGS] __aligned(16);
};
# define arch_has_single_step() (1)
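
The pt_regs change replaces the zero-length "int align[0] __attribute__((aligned(16)))" placeholder with __aligned(16) directly on areg[], forcing the same 16-byte alignment without the GNU zero-length-array idiom. A quick host-side check (GCC/Clang extensions assumed, struct shapes invented) that both spellings give the same member offset:

#include <stddef.h>
#include <stdio.h>

struct old_style {
	unsigned long before;
	int align[0] __attribute__((aligned(16)));	/* old placeholder trick */
	unsigned long areg[4];
};

struct new_style {
	unsigned long before;
	unsigned long areg[4] __attribute__((aligned(16)));	/* attribute on the member */
};

int main(void)
{
	printf("old areg offset: %zu\n", offsetof(struct old_style, areg));	/* 16 */
	printf("new areg offset: %zu\n", offsetof(struct new_style, areg));	/* 16 */
	return 0;
}
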
diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h
index 5ee974bf8330..7db3b489c8ad 100644
--- a/arch/xtensa/include/asm/syscall.h
+++ b/arch/xtensa/include/asm/syscall.h
@@ -28,6 +28,13 @@ static inline long syscall_get_nr(struct task_struct *task,
return regs->syscall;
}
+static inline void syscall_set_nr(struct task_struct *task,
+ struct pt_regs *regs,
+ int nr)
+{
+ regs->syscall = nr;
+}
+
static inline void syscall_rollback(struct task_struct *task,
struct pt_regs *regs)
{
@@ -68,6 +75,17 @@ static inline void syscall_get_arguments(struct task_struct *task,
args[i] = regs->areg[reg[i]];
}
+static inline void syscall_set_arguments(struct task_struct *task,
+ struct pt_regs *regs,
+ const unsigned long *args)
+{
+ static const unsigned int reg[] = XTENSA_SYSCALL_ARGUMENT_REGS;
+ unsigned int i;
+
+ for (i = 0; i < 6; ++i)
+ regs->areg[reg[i]] = args[i];
+}
+
asmlinkage long xtensa_rt_sigreturn(void);
asmlinkage long xtensa_shmat(int, char __user *, int);
asmlinkage long xtensa_fadvise64_64(int, int,
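
The new syscall_set_nr()/syscall_set_arguments() are the write-side mirrors of the existing getters: the number goes back into regs->syscall and the six arguments into the address registers named by XTENSA_SYSCALL_ARGUMENT_REGS, for generic callers (such as ptrace paths) that rewrite a syscall in flight. A toy model of that getter/setter symmetry; the struct and register numbers below are illustrative, not the real xtensa mapping.

#include <stdio.h>

struct toy_regs { unsigned long areg[16]; int syscall; };

static const unsigned int arg_reg[] = { 6, 3, 4, 5, 8, 9 };	/* invented mapping */

static void set_args(struct toy_regs *regs, const unsigned long *args)
{
	for (unsigned int i = 0; i < 6; i++)
		regs->areg[arg_reg[i]] = args[i];
}

static void get_args(const struct toy_regs *regs, unsigned long *args)
{
	for (unsigned int i = 0; i < 6; i++)
		args[i] = regs->areg[arg_reg[i]];
}

int main(void)
{
	struct toy_regs r = { { 0 }, 0 };
	unsigned long in[6] = { 1, 2, 3, 4, 5, 6 }, out[6];

	set_args(&r, in);
	get_args(&r, out);
	printf("%lu %lu\n", out[0], out[5]);	/* round-trips: 1 6 */
	return 0;
}
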