// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2023, Microsoft Corporation.
 *
 * Author:
 *   Roman Kisel
 *   Saurabh Sengar
 *   Naman Jain
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/eventfd.h>
#include <linux/cpuhotplug.h>
#include <linux/count_zeros.h>
#include <linux/entry-kvm.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/mtrr.h>
#include <asm/mshyperv.h>
#include <hyperv/hvhdk.h>

#include "../../kernel/fpu/legacy.h"
#include "mshv.h"
#include "mshv_vtl.h"
#include "hyperv_vmbus.h"

MODULE_AUTHOR("Microsoft");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver");

#define MSHV_ENTRY_REASON_LOWER_VTL_CALL	0x1
#define MSHV_ENTRY_REASON_INTERRUPT		0x2
#define MSHV_ENTRY_REASON_INTERCEPT		0x3

#define MSHV_REAL_OFF_SHIFT	16
#define MSHV_PG_OFF_CPU_MASK	(BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1)
#define MSHV_RUN_PAGE_OFFSET	0
#define MSHV_REG_PAGE_OFFSET	1
#define VTL2_VMBUS_SINT_INDEX	7

static struct device *mem_dev;

static struct tasklet_struct msg_dpc;
static wait_queue_head_t fd_wait_queue;
static bool has_message;
static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT];
static DEFINE_MUTEX(flag_lock);
static bool __read_mostly mshv_has_reg_page;

/* The hvcall code is a u16; allocate a bitmap of (1 << 16) bits to cover every code. */
#define MAX_BITMAP_SIZE	((U16_MAX + 1) / 8)

struct mshv_vtl_hvcall_fd {
	u8 allow_bitmap[MAX_BITMAP_SIZE];
	bool allow_map_initialized;
	/*
	 * Protects the hvcall setup done via ioctls.
	 */
	struct mutex init_mutex;
	struct miscdevice *dev;
};

struct mshv_vtl_poll_file {
	struct file *file;
	wait_queue_entry_t wait;
	wait_queue_head_t *wqh;
	poll_table pt;
	int cpu;
};

struct mshv_vtl {
	struct device *module_dev;
	u64 id;
};

struct mshv_vtl_per_cpu {
	struct mshv_vtl_run *run;
	struct page *reg_page;
};

/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */
union hv_synic_overlay_page_msr {
	u64 as_uint64;
	struct {
		u64 enabled: 1;
		u64 reserved: 11;
		u64 pfn: 52;
	} __packed;
};

static struct mutex mshv_vtl_poll_file_lock;
static union hv_register_vsm_page_offsets mshv_vsm_page_offsets;
static union hv_register_vsm_capabilities mshv_vsm_capabilities;

static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file);
static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions);
static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu);

static const union hv_input_vtl input_vtl_zero;
static const union hv_input_vtl input_vtl_normal = {
	.use_target_vtl = 1,
};

static const struct file_operations mshv_vtl_fops;

static long mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev)
{
	struct mshv_vtl *vtl;
	struct file *file;
	int fd;

	vtl = kzalloc(sizeof(*vtl), GFP_KERNEL);
	if (!vtl)
		return -ENOMEM;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		kfree(vtl);
		return fd;
	}

	file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, vtl, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		kfree(vtl);
		return PTR_ERR(file);
	}

	vtl->module_dev = module_dev;
	fd_install(fd, file);

	return fd;
}

static long mshv_ioctl_check_extension(void __user *user_arg)
{
	u32 arg;

	if (copy_from_user(&arg, user_arg, sizeof(arg)))
		return -EFAULT;

	switch (arg) {
	case MSHV_CAP_CORE_API_STABLE:
		return 0;
	case MSHV_CAP_REGISTER_PAGE:
		return mshv_has_reg_page;
	case MSHV_CAP_VTL_RETURN_ACTION:
		return mshv_vsm_capabilities.return_action_available;
	case MSHV_CAP_DR6_SHARED:
		return mshv_vsm_capabilities.dr6_shared;
	}

	return -EOPNOTSUPP;
}
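
/*
 * Ioctl dispatcher for /dev/mshv: MSHV_CHECK_EXTENSION reports optional
 * capabilities (register page, VTL return actions, shared DR6), and
 * MSHV_CREATE_VTL returns an anonymous-inode fd that exposes the per-VTL
 * interface implemented by mshv_vtl_fops below.
 */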
static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct miscdevice *misc = filp->private_data;

	switch (ioctl) {
	case MSHV_CHECK_EXTENSION:
		return mshv_ioctl_check_extension((void __user *)arg);
	case MSHV_CREATE_VTL:
		return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device);
	}

	return -ENOTTY;
}

static const struct file_operations mshv_dev_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= mshv_dev_ioctl,
	.llseek		= noop_llseek,
};

static struct miscdevice mshv_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "mshv",
	.fops	= &mshv_dev_fops,
	.mode	= 0600,
};

static struct mshv_vtl_run *mshv_vtl_this_run(void)
{
	return *this_cpu_ptr(&mshv_vtl_per_cpu.run);
}

static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu)
{
	return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu);
}

static struct page *mshv_vtl_cpu_reg_page(int cpu)
{
	return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu);
}

static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu)
{
	struct hv_register_assoc reg_assoc = {};
	union hv_synic_overlay_page_msr overlay = {};
	struct page *reg_page;

	reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL);
	if (!reg_page) {
		WARN(1, "failed to allocate register page\n");
		return;
	}

	overlay.enabled = 1;
	overlay.pfn = page_to_hvpfn(reg_page);
	reg_assoc.name = HV_X64_REGISTER_REG_PAGE;
	reg_assoc.value.reg64 = overlay.as_uint64;

	if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
				     1, input_vtl_zero, &reg_assoc)) {
		WARN(1, "failed to setup register page\n");
		__free_page(reg_page);
		return;
	}

	per_cpu->reg_page = reg_page;
	mshv_has_reg_page = true;
}

static void mshv_vtl_synic_enable_regs(unsigned int cpu)
{
	union hv_synic_sint sint;

	sint.as_uint64 = 0;
	sint.vector = HYPERVISOR_CALLBACK_VECTOR;
	sint.masked = false;
	sint.auto_eoi = hv_recommend_using_aeoi();

	/* Enable intercepts */
	if (!mshv_vsm_capabilities.intercept_page_available)
		hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX,
			   sint.as_uint64);

	/* The VTL2 host VSP SINT is (un)masked when user mode requests it. */
}

static int mshv_vtl_get_vsm_regs(void)
{
	struct hv_register_assoc registers[2];
	int ret, count = 2;

	registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS;
	registers[1].name = HV_REGISTER_VSM_CAPABILITIES;

	ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
				       count, input_vtl_zero, registers);
	if (ret)
		return ret;

	mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64;
	mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64;

	return ret;
}

static int mshv_vtl_configure_vsm_partition(struct device *dev)
{
	union hv_register_vsm_partition_config config;
	struct hv_register_assoc reg_assoc;

	config.as_uint64 = 0;
	config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK;
	config.enable_vtl_protection = 1;
	config.zero_memory_on_reset = 1;
	config.intercept_vp_startup = 1;
	config.intercept_cpuid_unimplemented = 1;

	if (mshv_vsm_capabilities.intercept_page_available) {
		dev_dbg(dev, "using intercept page\n");
		config.intercept_page = 1;
	}

	reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG;
	reg_assoc.value.reg64 = config.as_uint64;

	return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
					1, input_vtl_zero, &reg_assoc);
}
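
/*
 * Interrupt handler installed in place of vmbus_isr(): it first drains the
 * work owned by this driver (the VTL2 VMBus SINT message slot on CPU0 and
 * the per-SINT event flags, which are forwarded to user space via eventfds),
 * then chains into the regular VMBus interrupt handling.
 */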
static void mshv_vtl_vmbus_isr(void)
{
	struct hv_per_cpu_context *per_cpu;
	struct hv_message *msg;
	u32 message_type;
	union hv_synic_event_flags *event_flags;
	struct eventfd_ctx *eventfd;
	u16 i;

	per_cpu = this_cpu_ptr(hv_context.cpu_context);
	if (smp_processor_id() == 0) {
		msg = (struct hv_message *)per_cpu->hyp_synic_message_page +
			VTL2_VMBUS_SINT_INDEX;
		message_type = READ_ONCE(msg->header.message_type);
		if (message_type != HVMSG_NONE)
			tasklet_schedule(&msg_dpc);
	}

	event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page +
			VTL2_VMBUS_SINT_INDEX;
	for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) {
		if (!sync_test_and_clear_bit(i, event_flags->flags))
			continue;
		rcu_read_lock();
		eventfd = READ_ONCE(flag_eventfds[i]);
		if (eventfd)
			eventfd_signal(eventfd);
		rcu_read_unlock();
	}

	vmbus_isr();
}

static int mshv_vtl_alloc_context(unsigned int cpu)
{
	struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu);

	per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
	if (!per_cpu->run)
		return -ENOMEM;

	if (mshv_vsm_capabilities.intercept_page_available)
		mshv_vtl_configure_reg_page(per_cpu);

	mshv_vtl_synic_enable_regs(cpu);

	return 0;
}

static int mshv_vtl_cpuhp_online;

static int hv_vtl_setup_synic(void)
{
	int ret;

	/* Use our isr to first filter out packets destined for userspace */
	hv_setup_vmbus_handler(mshv_vtl_vmbus_isr);

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online",
				mshv_vtl_alloc_context, NULL);
	if (ret < 0) {
		hv_setup_vmbus_handler(vmbus_isr);
		return ret;
	}

	mshv_vtl_cpuhp_online = ret;

	return 0;
}

static void hv_vtl_remove_synic(void)
{
	cpuhp_remove_state(mshv_vtl_cpuhp_online);
	hv_setup_vmbus_handler(vmbus_isr);
}

static int vtl_get_vp_register(struct hv_register_assoc *reg)
{
	return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
					1, input_vtl_normal, reg);
}

static int vtl_set_vp_register(struct hv_register_assoc *reg)
{
	return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF,
					1, input_vtl_normal, reg);
}
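
/*
 * MSHV_ADD_VTL0_MEMORY: registers a VTL0 RAM range [start_pfn, last_pfn)
 * with the kernel via devm_memremap_pages() so that the VTL0 pages get
 * struct page backing (e.g. for mapping through /dev/mshv_vtl_low). The
 * mapping is never torn down; VTL0 memory stays mapped for the lifetime
 * of VTL2.
 */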
static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg)
{
	struct mshv_vtl_ram_disposition vtl0_mem;
	struct dev_pagemap *pgmap;
	void *addr;

	if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem)))
		return -EFAULT;

	/* vtl0_mem.last_pfn is excluded from the pagemap range for VTL0 as per design. */
	if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) {
		dev_err(vtl->module_dev, "range start pfn (%llx) > end pfn (%llx)\n",
			vtl0_mem.start_pfn, vtl0_mem.last_pfn);
		return -EFAULT;
	}

	pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return -ENOMEM;

	pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn);
	pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1;
	pgmap->nr_range = 1;
	pgmap->type = MEMORY_DEVICE_GENERIC;

	/*
	 * Determine the highest page order that can be used for the given memory range.
	 * This works best when the range is aligned, i.e. both the start and the length.
	 */
	pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn);
	dev_dbg(vtl->module_dev,
		"Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n",
		vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift);

	addr = devm_memremap_pages(mem_dev, pgmap);
	if (IS_ERR(addr)) {
		dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr));
		kfree(pgmap);
		return -EFAULT;
	}

	/*
	 * Don't free pgmap, since it has to stick around until the memory
	 * is unmapped, which will never happen as there is no scenario
	 * where VTL0 can be released/shutdown without bringing down VTL2.
	 */
	return 0;
}

static void mshv_vtl_cancel(int cpu)
{
	int here = get_cpu();

	if (here != cpu) {
		if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1))
			smp_send_reschedule(cpu);
	} else {
		WRITE_ONCE(mshv_vtl_this_run()->cancel, 1);
	}
	put_cpu();
}

static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
{
	struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait);

	mshv_vtl_cancel(poll_file->cpu);

	return 0;
}

static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
{
	struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt);

	WARN_ON(poll_file->wqh);

	poll_file->wqh = wqh;
	add_wait_queue(wqh, &poll_file->wait);
}

static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input)
{
	struct file *file, *old_file;
	struct mshv_vtl_poll_file *poll_file;
	struct mshv_vtl_set_poll_file input;

	if (copy_from_user(&input, user_input, sizeof(input)))
		return -EFAULT;

	if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu))
		return -EINVAL;
	/*
	 * CPU hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
	 * The CPU is expected to remain online after the cpu_online() check above.
	 */

	file = fget(input.fd);
	if (!file)
		return -EBADFD;

	poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu));
	if (!poll_file)
		return -EINVAL;

	mutex_lock(&mshv_vtl_poll_file_lock);

	if (poll_file->wqh)
		remove_wait_queue(poll_file->wqh, &poll_file->wait);
	poll_file->wqh = NULL;

	old_file = poll_file->file;
	poll_file->file = file;
	poll_file->cpu = input.cpu;

	if (file) {
		init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake);
		init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc);
		vfs_poll(file, &poll_file->pt);
	}

	mutex_unlock(&mshv_vtl_poll_file_lock);

	if (old_file)
		fput(old_file);

	return 0;
}
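
/*
 * Register accesses that can be served directly on the local CPU instead of
 * going through a get/set VP registers hypercall: the debug registers
 * (DR0-DR3, and DR6 when the hypervisor reports it as shared) and the MTRR
 * MSRs. Anything not listed in the table below falls back to the hypercall
 * path in the get/set register ioctls.
 */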
/* Static table mapping register names to their corresponding actions */
static const struct {
	enum hv_register_name reg_name;
	int debug_reg_num;	/* -1 if not a debug register */
	u32 msr_addr;		/* 0 if not an MSR */
} reg_table[] = {
	/* Debug registers */
	{HV_X64_REGISTER_DR0, 0, 0},
	{HV_X64_REGISTER_DR1, 1, 0},
	{HV_X64_REGISTER_DR2, 2, 0},
	{HV_X64_REGISTER_DR3, 3, 0},
	{HV_X64_REGISTER_DR6, 6, 0},
	/* MTRR MSRs */
	{HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap},
	{HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)},
	{HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)},
	{HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000},
	{HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000},
	{HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000},
	{HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000},
};

static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
{
	u64 *reg64;
	enum hv_register_name gpr_name;
	int i;

	gpr_name = regs->name;
	reg64 = &regs->value.reg64;

	/* Search for the register in the table */
	for (i = 0; i < ARRAY_SIZE(reg_table); i++) {
		if (reg_table[i].reg_name != gpr_name)
			continue;

		if (reg_table[i].debug_reg_num != -1) {
			/* Handle debug registers */
			if (gpr_name == HV_X64_REGISTER_DR6 &&
			    !mshv_vsm_capabilities.dr6_shared)
				goto hypercall;

			if (set)
				native_set_debugreg(reg_table[i].debug_reg_num, *reg64);
			else
				*reg64 = native_get_debugreg(reg_table[i].debug_reg_num);
		} else {
			/* Handle MSRs */
			if (set)
				wrmsrl(reg_table[i].msr_addr, *reg64);
			else
				rdmsrl(reg_table[i].msr_addr, *reg64);
		}

		return 0;
	}

hypercall:
	return 1;
}
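
/*
 * Switch this CPU to the lower VTL (VTL0). If the hypervisor supports VTL
 * return actions, any actions queued in the run page are first copied into
 * the VP assist page so the hypervisor can apply them as part of the return.
 */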
static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
{
	struct hv_vp_assist_page *hvp;

	hvp = hv_vp_assist_page[smp_processor_id()];

	/*
	 * Process any signal events that were set directly in the run page.
	 */
	if (mshv_vsm_capabilities.return_action_available) {
		u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);

		WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);

		/*
		 * The hypervisor will take care of clearing out the actions
		 * set in the assist page.
		 */
		memcpy(hvp->vtl_ret_actions,
		       mshv_vtl_this_run()->vtl_ret_actions,
		       min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
	}

	mshv_vtl_return_call(vtl0);
}

static bool mshv_vtl_process_intercept(void)
{
	struct hv_per_cpu_context *mshv_cpu;
	void *synic_message_page;
	struct hv_message *msg;
	u32 message_type;

	mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
	synic_message_page = mshv_cpu->hyp_synic_message_page;
	if (unlikely(!synic_message_page))
		return true;

	msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
	message_type = READ_ONCE(msg->header.message_type);
	if (message_type == HVMSG_NONE)
		return true;

	memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
	vmbus_signal_eom(msg, message_type);

	return false;
}

static int mshv_vtl_ioctl_return_to_lower_vtl(void)
{
	preempt_disable();
	for (;;) {
		unsigned long irq_flags;
		struct hv_vp_assist_page *hvp;
		int ret;

		if (__xfer_to_guest_mode_work_pending()) {
			preempt_enable();
			ret = xfer_to_guest_mode_handle_work();
			if (ret)
				return ret;
			preempt_disable();
		}

		local_irq_save(irq_flags);
		if (READ_ONCE(mshv_vtl_this_run()->cancel)) {
			local_irq_restore(irq_flags);
			preempt_enable();
			return -EINTR;
		}

		mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
		local_irq_restore(irq_flags);

		hvp = hv_vp_assist_page[smp_processor_id()];
		this_cpu_inc(num_vtl0_transitions);
		switch (hvp->vtl_entry_reason) {
		case MSHV_ENTRY_REASON_INTERRUPT:
			if (!mshv_vsm_capabilities.intercept_page_available &&
			    likely(!mshv_vtl_process_intercept()))
				goto done;
			break;

		case MSHV_ENTRY_REASON_INTERCEPT:
			WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
			memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
			       sizeof(hvp->intercept_message));
			goto done;

		default:
			panic("unknown entry reason: %d", hvp->vtl_entry_reason);
		}
	}

done:
	preempt_enable();

	return 0;
}
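
/*
 * MSHV_GET_VP_REGISTERS / MSHV_SET_VP_REGISTERS: each call operates on
 * exactly one register. Registers listed in reg_table are accessed directly
 * on the current CPU; everything else is forwarded to the hypervisor via a
 * get/set VP registers hypercall targeting the lower VTL.
 */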
static long mshv_vtl_ioctl_get_regs(void __user *user_args)
{
	struct mshv_vp_registers args;
	struct hv_register_assoc reg;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* This IOCTL supports processing only one register at a time. */
	if (args.count != 1)
		return -EINVAL;

	if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
		return -EFAULT;

	ret = mshv_vtl_get_set_reg(&reg, false);
	if (!ret)
		goto copy_args; /* No need for a hypercall */

	ret = vtl_get_vp_register(&reg);
	if (ret)
		return ret;

copy_args:
	if (copy_to_user((void __user *)args.regs_ptr, &reg, sizeof(reg)))
		ret = -EFAULT;

	return ret;
}

static long mshv_vtl_ioctl_set_regs(void __user *user_args)
{
	struct mshv_vp_registers args;
	struct hv_register_assoc reg;
	long ret;

	if (copy_from_user(&args, user_args, sizeof(args)))
		return -EFAULT;

	/* This IOCTL supports processing only one register at a time. */
	if (args.count != 1)
		return -EINVAL;

	if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
		return -EFAULT;

	ret = mshv_vtl_get_set_reg(&reg, true);
	if (!ret)
		return ret; /* No need for a hypercall */

	ret = vtl_set_vp_register(&reg);

	return ret;
}

static long mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	long ret;
	struct mshv_vtl *vtl = filp->private_data;

	switch (ioctl) {
	case MSHV_SET_POLL_FILE:
		ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg);
		break;
	case MSHV_GET_VP_REGISTERS:
		ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
		break;
	case MSHV_SET_VP_REGISTERS:
		ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
		break;
	case MSHV_RETURN_TO_LOWER_VTL:
		ret = mshv_vtl_ioctl_return_to_lower_vtl();
		break;
	case MSHV_ADD_VTL0_MEMORY:
		ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
		break;
	default:
		dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
		ret = -ENOTTY;
	}

	return ret;
}

/*
 * The mmap page offset encodes the target page: the low MSHV_REAL_OFF_SHIFT
 * bits select the CPU, and the bits above select the page type
 * (MSHV_RUN_PAGE_OFFSET or MSHV_REG_PAGE_OFFSET).
 */
static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
{
	struct page *page;
	int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
	int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;

	if (!cpu_online(cpu))
		return VM_FAULT_SIGBUS;
	/*
	 * CPU hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
	 * The CPU is expected to remain online after the cpu_online() check above.
	 */

	if (real_off == MSHV_RUN_PAGE_OFFSET) {
		page = virt_to_page(mshv_vtl_cpu_run(cpu));
	} else if (real_off == MSHV_REG_PAGE_OFFSET) {
		if (!mshv_has_reg_page)
			return VM_FAULT_SIGBUS;
		page = mshv_vtl_cpu_reg_page(cpu);
	} else {
		return VM_FAULT_NOPAGE;
	}

	get_page(page);
	vmf->page = page;

	return 0;
}

static const struct vm_operations_struct mshv_vtl_vm_ops = {
	.fault = mshv_vtl_fault,
};

static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_ops = &mshv_vtl_vm_ops;

	return 0;
}

static int mshv_vtl_release(struct inode *inode, struct file *filp)
{
	struct mshv_vtl *vtl = filp->private_data;

	kfree(vtl);

	return 0;
}

static const struct file_operations mshv_vtl_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= mshv_vtl_ioctl,
	.release	= mshv_vtl_release,
	.mmap		= mshv_vtl_mmap,
};

static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
{
	union hv_synic_sint sint;

	sint.as_uint64 = 0;
	sint.vector = HYPERVISOR_CALLBACK_VECTOR;
	sint.masked = (*mask != 0);
	sint.auto_eoi = hv_recommend_using_aeoi();

	hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX, sint.as_uint64);

	if (!sint.masked)
		pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
	else
		pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
}

static void mshv_vtl_read_remote(void *buffer)
{
	struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
	struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page +
					VTL2_VMBUS_SINT_INDEX;
	u32 message_type = READ_ONCE(msg->header.message_type);

	WRITE_ONCE(has_message, false);
	if (message_type == HVMSG_NONE)
		return;

	memcpy(buffer, msg, sizeof(*msg));
	vmbus_signal_eom(msg, message_type);
}

static bool vtl_synic_mask_vmbus_sint_masked = true;
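
/*
 * /dev/mshv_sint read(): returns one VMBus message from the VTL2 VMBus SINT.
 * The message slot lives on the SynIC page of VMBUS_CONNECT_CPU, so the slot
 * is drained there via smp_call_function_single(). A read returns 0 (EOF)
 * once the SINT has been masked via MSHV_SINT_PAUSE_MESSAGE_STREAM.
 */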
static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
{
	struct hv_message msg = {};
	int ret;

	if (size < sizeof(msg))
		return -EINVAL;

	for (;;) {
		smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true);
		if (msg.header.message_type != HVMSG_NONE)
			break;

		if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
			return 0; /* EOF */

		if (filp->f_flags & O_NONBLOCK)
			return -EAGAIN;

		ret = wait_event_interruptible(fd_wait_queue,
					       READ_ONCE(has_message) ||
					       READ_ONCE(vtl_synic_mask_vmbus_sint_masked));
		if (ret)
			return ret;
	}

	if (copy_to_user(arg, &msg, sizeof(msg)))
		return -EFAULT;

	return sizeof(msg);
}

static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask = 0;

	poll_wait(filp, &fd_wait_queue, wait);
	if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

static void mshv_vtl_sint_on_msg_dpc(unsigned long data)
{
	WRITE_ONCE(has_message, true);
	wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);
}

static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg)
{
	struct mshv_vtl_sint_post_msg message;
	u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT];

	if (copy_from_user(&message, arg, sizeof(message)))
		return -EFAULT;
	if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT)
		return -EINVAL;
	if (copy_from_user(payload, (void __user *)message.payload_ptr, message.payload_size))
		return -EFAULT;

	return hv_post_message((union hv_connection_id)message.connection_id,
			       message.message_type, (void *)payload, message.payload_size);
}

static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg)
{
	u64 input, status;
	struct mshv_vtl_signal_event signal_event;

	if (copy_from_user(&signal_event, arg, sizeof(signal_event)))
		return -EFAULT;

	input = signal_event.connection_id | ((u64)signal_event.flag << 32);

	status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input);

	return hv_result_to_errno(status);
}

static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg)
{
	struct mshv_vtl_set_eventfd set_eventfd;
	struct eventfd_ctx *eventfd, *old_eventfd;

	if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd)))
		return -EFAULT;
	if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT)
		return -EINVAL;

	eventfd = NULL;
	if (set_eventfd.fd >= 0) {
		eventfd = eventfd_ctx_fdget(set_eventfd.fd);
		if (IS_ERR(eventfd))
			return PTR_ERR(eventfd);
	}

	guard(mutex)(&flag_lock);
	old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]);
	WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd);

	if (old_eventfd) {
		synchronize_rcu();
		eventfd_ctx_put(old_eventfd);
	}

	return 0;
}

static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg)
{
	static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex);
	struct mshv_sint_mask mask;

	if (copy_from_user(&mask, arg, sizeof(mask)))
		return -EFAULT;

	guard(mutex)(&vtl2_vmbus_sint_mask_mutex);
	on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1);
	WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0);
	if (mask.mask)
		wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN);

	return 0;
}

static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case MSHV_SINT_POST_MESSAGE:
		return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg);
	case MSHV_SINT_SIGNAL_EVENT:
		return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg);
	case MSHV_SINT_SET_EVENTFD:
		return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg);
	case MSHV_SINT_PAUSE_MESSAGE_STREAM:
		return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

static const struct file_operations mshv_vtl_sint_ops = {
	.owner		= THIS_MODULE,
	.read		= mshv_vtl_sint_read,
	.poll		= mshv_vtl_sint_poll,
	.unlocked_ioctl	= mshv_vtl_sint_ioctl,
};

static struct miscdevice mshv_vtl_sint_dev = {
	.name	= "mshv_sint",
	.fops	= &mshv_vtl_sint_ops,
	.mode	= 0600,
	.minor	= MISC_DYNAMIC_MINOR,
};
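
/*
 * /dev/mshv_hvcall: lets a privileged (CAP_SYS_ADMIN) user-space process
 * issue hypercalls directly. Every hypercall code is denied by default; the
 * caller must install an allow bitmap once via MSHV_HVCALL_SETUP before any
 * MSHV_HVCALL request is accepted.
 */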
static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f)
{
	struct miscdevice *dev = f->private_data;
	struct mshv_vtl_hvcall_fd *fd;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	fd = vzalloc(sizeof(*fd));
	if (!fd)
		return -ENOMEM;
	fd->dev = dev;
	f->private_data = fd;
	mutex_init(&fd->init_mutex);

	return 0;
}

static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f)
{
	struct mshv_vtl_hvcall_fd *fd;

	fd = f->private_data;
	if (fd) {
		vfree(fd);
		f->private_data = NULL;
	}

	return 0;
}

static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd,
				    struct mshv_vtl_hvcall_setup __user *hvcall_setup_user)
{
	struct mshv_vtl_hvcall_setup hvcall_setup;

	guard(mutex)(&fd->init_mutex);

	if (fd->allow_map_initialized) {
		dev_err(fd->dev->this_device,
			"Hypercall allow map has already been set, pid %d\n",
			current->pid);
		return -EINVAL;
	}

	if (copy_from_user(&hvcall_setup, hvcall_setup_user,
			   sizeof(struct mshv_vtl_hvcall_setup))) {
		return -EFAULT;
	}
	if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap))
		return -EINVAL;

	if (copy_from_user(&fd->allow_bitmap,
			   (void __user *)hvcall_setup.allow_bitmap_ptr,
			   hvcall_setup.bitmap_array_size)) {
		return -EFAULT;
	}

	dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n",
		 current->pid);
	fd->allow_map_initialized = true;

	return 0;
}

static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code)
{
	return test_bit(call_code, (unsigned long *)fd->allow_bitmap);
}
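
/*
 * MSHV_HVCALL: input and output are bounced through freshly allocated kernel
 * pages (at most one hypervisor page each), the hypercall is issued with
 * hv_do_hypercall(), and the hypervisor status is reported back to user
 * space along with the output data.
 */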
static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd,
				struct mshv_vtl_hvcall __user *hvcall_user)
{
	struct mshv_vtl_hvcall hvcall;
	void *in, *out;
	int ret;

	if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall)))
		return -EFAULT;
	if (hvcall.input_size > HV_HYP_PAGE_SIZE)
		return -EINVAL;
	if (hvcall.output_size > HV_HYP_PAGE_SIZE)
		return -EINVAL;

	/*
	 * By default, all hypercalls are not allowed.
	 * The user mode code has to set up the allow bitmap once.
	 */
	if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) {
		dev_err(fd->dev->this_device,
			"Hypercall with control data %#llx isn't allowed\n",
			hvcall.control);
		return -EPERM;
	}

	/*
	 * This may create a problem for the Confidential VM (CVM) use case, where the
	 * Hyper-V driver's per-cpu input and output pages (hyperv_pcpu_input_arg and
	 * hyperv_pcpu_output_arg) must be used for making a hypervisor call.
	 *
	 * TODO: Take care of this when CVM support is added.
	 */
	in = (void *)__get_free_page(GFP_KERNEL);
	out = (void *)__get_free_page(GFP_KERNEL);
	if (!in || !out) {
		ret = -ENOMEM;
		goto free_pages;
	}

	if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
		ret = -EFAULT;
		goto free_pages;
	}

	hvcall.status = hv_do_hypercall(hvcall.control, in, out);

	if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
		ret = -EFAULT;
		goto free_pages;
	}
	ret = put_user(hvcall.status, &hvcall_user->status);

free_pages:
	free_page((unsigned long)in);
	free_page((unsigned long)out);
	return ret;
}

static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct mshv_vtl_hvcall_fd *fd = f->private_data;

	switch (cmd) {
	case MSHV_HVCALL_SETUP:
		return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg);
	case MSHV_HVCALL:
		return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg);
	default:
		break;
	}

	return -ENOIOCTLCMD;
}

static const struct file_operations mshv_vtl_hvcall_dev_file_ops = {
	.owner		= THIS_MODULE,
	.open		= mshv_vtl_hvcall_dev_open,
	.release	= mshv_vtl_hvcall_dev_release,
	.unlocked_ioctl	= mshv_vtl_hvcall_dev_ioctl,
};

static struct miscdevice mshv_vtl_hvcall_dev = {
	.name		= "mshv_hvcall",
	.nodename	= "mshv_hvcall",
	.fops		= &mshv_vtl_hvcall_dev_file_ops,
	.mode		= 0600,
	.minor		= MISC_DYNAMIC_MINOR,
};

static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
{
	pid_t pid = task_pid_vnr(current);
	uid_t uid = current_uid().val;
	int ret = 0;

	pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid);

	if (capable(CAP_SYS_ADMIN)) {
		filp->private_data = inodep;
	} else {
		pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d\n",
		       __func__, pid, uid);
		ret = -EPERM;
	}

	return ret;
}

static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn)
{
	unsigned long mask = size - 1;
	unsigned long start = vmf->address & ~mask;
	unsigned long end = start + size;
	bool is_valid;

	is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) &&
		   start >= vmf->vma->vm_start &&
		   end <= vmf->vma->vm_end;

	if (is_valid)
		*pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT);

	return is_valid;
}

static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
{
	unsigned long pfn = vmf->pgoff;
	vm_fault_t ret = VM_FAULT_FALLBACK;

	switch (order) {
	case 0:
		return vmf_insert_mixed(vmf->vma, vmf->address, pfn);

	case PMD_ORDER:
		if (can_fault(vmf, PMD_SIZE, &pfn))
			ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
		return ret;

	case PUD_ORDER:
		if (can_fault(vmf, PUD_SIZE, &pfn))
			ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
		return ret;

	default:
		return VM_FAULT_SIGBUS;
	}
}

static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
{
	return mshv_vtl_low_huge_fault(vmf, 0);
}

static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
	.fault		= mshv_vtl_low_fault,
	.huge_fault	= mshv_vtl_low_huge_fault,
};

static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_ops = &mshv_vtl_low_vm_ops;
	vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);

	return 0;
}

static const struct file_operations mshv_vtl_low_file_ops = {
	.owner	= THIS_MODULE,
	.open	= mshv_vtl_low_open,
	.mmap	= mshv_vtl_low_mmap,
};

static struct miscdevice mshv_vtl_low = {
	.name		= "mshv_vtl_low",
	.nodename	= "mshv_vtl_low",
	.fops		= &mshv_vtl_low_file_ops,
	.mode		= 0600,
	.minor		= MISC_DYNAMIC_MINOR,
};
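
/*
 * Module init: registers /dev/mshv, queries the VSM code page offsets and
 * capabilities, configures VSM for the lower-VTL partition, installs the
 * VTL-aware SynIC/VMBus interrupt handling, and then registers the auxiliary
 * devices (mshv_sint, mshv_hvcall, mshv_vtl_low) plus the backing device
 * used for mapping VTL0 memory.
 */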
static int __init mshv_vtl_init(void)
{
	int ret;
	struct device *dev;

	/*
	 * This creates /dev/mshv, which provides the functionality to create VTLs and partitions.
	 */
	ret = misc_register(&mshv_dev);
	if (ret) {
		pr_err("mshv device register failed: %d\n", ret);
		return ret;
	}
	dev = mshv_dev.this_device;

	tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
	init_waitqueue_head(&fd_wait_queue);

	if (mshv_vtl_get_vsm_regs()) {
		dev_emerg(dev, "Unable to get VSM capabilities!\n");
		ret = -ENODEV;
		goto free_dev;
	}

	if (mshv_vtl_configure_vsm_partition(dev)) {
		dev_emerg(dev, "VSM configuration failed!\n");
		ret = -ENODEV;
		goto free_dev;
	}

	mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset);

	ret = hv_vtl_setup_synic();
	if (ret)
		goto free_dev;

	/*
	 * The mshv_sint device adds VMBus relay ioctl support.
	 * This provides a channel for VTL0 to communicate with VTL2.
	 */
	ret = misc_register(&mshv_vtl_sint_dev);
	if (ret)
		goto free_synic;

	/*
	 * The mshv_hvcall device adds an interface for user space to issue hypercalls directly.
	 */
	ret = misc_register(&mshv_vtl_hvcall_dev);
	if (ret)
		goto free_sint;

	/*
	 * The mshv_vtl_low device is used to map the VTL0 address space into a user-mode
	 * process in VTL2; it implements mmap() for that purpose.
	 */
	ret = misc_register(&mshv_vtl_low);
	if (ret)
		goto free_hvcall;

	/*
	 * The "mshv vtl mem dev" device is later used to set up VTL0 memory.
	 */
	mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL);
	if (!mem_dev) {
		ret = -ENOMEM;
		goto free_low;
	}

	mutex_init(&mshv_vtl_poll_file_lock);

	device_initialize(mem_dev);
	dev_set_name(mem_dev, "mshv vtl mem dev");
	ret = device_add(mem_dev);
	if (ret) {
		dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
		goto free_mem;
	}

	return 0;

free_mem:
	kfree(mem_dev);
free_low:
	misc_deregister(&mshv_vtl_low);
free_hvcall:
	misc_deregister(&mshv_vtl_hvcall_dev);
free_sint:
	misc_deregister(&mshv_vtl_sint_dev);
free_synic:
	hv_vtl_remove_synic();
free_dev:
	misc_deregister(&mshv_dev);
	return ret;
}

static void __exit mshv_vtl_exit(void)
{
	device_del(mem_dev);
	kfree(mem_dev);
	misc_deregister(&mshv_vtl_low);
	misc_deregister(&mshv_vtl_hvcall_dev);
	misc_deregister(&mshv_vtl_sint_dev);
	hv_vtl_remove_synic();
	misc_deregister(&mshv_dev);
}

module_init(mshv_vtl_init);
module_exit(mshv_vtl_exit);