Diffstat (limited to 'drivers/hv/mshv_root_main.c')
-rw-r--r--   drivers/hv/mshv_root_main.c   778
1 file changed, 416 insertions, 362 deletions
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index e3b2bd417c46..1134a82c7881 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -29,6 +29,7 @@ #include <linux/crash_dump.h> #include <linux/panic_notifier.h> #include <linux/vmalloc.h> +#include <linux/rseq.h> #include "mshv_eventfd.h" #include "mshv.h" @@ -41,7 +42,7 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); /* TODO move this to another file when debugfs code is added */ enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ #if defined(CONFIG_X86) - VpRootDispatchThreadBlocked = 201, + VpRootDispatchThreadBlocked = 202, #elif defined(CONFIG_ARM64) VpRootDispatchThreadBlocked = 94, #endif @@ -122,6 +123,7 @@ static struct miscdevice mshv_dev = { */ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, HVCALL_SET_PARTITION_PROPERTY, HVCALL_INSTALL_INTERCEPT, HVCALL_GET_VP_REGISTERS, @@ -136,6 +138,16 @@ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_VP_CPUID_VALUES, }; +/* + * Only allow hypercalls that are safe to be called by the VMM with the host + * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a + * hypercall cannot be misused by the VMM before adding it to this list. + */ +static u16 mshv_self_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, +}; + static bool mshv_hvcall_is_async(u16 code) { switch (code) { @@ -147,18 +159,38 @@ static bool mshv_hvcall_is_async(u16 code) return false; } +static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) +{ + int i; + int n = ARRAY_SIZE(mshv_passthru_hvcalls); + u16 *allowed_hvcalls = mshv_passthru_hvcalls; + + if (pt_id == HV_PARTITION_ID_SELF) { + n = ARRAY_SIZE(mshv_self_passthru_hvcalls); + allowed_hvcalls = mshv_self_passthru_hvcalls; + } + + for (i = 0; i < n; ++i) + if (allowed_hvcalls[i] == code) + return true; + + return false; +} + static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, bool partition_locked, void __user *user_args) { u64 status; - int ret = 0, i; + int ret = 0; bool is_async; struct mshv_root_hvcall args; struct page *page; unsigned int pages_order; void *input_pg = NULL; void *output_pg = NULL; + u16 reps_completed; + u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; @@ -170,17 +202,13 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) - if (args.code == mshv_passthru_hvcalls[i]) - break; - - if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) + if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) return -EINVAL; is_async = mshv_hvcall_is_async(args.code); if (is_async) { /* async hypercalls can only be called from partition fd */ - if (!partition_locked) + if (!partition || !partition_locked) return -EINVAL; ret = mshv_init_async_handler(partition); if (ret) @@ -208,43 +236,44 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, * NOTE: This only works because all the allowed hypercalls' input * structs begin with a u64 partition_id field. 
*/ - *(u64 *)input_pg = partition->pt_id; + *(u64 *)input_pg = pt_id; - if (args.reps) - status = hv_do_rep_hypercall(args.code, args.reps, 0, - input_pg, output_pg); - else - status = hv_do_hypercall(args.code, input_pg, output_pg); - - if (hv_result(status) == HV_STATUS_CALL_PENDING) { - if (is_async) { - mshv_async_hvcall_handler(partition, &status); - } else { /* Paranoia check. This shouldn't happen! */ - ret = -EBADFD; - goto free_pages_out; + reps_completed = 0; + do { + if (args.reps) { + status = hv_do_rep_hypercall_ex(args.code, args.reps, + 0, reps_completed, + input_pg, output_pg); + reps_completed = hv_repcomp(status); + } else { + status = hv_do_hypercall(args.code, input_pg, output_pg); } - } - if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); - if (!ret) - ret = -EAGAIN; - } else if (!hv_result_success(status)) { - ret = hv_result_to_errno(status); - } + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! */ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) + ret = hv_result_to_errno(status); + else + ret = hv_call_deposit_pages(NUMA_NO_NODE, + pt_id, 1); + } while (!ret); - /* - * Always return the status and output data regardless of result. - * The VMM may need it to determine how to proceed. E.g. the status may - * contain the number of reps completed if a rep hypercall partially - * succeeded. - */ args.status = hv_result(status); - args.reps = args.reps ? hv_repcomp(status) : 0; + args.reps = reps_completed; if (copy_to_user(user_args, &args, sizeof(args))) ret = -EFAULT; - if (output_pg && + if (!ret && output_pg && copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) ret = -EFAULT; @@ -560,20 +589,106 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) } } while (!vp->run.flags.intercept_suspend); + rseq_virt_userspace_exit(); + return ret; } static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +#ifdef CONFIG_X86_64 +static struct mshv_mem_region * +mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) +{ + struct mshv_mem_region *region; + + spin_lock(&p->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(p, gfn); + if (!region || !mshv_region_get(region)) { + spin_unlock(&p->pt_mem_regions_lock); + return NULL; + } + spin_unlock(&p->pt_mem_regions_lock); + + return region; +} + +/** + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. + * @vp: Pointer to the virtual processor structure. + * + * This function processes GPA intercepts by identifying the memory region + * corresponding to the intercepted GPA, aligning the page offset, and + * mapping the required pages. It ensures that the region is valid and + * handles faults efficiently by mapping multiple pages at once. + * + * Return: true if the intercept was handled successfully, false otherwise. 
+ */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + struct mshv_mem_region *region; + struct hv_x64_memory_intercept_message *msg; + bool ret; + u64 gfn; + + msg = (struct hv_x64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; + + gfn = HVPFN_DOWN(msg->guest_physical_address); + + region = mshv_partition_region_by_gfn_get(p, gfn); + if (!region) + return false; + + /* Only movable memory ranges are supported for GPA intercepts */ + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + ret = mshv_region_handle_gfn_fault(region, gfn); + else + ret = false; + + mshv_region_put(region); + + return ret; +} +#else /* CONFIG_X86_64 */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; } +#endif /* CONFIG_X86_64 */ + +static bool mshv_vp_handle_intercept(struct mshv_vp *vp) +{ + switch (vp->vp_intercept_msg_page->header.message_type) { + case HVMSG_GPA_INTERCEPT: + return mshv_handle_gpa_intercept(vp); + } + return false; +} + static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) { long rc; - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - rc = mshv_run_vp_with_root_scheduler(vp); - else - rc = mshv_run_vp_with_hyp_scheduler(vp); + do { + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + } while (rc == 0 && mshv_vp_handle_intercept(vp)); if (rc) return rc; @@ -841,7 +956,8 @@ mshv_vp_release(struct inode *inode, struct file *filp) return 0; } -static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + void *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -849,10 +965,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) }; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); } static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, @@ -865,14 +981,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, int err; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_SELF]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); if (err) return err; identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_PARENT]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_PARENT]); if (err) goto unmap_self; @@ -880,7 +996,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, unmap_self: identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); return err; } @@ -890,7 +1006,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, { struct mshv_create_vp args; struct mshv_vp *vp; - struct page *intercept_message_page, *register_page, *ghcb_page; + struct page *intercept_msg_page, *register_page, *ghcb_page; void *stats_pages[2]; long ret; @@ -908,33 +1024,34 @@ 
mshv_partition_ioctl_create_vp(struct mshv_partition *partition, if (ret) return ret; - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero, - &intercept_message_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, &intercept_msg_page); if (ret) goto destroy_vp; if (!mshv_partition_encrypted(partition)) { - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero, - ®ister_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, ®ister_page); if (ret) goto unmap_intercept_message_page; } if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal, - &ghcb_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, &ghcb_page); if (ret) goto unmap_register_page; } - if (hv_parent_partition()) { + /* + * This mapping of the stats page is for detecting if dispatch thread + * is blocked - only relevant for root scheduler + */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) { ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, stats_pages); if (ret) @@ -956,14 +1073,14 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, atomic64_set(&vp->run.vp_signaled_count, 0); vp->vp_index = args.vp_index; - vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); + vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); if (!mshv_partition_encrypted(partition)) vp->vp_register_page = page_to_virt(register_page); if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) vp->vp_ghcb_page = page_to_virt(ghcb_page); - if (hv_parent_partition()) + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); /* @@ -986,24 +1103,22 @@ put_partition: free_vp: kfree(vp); unmap_stats_pages: - if (hv_parent_partition()) - mshv_vp_stats_unmap(partition->pt_id, args.vp_index); + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); unmap_ghcb_page: - if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal); - } + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, ghcb_page, + input_vtl_normal); unmap_register_page: - if (!mshv_partition_encrypted(partition)) { - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero); - } + if (!mshv_partition_encrypted(partition)) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + register_page, input_vtl_zero); unmap_intercept_message_page: - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero); + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + intercept_msg_page, input_vtl_zero); destroy_vp: hv_call_delete_vp(partition->pt_id, args.vp_index); return ret; @@ -1031,162 +1146,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status) *status = 
partition->async_hypercall_status; } -static int -mshv_partition_region_share(struct mshv_mem_region *region) -{ - u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; - - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; - - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, - flags, true); -} - -static int -mshv_partition_region_unshare(struct mshv_mem_region *region) -{ - u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; - - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; - - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - 0, - flags, false); -} - -static int -mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, - u64 page_offset, u64 page_count) -{ - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - if (region->flags.large_pages) - map_flags |= HV_MAP_GPA_LARGE_PAGE; - - /* ask the hypervisor to map guest ram */ - return hv_call_map_gpa_pages(region->partition->pt_id, - region->start_gfn + page_offset, - page_count, map_flags, - region->pages + page_offset); -} - -static int -mshv_region_map(struct mshv_mem_region *region) -{ - u32 map_flags = region->hv_map_flags; - - return mshv_region_remap_pages(region, map_flags, - 0, region->nr_pages); -} - -static void -mshv_region_evict_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) -{ - if (region->flags.range_pinned) - unpin_user_pages(region->pages + page_offset, page_count); - - memset(region->pages + page_offset, 0, - page_count * sizeof(struct page *)); -} - -static void -mshv_region_evict(struct mshv_mem_region *region) -{ - mshv_region_evict_pages(region, 0, region->nr_pages); -} - -static int -mshv_region_populate_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) -{ - u64 done_count, nr_pages; - struct page **pages; - __u64 userspace_addr; - int ret; - - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - for (done_count = 0; done_count < page_count; done_count += ret) { - pages = region->pages + page_offset + done_count; - userspace_addr = region->start_uaddr + - (page_offset + done_count) * - HV_HYP_PAGE_SIZE; - nr_pages = min(page_count - done_count, - MSHV_PIN_PAGES_BATCH_SIZE); - - /* - * Pinning assuming 4k pages works for large pages too. - * All page structs within the large page are returned. - * - * Pin requests are batched because pin_user_pages_fast - * with the FOLL_LONGTERM flag does a large temporary - * allocation of contiguous memory. 
- */ - if (region->flags.range_pinned) - ret = pin_user_pages_fast(userspace_addr, - nr_pages, - FOLL_WRITE | FOLL_LONGTERM, - pages); - else - ret = -EOPNOTSUPP; - - if (ret < 0) - goto release_pages; - } - - if (PageHuge(region->pages[page_offset])) - region->flags.large_pages = true; - - return 0; - -release_pages: - mshv_region_evict_pages(region, page_offset, done_count); - return ret; -} - -static int -mshv_region_populate(struct mshv_mem_region *region) -{ - return mshv_region_populate_pages(region, 0, region->nr_pages); -} - -static struct mshv_mem_region * -mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) -{ - struct mshv_mem_region *region; - - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { - if (gfn >= region->start_gfn && - gfn < region->start_gfn + region->nr_pages) - return region; - } - - return NULL; -} - -static struct mshv_mem_region * -mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) -{ - struct mshv_mem_region *region; - - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { - if (uaddr >= region->start_uaddr && - uaddr < region->start_uaddr + - (region->nr_pages << HV_HYP_PAGE_SHIFT)) - return region; - } - - return NULL; -} - /* * NB: caller checks and makes sure mem->size is page aligned * Returns: 0 with regionpp updated on success, or -errno @@ -1196,53 +1155,61 @@ static int mshv_partition_create_region(struct mshv_partition *partition, struct mshv_mem_region **regionpp, bool is_mmio) { - struct mshv_mem_region *region; + struct mshv_mem_region *rg; u64 nr_pages = HVPFN_DOWN(mem->size); /* Reject overlapping regions */ - if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || - mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || - mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || - mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) + spin_lock(&partition->pt_mem_regions_lock); + hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) + continue; + spin_unlock(&partition->pt_mem_regions_lock); return -EEXIST; + } + spin_unlock(&partition->pt_mem_regions_lock); - region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); - if (!region) - return -ENOMEM; - - region->nr_pages = nr_pages; - region->start_gfn = mem->guest_pfn; - region->start_uaddr = mem->userspace_addr; - region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; - if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) - region->hv_map_flags |= HV_MAP_GPA_WRITABLE; - if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) - region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + rg = mshv_region_create(mem->guest_pfn, nr_pages, + mem->userspace_addr, mem->flags); + if (IS_ERR(rg)) + return PTR_ERR(rg); - /* Note: large_pages flag populated when we pin the pages */ - if (!is_mmio) - region->flags.range_pinned = true; + if (is_mmio) + rg->type = MSHV_REGION_TYPE_MMIO; + else if (mshv_partition_encrypted(partition) || + !mshv_region_movable_init(rg)) + rg->type = MSHV_REGION_TYPE_MEM_PINNED; + else + rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; - region->partition = partition; + rg->partition = partition; - *regionpp = region; + *regionpp = rg; return 0; } -/* - * Map guest ram. if snp, make sure to release that from the host first - * Side Effects: In case of failure, pages are unpinned when feasible. 
+/** + * mshv_prepare_pinned_region - Pin and map memory regions + * @region: Pointer to the memory region structure + * + * This function processes memory regions that are explicitly marked as pinned. + * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based + * population. The function ensures the region is properly populated, handles + * encryption requirements for SNP partitions if applicable, maps the region, + * and performs necessary sharing or eviction operations based on the mapping + * result. + * + * Return: 0 on success, negative error code on failure. */ -static int -mshv_partition_mem_region_map(struct mshv_mem_region *region) +static int mshv_prepare_pinned_region(struct mshv_mem_region *region) { struct mshv_partition *partition = region->partition; int ret; - ret = mshv_region_populate(region); + ret = mshv_region_pin(region); if (ret) { - pt_err(partition, "Failed to populate memory region: %d\n", + pt_err(partition, "Failed to pin memory region: %d\n", ret); goto err_out; } @@ -1255,12 +1222,12 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) * access to guest memory regions. */ if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_unshare(region); + ret = mshv_region_unshare(region); if (ret) { pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n", region->start_gfn, ret); - goto evict_region; + goto invalidate_region; } } @@ -1268,9 +1235,9 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) if (ret && mshv_partition_encrypted(partition)) { int shrc; - shrc = mshv_partition_region_share(region); + shrc = mshv_region_share(region); if (!shrc) - goto evict_region; + goto invalidate_region; pt_err(partition, "Failed to share memory region (guest_pfn: %llu): %d\n", @@ -1284,8 +1251,8 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) return 0; -evict_region: - mshv_region_evict(region); +invalidate_region: + mshv_region_invalidate(region); err_out: return ret; } @@ -1330,17 +1297,35 @@ mshv_map_user_memory(struct mshv_partition *partition, if (ret) return ret; - if (is_mmio) - ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, - mmio_pfn, HVPFN_DOWN(mem.size)); - else - ret = mshv_partition_mem_region_map(region); + switch (region->type) { + case MSHV_REGION_TYPE_MEM_PINNED: + ret = mshv_prepare_pinned_region(region); + break; + case MSHV_REGION_TYPE_MEM_MOVABLE: + /* + * For movable memory regions, remap with no access to let + * the hypervisor track dirty pages, enabling pre-copy live + * migration. 
+ */ + ret = hv_call_map_gpa_pages(partition->pt_id, + region->start_gfn, + region->nr_pages, + HV_MAP_GPA_NO_ACCESS, NULL); + break; + case MSHV_REGION_TYPE_MMIO: + ret = hv_call_map_mmio_pages(partition->pt_id, + region->start_gfn, + mmio_pfn, + region->nr_pages); + break; + } if (ret) goto errout; - /* Install the new region */ + spin_lock(&partition->pt_mem_regions_lock); hlist_add_head(®ion->hnode, &partition->pt_mem_regions); + spin_unlock(&partition->pt_mem_regions_lock); return 0; @@ -1355,33 +1340,32 @@ mshv_unmap_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem) { struct mshv_mem_region *region; - u32 unmap_flags = 0; if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) return -EINVAL; + spin_lock(&partition->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); - if (!region) - return -EINVAL; + if (!region) { + spin_unlock(&partition->pt_mem_regions_lock); + return -ENOENT; + } /* Paranoia check */ if (region->start_uaddr != mem.userspace_addr || region->start_gfn != mem.guest_pfn || - region->nr_pages != HVPFN_DOWN(mem.size)) + region->nr_pages != HVPFN_DOWN(mem.size)) { + spin_unlock(&partition->pt_mem_regions_lock); return -EINVAL; + } hlist_del(®ion->hnode); - if (region->flags.large_pages) - unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; + spin_unlock(&partition->pt_mem_regions_lock); - /* ignore unmap failures and continue as process may be exiting */ - hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, - region->nr_pages, unmap_flags); + mshv_region_put(region); - mshv_region_evict(region); - - vfree(region); return 0; } @@ -1717,8 +1701,8 @@ static void destroy_partition(struct mshv_partition *partition) { struct mshv_vp *vp; struct mshv_mem_region *region; - int i, ret; struct hlist_node *n; + int i; if (refcount_read(&partition->pt_ref_count)) { pt_err(partition, @@ -1740,28 +1724,32 @@ static void destroy_partition(struct mshv_partition *partition) if (!vp) continue; - if (hv_parent_partition()) - mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + (void **)vp->vp_stats_pages); if (vp->vp_register_page) { - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + virt_to_page(vp->vp_register_page), + input_vtl_zero); vp->vp_register_page = NULL; } - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + virt_to_page(vp->vp_intercept_msg_page), + input_vtl_zero); vp->vp_intercept_msg_page = NULL; if (vp->vp_ghcb_page) { - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + virt_to_page(vp->vp_ghcb_page), + input_vtl_normal); vp->vp_ghcb_page = NULL; } @@ -1778,24 +1766,10 @@ static void destroy_partition(struct mshv_partition *partition) remove_partition(partition); - /* Remove regions, regain access to the memory and unpin the pages */ hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, hnode) { hlist_del(®ion->hnode); - - if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_share(region); - 
if (ret) { - pt_err(partition, - "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", - ret); - return; - } - } - - mshv_region_evict(region); - - vfree(region); + mshv_region_put(region); } /* Withdraw and free all pages we deposited */ @@ -1862,43 +1836,117 @@ add_partition(struct mshv_partition *partition) return 0; } -static long -mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); + +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) { - struct mshv_create_partition args; - u64 creation_flags; - struct hv_partition_creation_properties creation_properties = {}; - union hv_partition_isolation_properties isolation_properties = {}; - struct mshv_partition *partition; - struct file *file; - int fd; - long ret; + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; - if (copy_from_user(&args, user_arg, sizeof(args))) + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) return -EFAULT; if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) return -EINVAL; + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = &cr_props->disabled_processor_xsave_features; + + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. + */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. 
+ */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + /* Only support EXO partitions */ - creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | - HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; - if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) - creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + + isol_props->as_uint64 = 0; switch (args.pt_isolation) { case MSHV_PT_ISOLATION_NONE: - isolation_properties.isolation_type = - HV_PARTITION_ISOLATION_TYPE_NONE; + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; break; } + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + partition = kzalloc(sizeof(*partition), GFP_KERNEL); if (!partition) return -ENOMEM; @@ -1918,6 +1966,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) INIT_HLIST_HEAD(&partition->pt_devices); + spin_lock_init(&partition->pt_mem_regions_lock); INIT_HLIST_HEAD(&partition->pt_mem_regions); mshv_eventfd_init(partition); @@ -1938,29 +1987,13 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) goto delete_partition; ret = mshv_init_async_handler(partition); - if (ret) - goto remove_partition; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - ret = fd; - goto remove_partition; - } - - file = anon_inode_getfile("mshv_partition", &mshv_partition_fops, - partition, O_RDWR); - if (IS_ERR(file)) { - ret = PTR_ERR(file); - goto put_fd; + if (!ret) { + ret = FD_ADD(O_CLOEXEC, anon_inode_getfile("mshv_partition", + &mshv_partition_fops, + partition, O_RDWR)); + if (ret >= 0) + return ret; } - - fd_install(fd, file); - - return fd; - -put_fd: - put_unused_fd(fd); -remove_partition: remove_partition(partition); delete_partition: hv_call_delete_partition(partition->pt_id); @@ -1981,6 +2014,9 @@ static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, case MSHV_CREATE_PARTITION: return mshv_ioctl_create_partition((void __user *)arg, misc->this_device); + case MSHV_ROOT_HVCALL: + return mshv_ioctl_passthru_hvcall(NULL, false, + (void __user *)arg); } return -ENOTTY; @@ -2197,6 +2233,22 @@ root_sched_deinit: return err; } +static void mshv_init_vmm_caps(struct device *dev) +{ + /* + * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or + * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. 
In that + * case it's valid to proceed as if all vmm_caps are disabled (zero). + */ + if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps))) + dev_warn(dev, "Unable to get VMM capabilities\n"); + + dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); +} + static int __init mshv_parent_partition_init(void) { int ret; @@ -2249,6 +2301,8 @@ static int __init mshv_parent_partition_init(void) if (ret) goto remove_cpu_state; + mshv_init_vmm_caps(dev); + ret = mshv_irqfd_wq_init(); if (ret) goto exit_partition; |
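
Notes on the main changes in this patch follow, each with a small standalone C sketch. These are userspace approximations written for illustration only, not the driver code itself; any constants, helper names and struct layouts that do not appear in the diff above are made up.

The new mshv_passthru_hvcall_allowed() helper replaces the open-coded table scan so that a passthrough hypercall whose target is the host partition (HV_PARTITION_ID_SELF) is checked against a much smaller allowlist than one targeting a child partition. A minimal sketch of that selection logic, with placeholder hypercall code values:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative placeholder values; the real codes live in the hyperv headers. */
#define HV_PARTITION_ID_SELF              UINT64_MAX
#define HVCALL_GET_PARTITION_PROPERTY     0x0055
#define HVCALL_GET_PARTITION_PROPERTY_EX  0x0056
#define HVCALL_SET_PARTITION_PROPERTY     0x005a

/* Calls a VMM may pass through when targeting a child partition. */
static const uint16_t passthru_hvcalls[] = {
        HVCALL_GET_PARTITION_PROPERTY,
        HVCALL_GET_PARTITION_PROPERTY_EX,
        HVCALL_SET_PARTITION_PROPERTY,
        /* ... the driver's table lists more codes ... */
};

/* Far smaller list: calls that are safe with the host itself as target. */
static const uint16_t self_passthru_hvcalls[] = {
        HVCALL_GET_PARTITION_PROPERTY,
        HVCALL_GET_PARTITION_PROPERTY_EX,
};

static bool passthru_hvcall_allowed(uint16_t code, uint64_t pt_id)
{
        const uint16_t *allowed = passthru_hvcalls;
        size_t i, n = sizeof(passthru_hvcalls) / sizeof(passthru_hvcalls[0]);

        if (pt_id == HV_PARTITION_ID_SELF) {
                allowed = self_passthru_hvcalls;
                n = sizeof(self_passthru_hvcalls) / sizeof(self_passthru_hvcalls[0]);
        }

        for (i = 0; i < n; i++)
                if (allowed[i] == code)
                        return true;

        return false;
}

int main(void)
{
        /* Setting a property is allowed on a child partition (id 42 here)
         * but rejected when the target is the host partition. */
        printf("%d %d\n",
               passthru_hvcall_allowed(HVCALL_SET_PARTITION_PROPERTY, 42),
               passthru_hvcall_allowed(HVCALL_SET_PARTITION_PROPERTY,
                                       HV_PARTITION_ID_SELF));
        return 0;
}

Keeping two tables matches the comment added by the patch: a call may only go on the self-targeted list after auditing that the VMM cannot misuse it against the host partition.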
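
The single hv_do_rep_hypercall()/hv_do_hypercall() invocation becomes a loop: a rep hypercall is resumed from the number of reps already completed (tracked via hv_repcomp()), and HV_STATUS_INSUFFICIENT_MEMORY is handled by depositing a page to the hypervisor and retrying instead of returning -EAGAIN to userspace. The sketch below models only that control flow; fake_rep_hvcall(), deposit_pages() and the pretend page pool are stand-ins for the kernel helpers, and the status value is illustrative.

#include <stdint.h>
#include <stdio.h>

#define HV_STATUS_SUCCESS              0
#define HV_STATUS_INSUFFICIENT_MEMORY  11      /* illustrative value */

static unsigned int pool_pages;                /* pretend deposited-page pool */

/* Stand-in for hv_do_rep_hypercall_ex(): resumes at rep_start, reports how
 * many reps are complete in *reps_done, and fails when the pool is empty. */
static uint64_t fake_rep_hvcall(unsigned int rep_count, unsigned int rep_start,
                                unsigned int *reps_done)
{
        unsigned int i;

        for (i = rep_start; i < rep_count; i++) {
                if (!pool_pages) {
                        *reps_done = i;        /* partial completion */
                        return HV_STATUS_INSUFFICIENT_MEMORY;
                }
                pool_pages--;                  /* each rep "consumes" a page */
        }
        *reps_done = rep_count;
        return HV_STATUS_SUCCESS;
}

/* Stand-in for hv_call_deposit_pages(). */
static int deposit_pages(unsigned int n)
{
        pool_pages += n;
        return 0;
}

int main(void)
{
        unsigned int reps = 8, done = 0;
        uint64_t status;
        int ret = 0;

        /* Mirrors the do/while the patch adds: resume from 'done', top up
         * the pool on HV_STATUS_INSUFFICIENT_MEMORY, stop on success or on
         * any other error. */
        do {
                status = fake_rep_hvcall(reps, done, &done);
                if (status == HV_STATUS_SUCCESS)
                        break;
                if (status == HV_STATUS_INSUFFICIENT_MEMORY)
                        ret = deposit_pages(1);
                else
                        ret = -1;      /* hv_result_to_errno() in the driver */
        } while (!ret);

        printf("status=%llu reps_completed=%u ret=%d\n",
               (unsigned long long)status, done, ret);
        return 0;
}

Resuming from 'done' rather than restarting at rep 0 preserves the work the hypervisor already completed before it ran out of memory.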
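
mshv_partition_create_region() now walks the existing regions under the new pt_mem_regions_lock and returns -EEXIST when the requested gfn range intersects any of them, using the usual "two half-open ranges are disjoint iff one ends at or before the other begins" test (the diff writes it in its continue-on-disjoint form). A tiny sketch of just that predicate, with hypothetical gfn values:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Two half-open gfn ranges [a, a + a_len) and [b, b + b_len) overlap unless
 * one of them ends at or before the start of the other. */
static bool gfn_ranges_overlap(uint64_t a, uint64_t a_len,
                               uint64_t b, uint64_t b_len)
{
        return !(a + a_len <= b || b + b_len <= a);
}

int main(void)
{
        assert(!gfn_ranges_overlap(0x100, 0x10, 0x110, 0x10));  /* adjacent */
        assert(gfn_ranges_overlap(0x100, 0x20, 0x110, 0x10));   /* nested   */
        assert(gfn_ranges_overlap(0x100, 0x11, 0x110, 0x10));   /* one page */
        return 0;
}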
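
Because a region can now be unmapped while a vCPU is faulting on it, the GPA-intercept path uses mshv_partition_region_by_gfn_get(): look the region up and take a reference while still holding pt_mem_regions_lock, then drop the lock. The sketch below models that lookup-then-tryget pattern with a pthread mutex and a C11 atomic standing in for the driver's spinlock and mshv_region_get()/mshv_region_put(); the region struct, the list and the rule that a tryget fails once the count has reached zero are simplified assumptions.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct region {
        uint64_t start_gfn;
        uint64_t nr_pages;
        atomic_int refs;                /* 0 means "being torn down" */
        struct region *next;
};

static struct region *regions;          /* singly linked region list */
static pthread_mutex_t regions_lock = PTHREAD_MUTEX_INITIALIZER;

/* Take a reference only if the region is still live (refcount > 0). */
static bool region_tryget(struct region *r)
{
        int old = atomic_load(&r->refs);

        while (old > 0)
                if (atomic_compare_exchange_weak(&r->refs, &old, old + 1))
                        return true;
        return false;
}

static void region_put(struct region *r)
{
        if (atomic_fetch_sub(&r->refs, 1) == 1)
                free(r);                /* last reference frees the region */
}

/* Lookup-and-get: the reference is taken before the lock is released, so a
 * concurrent unmap cannot free the region out from under the caller. */
static struct region *region_by_gfn_get(uint64_t gfn)
{
        struct region *r;

        pthread_mutex_lock(&regions_lock);
        for (r = regions; r; r = r->next)
                if (gfn >= r->start_gfn && gfn < r->start_gfn + r->nr_pages)
                        break;
        if (r && !region_tryget(r))
                r = NULL;
        pthread_mutex_unlock(&regions_lock);

        return r;
}

int main(void)
{
        struct region *r = calloc(1, sizeof(*r));

        r->start_gfn = 0x1000;
        r->nr_pages = 0x200;
        atomic_store(&r->refs, 1);
        regions = r;

        struct region *hit = region_by_gfn_get(0x1100);

        printf("hit=%p\n", (void *)hit);
        if (hit)
                region_put(hit);        /* drop the lookup reference */
        region_put(r);                  /* drop the list's own reference */
        return 0;
}

Failing the tryget when the count is already zero is what prevents the lookup from resurrecting a region that a concurrent unmap has started tearing down.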
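
mshv_vp_ioctl_run_vp() now loops: after a clean return from the scheduler-specific run, the pending intercept is offered to mshv_vp_handle_intercept(), and only HVMSG_GPA_INTERCEPT on a movable region is resolved in the kernel; everything else still exits to the VMM. A compact sketch of that dispatch loop with stubbed run and handler functions (the message types and stubs are made up):

#include <stdbool.h>
#include <stdio.h>

enum msg_type { MSG_NONE, MSG_GPA_INTERCEPT, MSG_IO_PORT };

struct vp {
        enum msg_type pending;  /* intercept left behind by the last run */
        int runs_left;          /* how many runs the stub will perform   */
};

/* Stub for the scheduler-specific run: leaves a fake intercept behind. */
static long run_vp_once(struct vp *vp)
{
        if (!vp->runs_left--)
                return -1;      /* pretend something eventually fails */
        vp->pending = (vp->runs_left > 1) ? MSG_GPA_INTERCEPT : MSG_IO_PORT;
        return 0;
}

/* Only GPA intercepts are resolved in the kernel; others go to userspace. */
static bool handle_intercept(struct vp *vp)
{
        if (vp->pending != MSG_GPA_INTERCEPT)
                return false;
        printf("faulted page mapped in kernel, re-entering guest\n");
        return true;
}

int main(void)
{
        struct vp vp = { .pending = MSG_NONE, .runs_left = 4 };
        long rc;

        /* Mirrors the new loop: keep running while intercepts are consumed
         * in the kernel; return to the VMM once one must be delivered. */
        do {
                rc = run_vp_once(&vp);
        } while (rc == 0 && handle_intercept(&vp));

        printf("rc=%ld pending=%d (delivered to userspace)\n", rc, vp.pending);
        return 0;
}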
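
mshv_ioctl_process_pt_flags() keeps MSHV_CREATE_PARTITION compatible with older userspace by first copying only the original struct size, and only when the new MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES flag is set copying the larger v2 layout and re-validating the fields that were copied a second time. The sketch below models that two-step copy with memcpy() standing in for copy_from_user(); the struct layouts, flag bit and mask are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical layouts: v2 only appends fields, so v1 is a prefix of v2. */
struct create_v1 {
        uint64_t pt_flags;
        uint64_t pt_isolation;
};

struct create_v2 {
        uint64_t pt_flags;
        uint64_t pt_isolation;
        uint64_t pt_cpu_fbanks[2];      /* new in v2 */
        uint64_t pt_disabled_xsave;     /* new in v2 */
};

#define PT_BIT_CPU_AND_XSAVE_FEATURES   3       /* made-up bit position */
#define PT_FLAGS_MASK                   0x0fULL
#define PT_ISOLATION_COUNT              2

/* memcpy() stands in for copy_from_user(); 'user' is the caller's buffer. */
static int process_pt_flags(const void *user, size_t user_sz)
{
        struct create_v2 args = { 0 };

        /* Step 1: copy only the v1-sized prefix and validate it. */
        memcpy(&args, user, sizeof(struct create_v1));
        if ((args.pt_flags & ~PT_FLAGS_MASK) ||
            args.pt_isolation >= PT_ISOLATION_COUNT)
                return -1;

        /* Step 2: the caller advertised v2 fields, so copy the full struct
         * and re-validate the fields that were copied again. */
        if (args.pt_flags & (1ULL << PT_BIT_CPU_AND_XSAVE_FEATURES)) {
                if (user_sz < sizeof(args))
                        return -1;
                memcpy(&args, user, sizeof(args));
                if ((args.pt_flags & ~PT_FLAGS_MASK) ||
                    args.pt_isolation >= PT_ISOLATION_COUNT)
                        return -1;
                printf("v2 caller: bank0=%#llx\n",
                       (unsigned long long)args.pt_cpu_fbanks[0]);
        } else {
                printf("v1 caller: feature fields defaulted\n");
        }
        return 0;
}

int main(void)
{
        struct create_v1 v1_req = { .pt_flags = 0x1, .pt_isolation = 0 };
        struct create_v2 v2_req = { .pt_flags = 0x1 | (1ULL << 3),
                                    .pt_isolation = 0,
                                    .pt_cpu_fbanks = { 0xff, 0 } };

        process_pt_flags(&v1_req, sizeof(v1_req));  /* old userspace works */
        process_pt_flags(&v2_req, sizeof(v2_req));  /* new userspace opts in */
        return 0;
}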
