 -rw-r--r--  drivers/hv/Kconfig           |   2
 -rw-r--r--  drivers/hv/mshv_regions.c    | 218
 -rw-r--r--  drivers/hv/mshv_root.h       |  20
 -rw-r--r--  drivers/hv/mshv_root_main.c  | 142
 4 files changed, 346 insertions, 36 deletions
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index d4a8d349200c..7937ac0cbd0f 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -76,6 +76,8 @@ config MSHV_ROOT
 	depends on PAGE_SIZE_4KB
 	select EVENTFD
 	select VIRT_XFER_TO_GUEST_WORK
+	select HMM_MIRROR
+	select MMU_NOTIFIER
 	default n
 	help
 	  Select this option to enable support for booting and running as root
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 4ec78fdaf56d..202b9d551e39 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -7,6 +7,8 @@
  * Authors: Microsoft Linux virtualization team
  */

+#include <linux/hmm.h>
+#include <linux/hyperv.h>
 #include <linux/kref.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
@@ -15,6 +17,8 @@

 #include "mshv_root.h"

+#define MSHV_MAP_FAULT_IN_PAGES	PTRS_PER_PMD
+
 /**
  * mshv_region_process_chunk - Processes a contiguous chunk of memory pages
  *                             in a region.
@@ -134,8 +138,7 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
 }

 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
-					   u64 uaddr, u32 flags,
-					   bool is_mmio)
+					   u64 uaddr, u32 flags)
 {
 	struct mshv_mem_region *region;

@@ -152,9 +155,6 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
 	if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
 		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;

-	if (!is_mmio)
-		region->flags.range_pinned = true;
-
 	kref_init(&region->refcount);

 	return region;
@@ -245,7 +245,7 @@ int mshv_region_map(struct mshv_mem_region *region)
 static void mshv_region_invalidate_pages(struct mshv_mem_region *region,
 					 u64 page_offset, u64 page_count)
 {
-	if (region->flags.range_pinned)
+	if (region->type == MSHV_REGION_TYPE_MEM_PINNED)
 		unpin_user_pages(region->pages + page_offset, page_count);

 	memset(region->pages + page_offset, 0,
@@ -321,6 +321,9 @@ static void mshv_region_destroy(struct kref *ref)
 	struct mshv_partition *partition = region->partition;
 	int ret;

+	if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+		mshv_region_movable_fini(region);
+
 	if (mshv_partition_encrypted(partition)) {
 		ret = mshv_region_share(region);
 		if (ret) {
@@ -347,3 +350,206 @@ int mshv_region_get(struct mshv_mem_region *region)
 {
 	return kref_get_unless_zero(&region->refcount);
 }
+
+/**
+ * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region
+ * @region: Pointer to the memory region structure
+ * @range: Pointer to the HMM range structure
+ *
+ * This function performs the following steps:
+ * 1. Reads the notifier sequence for the HMM range.
+ * 2. Acquires a read lock on the memory map.
+ * 3. Handles HMM faults for the specified range.
+ * 4. Releases the read lock on the memory map.
+ * 5. If successful, locks the memory region mutex.
+ * 6. Verifies whether the notifier sequence changed during the operation.
+ *    If it did, releases the mutex and returns -EBUSY, matching the
+ *    hmm_range_fault() return code so that the caller retries.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
+					  struct hmm_range *range)
+{
+	int ret;
+
+	range->notifier_seq = mmu_interval_read_begin(range->notifier);
+	mmap_read_lock(region->mni.mm);
+	ret = hmm_range_fault(range);
+	mmap_read_unlock(region->mni.mm);
+	if (ret)
+		return ret;
+
+	mutex_lock(&region->mutex);
+
+	if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
+		mutex_unlock(&region->mutex);
+		cond_resched();
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/**
+ * mshv_region_range_fault - Handle memory range faults for a given region.
+ * @region: Pointer to the memory region structure.
+ * @page_offset: Offset of the page within the region.
+ * @page_count: Number of pages to handle.
+ *
+ * This function resolves memory faults for a specified range of pages
+ * within a memory region. It uses HMM (Heterogeneous Memory Management)
+ * to fault in the required pages and updates the region's page array.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+				   u64 page_offset, u64 page_count)
+{
+	struct hmm_range range = {
+		.notifier = &region->mni,
+		.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
+	};
+	unsigned long *pfns;
+	int ret;
+	u64 i;
+
+	pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL);
+	if (!pfns)
+		return -ENOMEM;
+
+	range.hmm_pfns = pfns;
+	range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE;
+	range.end = range.start + page_count * HV_HYP_PAGE_SIZE;
+
+	do {
+		ret = mshv_region_hmm_fault_and_lock(region, &range);
+	} while (ret == -EBUSY);
+
+	if (ret)
+		goto out;
+
+	for (i = 0; i < page_count; i++)
+		region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]);
+
+	ret = mshv_region_remap_pages(region, region->hv_map_flags,
+				      page_offset, page_count);
+
+	mutex_unlock(&region->mutex);
+out:
+	kfree(pfns);
+	return ret;
+}
+
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
+{
+	u64 page_offset, page_count;
+	int ret;
+
+	/* Align the page offset down to a MSHV_MAP_FAULT_IN_PAGES boundary. */
+	page_offset = ALIGN_DOWN(gfn - region->start_gfn,
+				 MSHV_MAP_FAULT_IN_PAGES);
+
+	/* Map more pages than requested to reduce the number of faults. */
+	page_count = min(region->nr_pages - page_offset,
+			 MSHV_MAP_FAULT_IN_PAGES);
+
+	ret = mshv_region_range_fault(region, page_offset, page_count);
+
+	WARN_ONCE(ret,
+		  "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n",
+		  region->partition->pt_id, region->start_uaddr,
+		  region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+		  gfn, page_offset, page_count);
+
+	return !ret;
+}
+
+/**
+ * mshv_region_interval_invalidate - Invalidate a range of a memory region
+ * @mni: Pointer to the mmu_interval_notifier structure
+ * @range: Pointer to the mmu_notifier_range structure
+ * @cur_seq: Current sequence number for the interval notifier
+ *
+ * This function invalidates a memory region by remapping its pages with
+ * no access permissions. It locks the region's mutex to ensure thread safety
+ * and updates the sequence number for the interval notifier. If the range
+ * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking
+ * lock and returns false if unsuccessful.
+ *
+ * NOTE: Failure to invalidate a region is a serious error, as the pages will
+ * be considered freed while they are still mapped by the hypervisor.
+ * Any attempt to access such pages will likely crash the system.
+ *
+ * Return: true if the region was successfully invalidated, false otherwise.
+ */
+static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
+					    const struct mmu_notifier_range *range,
+					    unsigned long cur_seq)
+{
+	struct mshv_mem_region *region = container_of(mni,
+						      struct mshv_mem_region,
+						      mni);
+	u64 page_offset, page_count;
+	unsigned long mstart, mend;
+	int ret = -EPERM;
+
+	if (mmu_notifier_range_blockable(range))
+		mutex_lock(&region->mutex);
+	else if (!mutex_trylock(&region->mutex))
+		goto out_fail;
+
+	mmu_interval_set_seq(mni, cur_seq);
+
+	mstart = max(range->start, region->start_uaddr);
+	mend = min(range->end, region->start_uaddr +
+		   (region->nr_pages << HV_HYP_PAGE_SHIFT));
+
+	page_offset = HVPFN_DOWN(mstart - region->start_uaddr);
+	page_count = HVPFN_DOWN(mend - mstart);
+
+	ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS,
+				      page_offset, page_count);
+	if (ret)
+		goto out_fail;
+
+	mshv_region_invalidate_pages(region, page_offset, page_count);
+
+	mutex_unlock(&region->mutex);
+
+	return true;
+
+out_fail:
+	WARN_ONCE(ret,
+		  "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n",
+		  region->start_uaddr,
+		  region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT),
+		  range->start, range->end, range->event,
+		  page_offset, page_offset + page_count - 1, (u64)range->mm, ret);
+	return false;
+}
+
+static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
+	.invalidate = mshv_region_interval_invalidate,
+};
+
+void mshv_region_movable_fini(struct mshv_mem_region *region)
+{
+	mmu_interval_notifier_remove(&region->mni);
+}
+
+bool mshv_region_movable_init(struct mshv_mem_region *region)
+{
+	int ret;
+
+	ret = mmu_interval_notifier_insert(&region->mni, current->mm,
+					   region->start_uaddr,
+					   region->nr_pages << HV_HYP_PAGE_SHIFT,
+					   &mshv_region_mni_ops);
+	if (ret)
+		return false;
+
+	mutex_init(&region->mutex);
+
+	return true;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 4249534ba900..3c1d88b36741 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -15,6 +15,7 @@
 #include <linux/hashtable.h>
 #include <linux/dev_printk.h>
 #include <linux/build_bug.h>
+#include <linux/mmu_notifier.h>
 #include <uapi/linux/mshv.h>

 /*
@@ -70,6 +71,12 @@ do { \
 #define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__)
 #define vp_dbg(v, fmt, ...) vp_devprintk(dbg, v, fmt, ##__VA_ARGS__)

+enum mshv_region_type {
+	MSHV_REGION_TYPE_MEM_PINNED,
+	MSHV_REGION_TYPE_MEM_MOVABLE,
+	MSHV_REGION_TYPE_MMIO
+};
+
 struct mshv_mem_region {
 	struct hlist_node hnode;
 	struct kref refcount;
@@ -77,11 +84,10 @@ struct mshv_mem_region {
 	u64 start_gfn;
 	u64 start_uaddr;
 	u32 hv_map_flags;
-	struct {
-		u64 range_pinned: 1;
-		u64 reserved: 63;
-	} flags;
 	struct mshv_partition *partition;
+	enum mshv_region_type type;
+	struct mmu_interval_notifier mni;
+	struct mutex mutex; /* protects region pages remapping */
 	struct page *pages[];
 };

@@ -315,8 +321,7 @@ extern enum hv_scheduler_type hv_scheduler_type;
 extern u8 * __percpu *hv_synic_eventring_tail;

 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
-					   u64 uaddr, u32 flags,
-					   bool is_mmio);
+					   u64 uaddr, u32 flags);
 int mshv_region_share(struct mshv_mem_region *region);
 int mshv_region_unshare(struct mshv_mem_region *region);
 int mshv_region_map(struct mshv_mem_region *region);
@@ -324,5 +329,8 @@ void mshv_region_invalidate(struct mshv_mem_region *region);
 int mshv_region_pin(struct mshv_mem_region *region);
 void mshv_region_put(struct mshv_mem_region *region);
 int mshv_region_get(struct mshv_mem_region *region);
+bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
+void mshv_region_movable_fini(struct mshv_mem_region *region);
+bool mshv_region_movable_init(struct mshv_mem_region *region);

 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index aa1a11f4dc3e..9cf28a3f12fe 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -594,14 +594,98 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
 	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");

+static struct mshv_mem_region *
+mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
+{
+	struct mshv_mem_region *region;
+
+	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+		if (gfn >= region->start_gfn &&
+		    gfn < region->start_gfn + region->nr_pages)
+			return region;
+	}
+
+	return NULL;
+}
+
+#ifdef CONFIG_X86_64
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
+{
+	struct mshv_mem_region *region;
+
+	spin_lock(&p->pt_mem_regions_lock);
+	region = mshv_partition_region_by_gfn(p, gfn);
+	if (!region || !mshv_region_get(region)) {
+		spin_unlock(&p->pt_mem_regions_lock);
+		return NULL;
+	}
+	spin_unlock(&p->pt_mem_regions_lock);
+
+	return region;
+}
+
+/**
+ * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
+ * @vp: Pointer to the virtual processor structure.
+ *
+ * This function processes GPA intercepts by identifying the memory region
+ * corresponding to the intercepted GPA, aligning the page offset, and
+ * mapping the required pages. It ensures that the region is valid and
+ * handles faults efficiently by mapping multiple pages at once.
+ *
+ * Return: true if the intercept was handled successfully, false otherwise.
+ */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
+{
+	struct mshv_partition *p = vp->vp_partition;
+	struct mshv_mem_region *region;
+	struct hv_x64_memory_intercept_message *msg;
+	bool ret;
+	u64 gfn;
+
+	msg = (struct hv_x64_memory_intercept_message *)
+		vp->vp_intercept_msg_page->u.payload;
+
+	gfn = HVPFN_DOWN(msg->guest_physical_address);
+
+	region = mshv_partition_region_by_gfn_get(p, gfn);
+	if (!region)
+		return false;
+
+	/* Only movable memory ranges are supported for GPA intercepts */
+	if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+		ret = mshv_region_handle_gfn_fault(region, gfn);
+	else
+		ret = false;
+
+	mshv_region_put(region);
+
+	return ret;
+}
+#else /* CONFIG_X86_64 */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
+#endif /* CONFIG_X86_64 */
+
+static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
+{
+	switch (vp->vp_intercept_msg_page->header.message_type) {
+	case HVMSG_GPA_INTERCEPT:
+		return mshv_handle_gpa_intercept(vp);
+	}
+	return false;
+}
+
 static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
 {
 	long rc;

-	if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
-		rc = mshv_run_vp_with_root_scheduler(vp);
-	else
-		rc = mshv_run_vp_with_hyp_scheduler(vp);
+	do {
+		if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+			rc = mshv_run_vp_with_root_scheduler(vp);
+		else
+			rc = mshv_run_vp_with_hyp_scheduler(vp);
+	} while (rc == 0 && mshv_vp_handle_intercept(vp));

 	if (rc)
 		return rc;
@@ -1059,20 +1143,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
 	*status = partition->async_hypercall_status;
 }

-static struct mshv_mem_region *
-mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
-{
-	struct mshv_mem_region *region;
-
-	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
-		if (gfn >= region->start_gfn &&
-		    gfn < region->start_gfn + region->nr_pages)
-			return region;
-	}
-
-	return NULL;
-}
-
 /*
  * NB: caller checks and makes sure mem->size is page aligned
  * Returns: 0 with regionpp updated on success, or -errno
@@ -1097,11 +1167,18 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
 	spin_unlock(&partition->pt_mem_regions_lock);

 	rg = mshv_region_create(mem->guest_pfn, nr_pages,
-				mem->userspace_addr, mem->flags,
-				is_mmio);
+				mem->userspace_addr, mem->flags);
 	if (IS_ERR(rg))
 		return PTR_ERR(rg);

+	if (is_mmio)
+		rg->type = MSHV_REGION_TYPE_MMIO;
+	else if (mshv_partition_encrypted(partition) ||
+		 !mshv_region_movable_init(rg))
+		rg->type = MSHV_REGION_TYPE_MEM_PINNED;
+	else
+		rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
+
 	rg->partition = partition;

 	*regionpp = rg;
@@ -1217,11 +1294,28 @@ mshv_map_user_memory(struct mshv_partition *partition,
 	if (ret)
 		return ret;

-	if (is_mmio)
-		ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
-					     mmio_pfn, HVPFN_DOWN(mem.size));
-	else
+	switch (region->type) {
+	case MSHV_REGION_TYPE_MEM_PINNED:
 		ret = mshv_prepare_pinned_region(region);
+		break;
+	case MSHV_REGION_TYPE_MEM_MOVABLE:
+		/*
+		 * For movable memory regions, remap with no access to let
+		 * the hypervisor track dirty pages, enabling pre-copy live
+		 * migration.
+		 */
+		ret = hv_call_map_gpa_pages(partition->pt_id,
+					    region->start_gfn,
+					    region->nr_pages,
+					    HV_MAP_GPA_NO_ACCESS, NULL);
+		break;
+	case MSHV_REGION_TYPE_MMIO:
+		ret = hv_call_map_mmio_pages(partition->pt_id,
+					     region->start_gfn,
+					     mmio_pfn,
+					     region->nr_pages);
+		break;
+	}

 	if (ret)
 		goto errout;
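The fault path added above is an instance of the driver-side pattern documented in Documentation/mm/hmm.rst: fault the range with hmm_range_fault(), then take the driver lock and check mmu_interval_read_retry() before trusting the PFNs, retrying on a collision with an invalidation. Below is a minimal, self-contained sketch of that generic pattern, not part of the patch; the names my_region and my_region_fault() are hypothetical.

/*
 * Sketch of the mmu_interval_notifier + hmm_range_fault() retry loop
 * that mshv_region_hmm_fault_and_lock() follows.  Hypothetical names.
 */
#include <linux/hmm.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_region {
	struct mmu_interval_notifier mni;	/* registered on the uaddr range */
	struct mutex lock;			/* serializes with invalidate() */
};

static int my_region_fault(struct my_region *r, unsigned long start,
			   unsigned long end, unsigned long *pfns)
{
	struct hmm_range range = {
		.notifier	= &r->mni,
		.start		= start,
		.end		= end,
		.hmm_pfns	= pfns,
		.default_flags	= HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
	};
	int ret;

	do {
		/* Sample the notifier sequence before faulting pages in. */
		range.notifier_seq = mmu_interval_read_begin(&r->mni);

		mmap_read_lock(r->mni.mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(r->mni.mm);
		if (ret == -EBUSY)
			continue;	/* raced with an invalidation */
		if (ret)
			return ret;

		mutex_lock(&r->lock);
		/*
		 * If an invalidation ran after mmu_interval_read_begin(),
		 * the PFNs may already be stale: drop the lock and retry.
		 */
		if (mmu_interval_read_retry(&r->mni, range.notifier_seq)) {
			mutex_unlock(&r->lock);
			continue;
		}
		break;
	} while (1);

	/* pfns[] is valid here; the caller releases r->lock when done. */
	return 0;
}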
