Diffstat (limited to 'drivers/hv')
-rw-r--r--   drivers/hv/Kconfig              |   29
-rw-r--r--   drivers/hv/Makefile             |    9
-rw-r--r--   drivers/hv/channel.c            |   75
-rw-r--r--   drivers/hv/channel_mgmt.c       |   27
-rw-r--r--   drivers/hv/connection.c         |    6
-rw-r--r--   drivers/hv/hv.c                 |  377
-rw-r--r--   drivers/hv/hv_common.c          |   27
-rw-r--r--   drivers/hv/hv_util.c            |    2
-rw-r--r--   drivers/hv/hyperv_vmbus.h       |   76
-rw-r--r--   drivers/hv/mshv_common.c        |   99
-rw-r--r--   drivers/hv/mshv_eventfd.c       |    8
-rw-r--r--   drivers/hv/mshv_irq.c           |    4
-rw-r--r--   drivers/hv/mshv_regions.c       |  555
-rw-r--r--   drivers/hv/mshv_root.h          |   57
-rw-r--r--   drivers/hv/mshv_root_hv_call.c  |  196
-rw-r--r--   drivers/hv/mshv_root_main.c     |  745
-rw-r--r--   drivers/hv/mshv_synic.c         |    6
-rw-r--r--   drivers/hv/mshv_vtl.h           |   25
-rw-r--r--   drivers/hv/mshv_vtl_main.c      | 1392
-rw-r--r--   drivers/hv/ring_buffer.c        |    5
-rw-r--r--   drivers/hv/vmbus_drv.c          |  202
21 files changed, 3315 insertions(+), 607 deletions(-)
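One pattern recurs across several hunks below: hv.c gains the file-local helpers hv_alloc_page()/hv_free_page(), which centralize the "allocate, optionally decrypt, zero exactly once" and "re-encrypt or deliberately leak" rules for pages shared outside the guest. A minimal caller sketch (the surrounding function is hypothetical, and the helpers are static to hv.c):

static int example_share_one_page(void)
{
	void *page;
	int ret;

	/* Share with the host only when VMBus is not confidential. */
	ret = hv_alloc_page(&page, !vmbus_is_confidential(), "example");
	if (ret)
		return ret;

	/* ... use the page ... */

	/* hv_free_page() re-encrypts first; on failure the page is leaked. */
	return hv_free_page(&page, !vmbus_is_confidential(), "example");
}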
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 0b8c391a0342..7937ac0cbd0f 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -17,7 +17,8 @@ config HYPERV config HYPERV_VTL_MODE bool "Enable Linux to boot in VTL context" - depends on (X86_64 || ARM64) && HYPERV + depends on (X86_64 && HAVE_STATIC_CALL) || ARM64 + depends on HYPERV depends on SMP default n help @@ -75,6 +76,8 @@ config MSHV_ROOT depends on PAGE_SIZE_4KB select EVENTFD select VIRT_XFER_TO_GUEST_WORK + select HMM_MIRROR + select MMU_NOTIFIER default n help Select this option to enable support for booting and running as root @@ -82,4 +85,28 @@ config MSHV_ROOT If unsure, say N. +config MSHV_VTL + tristate "Microsoft Hyper-V VTL driver" + depends on X86_64 && HYPERV_VTL_MODE + depends on HYPERV_VMBUS + # Mapping VTL0 memory to a userspace process in VTL2 is supported in OpenHCL. + # VTL2 for OpenHCL makes use of Huge Pages to improve performance on VMs, + # specially with large memory requirements. + depends on TRANSPARENT_HUGEPAGE + # MTRRs are controlled by VTL0, and are not specific to individual VTLs. + # Therefore, do not attempt to access or modify MTRRs here. + depends on !MTRR + select CPUMASK_OFFSTACK + select VIRT_XFER_TO_GUEST_WORK + default n + help + Select this option to enable Hyper-V VTL driver support. + This driver provides interfaces for Virtual Machine Manager (VMM) running in VTL2 + userspace to create VTLs and partitions, setup and manage VTL0 memory and + allow userspace to make direct hypercalls. This also allows to map VTL0's address + space to a usermode process in VTL2 and supports getting new VMBus messages and channel + events in VTL2. + + If unsure, say N. + endmenu diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index 1a1677bf4dac..a49f93c2d245 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_HYPERV_VMBUS) += hv_vmbus.o obj-$(CONFIG_HYPERV_UTILS) += hv_utils.o obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o obj-$(CONFIG_MSHV_ROOT) += mshv_root.o +obj-$(CONFIG_MSHV_VTL) += mshv_vtl.o CFLAGS_hv_trace.o = -I$(src) CFLAGS_hv_balloon.o = -I$(src) @@ -13,8 +14,12 @@ hv_vmbus-y := vmbus_drv.o \ hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_utils_transport.o mshv_root-y := mshv_root_main.o mshv_synic.o mshv_eventfd.o mshv_irq.o \ - mshv_root_hv_call.o mshv_portid_table.o + mshv_root_hv_call.o mshv_portid_table.o mshv_regions.o +mshv_vtl-y := mshv_vtl_main.o # Code that must be built-in obj-$(CONFIG_HYPERV) += hv_common.o -obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o mshv_common.o +obj-$(subst m,y,$(CONFIG_MSHV_ROOT)) += hv_proc.o +ifneq ($(CONFIG_MSHV_ROOT)$(CONFIG_MSHV_VTL),) + obj-y += mshv_common.o +endif diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 162d6aeece7b..6821f225248b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -410,6 +410,21 @@ static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer, return 0; } +static void vmbus_free_channel_msginfo(struct vmbus_channel_msginfo *msginfo) +{ + struct vmbus_channel_msginfo *submsginfo, *tmp; + + if (!msginfo) + return; + + list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, + msglistentry) { + kfree(submsginfo); + } + + kfree(msginfo); +} + /* * __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer * @@ -429,7 +444,7 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, struct vmbus_channel_gpadl_header *gpadlmsg; struct vmbus_channel_gpadl_body 
*gpadl_body; struct vmbus_channel_msginfo *msginfo = NULL; - struct vmbus_channel_msginfo *submsginfo, *tmp; + struct vmbus_channel_msginfo *submsginfo; struct list_head *curr; u32 next_gpadl_handle; unsigned long flags; @@ -444,20 +459,24 @@ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, return ret; } - /* - * Set the "decrypted" flag to true for the set_memory_decrypted() - * success case. In the failure case, the encryption state of the - * memory is unknown. Leave "decrypted" as true to ensure the - * memory will be leaked instead of going back on the free list. - */ - gpadl->decrypted = true; - ret = set_memory_decrypted((unsigned long)kbuffer, - PFN_UP(size)); - if (ret) { - dev_warn(&channel->device_obj->device, - "Failed to set host visibility for new GPADL %d.\n", - ret); - return ret; + gpadl->decrypted = !((channel->co_external_memory && type == HV_GPADL_BUFFER) || + (channel->co_ring_buffer && type == HV_GPADL_RING)); + if (gpadl->decrypted) { + /* + * The "decrypted" flag being true assumes that set_memory_decrypted() succeeds. + * But if it fails, the encryption state of the memory is unknown. In that case, + * leave "decrypted" as true to ensure the memory is leaked instead of going back + * on the free list. + */ + ret = set_memory_decrypted((unsigned long)kbuffer, + PFN_UP(size)); + if (ret) { + dev_warn(&channel->device_obj->device, + "Failed to set host visibility for new GPADL %d.\n", + ret); + vmbus_free_channel_msginfo(msginfo); + return ret; + } } init_completion(&msginfo->waitevent); @@ -532,12 +551,8 @@ cleanup: spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); list_del(&msginfo->msglistentry); spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); - list_for_each_entry_safe(submsginfo, tmp, &msginfo->submsglist, - msglistentry) { - kfree(submsginfo); - } - kfree(msginfo); + vmbus_free_channel_msginfo(msginfo); if (ret) { /* @@ -545,8 +560,10 @@ cleanup: * left as true so the memory is leaked instead of being * put back on the free list. */ - if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) - gpadl->decrypted = false; + if (gpadl->decrypted) { + if (!set_memory_encrypted((unsigned long)kbuffer, PFN_UP(size))) + gpadl->decrypted = false; + } } return ret; @@ -573,7 +590,7 @@ EXPORT_SYMBOL_GPL(vmbus_establish_gpadl); * keeps track of the next available slot in the array. Initially, each * slot points to the next one (as in a Linked List). The last slot * does not point to anything, so its value is U64_MAX by default. 
- * @size The size of the array + * @size: The size of the array */ static u64 *request_arr_init(u32 size) { @@ -677,12 +694,13 @@ static int __vmbus_open(struct vmbus_channel *newchannel, goto error_clean_ring; err = hv_ringbuffer_init(&newchannel->outbound, - page, send_pages, 0); + page, send_pages, 0, newchannel->co_ring_buffer); if (err) goto error_free_gpadl; err = hv_ringbuffer_init(&newchannel->inbound, &page[send_pages], - recv_pages, newchannel->max_pkt_size); + recv_pages, newchannel->max_pkt_size, + newchannel->co_ring_buffer); if (err) goto error_free_gpadl; @@ -863,8 +881,11 @@ post_msg_err: kfree(info); - ret = set_memory_encrypted((unsigned long)gpadl->buffer, - PFN_UP(gpadl->size)); + if (gpadl->decrypted) + ret = set_memory_encrypted((unsigned long)gpadl->buffer, + PFN_UP(gpadl->size)); + else + ret = 0; if (ret) pr_warn("Fail to set mem host visibility in GPADL teardown %d.\n", ret); diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index 65dd299e2944..74fed2c073d4 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -844,14 +844,14 @@ static void vmbus_wait_for_unload(void) = per_cpu_ptr(hv_context.cpu_context, cpu); /* - * In a CoCo VM the synic_message_page is not allocated + * In a CoCo VM the hyp_synic_message_page is not allocated * in hv_synic_alloc(). Instead it is set/cleared in - * hv_synic_enable_regs() and hv_synic_disable_regs() + * hv_hyp_synic_enable_regs() and hv_hyp_synic_disable_regs() * such that it is set only when the CPU is online. If * not all present CPUs are online, the message page * might be NULL, so skip such CPUs. */ - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; if (!page_addr) continue; @@ -892,7 +892,7 @@ completed: struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - page_addr = hv_cpu->synic_message_page; + page_addr = hv_cpu->hyp_synic_message_page; if (!page_addr) continue; @@ -1022,6 +1022,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) struct vmbus_channel_offer_channel *offer; struct vmbus_channel *oldchannel, *newchannel; size_t offer_sz; + bool co_ring_buffer, co_external_memory; offer = (struct vmbus_channel_offer_channel *)hdr; @@ -1034,6 +1035,22 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) return; } + co_ring_buffer = is_co_ring_buffer(offer); + co_external_memory = is_co_external_memory(offer); + if (!co_ring_buffer && co_external_memory) { + pr_err("Invalid offer relid=%d: the ring buffer isn't encrypted\n", + offer->child_relid); + return; + } + if (co_ring_buffer || co_external_memory) { + if (vmbus_proto_version < VERSION_WIN10_V6_0 || !vmbus_is_confidential()) { + pr_err("Invalid offer relid=%d: no support for confidential VMBus\n", + offer->child_relid); + atomic_dec(&vmbus_connection.offer_in_progress); + return; + } + } + oldchannel = find_primary_channel_by_offer(offer); if (oldchannel != NULL) { @@ -1112,6 +1129,8 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) pr_err("Unable to allocate channel object\n"); return; } + newchannel->co_ring_buffer = co_ring_buffer; + newchannel->co_external_memory = co_external_memory; vmbus_setup_channel_state(newchannel, offer); diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 1fe3573ae52a..5d9cb5bf2d62 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Linux guests and are not listed. 
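(An aside, not part of the patch: this table is consumed by vmbus_connect(), which is outside this hunk. A simplified sketch of that walk, with locals and cleanup elided; entries are tried newest-first and the max_version module parameter caps what may be offered:)

for (i = 0; i < ARRAY_SIZE(vmbus_versions); i++) {
	version = vmbus_versions[i];
	if (version > max_version)
		continue;	/* capped for testing/debugging */
	ret = vmbus_negotiate_version(msginfo, version);
	if (ret == -ETIMEDOUT)
		break;		/* the host is not responding */
	if (vmbus_connection.conn_state == CONNECTED)
		break;		/* the host accepted this version */
}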
*/ static __u32 vmbus_versions[] = { + VERSION_WIN10_V6_0, VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, @@ -65,7 +66,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. */ -static uint max_version = VERSION_WIN10_V5_3; +static uint max_version = VERSION_WIN10_V6_0; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, @@ -105,6 +106,9 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; } + if (vmbus_is_confidential() && version >= VERSION_WIN10_V6_0) + msg->feature_flags = VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS; + /* * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always * bitwise OR it diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index b14c5f9e0ef2..c100f04b3581 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -18,6 +18,7 @@ #include <linux/clockchips.h> #include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include <linux/set_memory.h> @@ -25,6 +26,7 @@ /* The one and only */ struct hv_context hv_context; +EXPORT_SYMBOL_FOR_MODULES(hv_context, "mshv_vtl"); /* * hv_init - Main initialization routine. @@ -74,7 +76,11 @@ int hv_post_message(union hv_connection_id connection_id, aligned_msg->payload_size = payload_size; memcpy((void *)aligned_msg->payload, payload, payload_size); - if (ms_hyperv.paravisor_present) { + if (ms_hyperv.paravisor_present && !vmbus_is_confidential()) { + /* + * If the VMBus isn't confidential, use the CoCo-specific + * mechanism to communicate with the hypervisor. + */ if (hv_isolation_type_tdx()) status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, virt_to_phys(aligned_msg), 0); @@ -88,6 +94,11 @@ int hv_post_message(union hv_connection_id connection_id, u64 control = HVCALL_POST_MESSAGE; control |= hv_nested ? HV_HYPERCALL_NESTED : 0; + /* + * If there is no paravisor, this will go to the hypervisor. + * In the Confidential VMBus case, there is the paravisor + * to which this will trap. + */ status = hv_do_hypercall(control, aligned_msg, NULL); } @@ -95,11 +106,72 @@ int hv_post_message(union hv_connection_id connection_id, return hv_result(status); } +EXPORT_SYMBOL_FOR_MODULES(hv_post_message, "mshv_vtl"); + +static int hv_alloc_page(void **page, bool decrypt, const char *note) +{ + int ret = 0; + + /* + * After the page changes its encryption status, its contents might + * appear scrambled on some hardware. Thus `get_zeroed_page` would + * zero the page out in vain, so do that explicitly exactly once. + * + * By default, the page is allocated encrypted in a CoCo VM. + */ + *page = (void *)__get_free_page(GFP_KERNEL); + if (!*page) + return -ENOMEM; + + if (decrypt) + ret = set_memory_decrypted((unsigned long)*page, 1); + if (ret) + goto failed; + + memset(*page, 0, PAGE_SIZE); + return 0; + +failed: + /* + * Report the failure but don't put the page back on the free list as + * its encryption status is unknown. + */ + pr_err("allocation failed for %s page, error %d, decrypted %d\n", + note, ret, decrypt); + *page = NULL; + return ret; +} + +static int hv_free_page(void **page, bool encrypt, const char *note) +{ + int ret = 0; + + if (!*page) + return 0; + + if (encrypt) + ret = set_memory_encrypted((unsigned long)*page, 1); + + /* + * In the case of the failure, the page is leaked. 
Something is wrong, + * prefer to lose the page with the unknown encryption status and stay afloat. + */ + if (ret) + pr_err("deallocation failed for %s page, error %d, encrypt %d\n", + note, ret, encrypt); + else + free_page((unsigned long)*page); + + *page = NULL; + + return ret; +} int hv_synic_alloc(void) { int cpu, ret = -ENOMEM; struct hv_per_cpu_context *hv_cpu; + const bool decrypt = !vmbus_is_confidential(); /* * First, zero all per-cpu memory areas so hv_synic_free() can @@ -125,73 +197,37 @@ int hv_synic_alloc(void) vmbus_on_msg_dpc, (unsigned long)hv_cpu); if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { - hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->post_msg_page) { - pr_err("Unable to allocate post msg page\n"); + ret = hv_alloc_page(&hv_cpu->post_msg_page, + decrypt, "post msg"); + if (ret) goto err; - } - - ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1); - if (ret) { - pr_err("Failed to decrypt post msg page: %d\n", ret); - /* Just leak the page, as it's unsafe to free the page. */ - hv_cpu->post_msg_page = NULL; - goto err; - } - - memset(hv_cpu->post_msg_page, 0, PAGE_SIZE); } /* - * Synic message and event pages are allocated by paravisor. - * Skip these pages allocation here. + * If these SynIC pages are not allocated, SIEF and SIM pages + * are configured using what the root partition or the paravisor + * provides upon reading the SIEFP and SIMP registers. */ if (!ms_hyperv.paravisor_present && !hv_root_partition()) { - hv_cpu->synic_message_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->synic_message_page) { - pr_err("Unable to allocate SYNIC message page\n"); + ret = hv_alloc_page(&hv_cpu->hyp_synic_message_page, + decrypt, "hypervisor SynIC msg"); + if (ret) goto err; - } - - hv_cpu->synic_event_page = - (void *)get_zeroed_page(GFP_ATOMIC); - if (!hv_cpu->synic_event_page) { - pr_err("Unable to allocate SYNIC event page\n"); - - free_page((unsigned long)hv_cpu->synic_message_page); - hv_cpu->synic_message_page = NULL; + ret = hv_alloc_page(&hv_cpu->hyp_synic_event_page, + decrypt, "hypervisor SynIC event"); + if (ret) goto err; - } } - if (!ms_hyperv.paravisor_present && - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - ret = set_memory_decrypted((unsigned long) - hv_cpu->synic_message_page, 1); - if (ret) { - pr_err("Failed to decrypt SYNIC msg page: %d\n", ret); - hv_cpu->synic_message_page = NULL; - - /* - * Free the event page here so that hv_synic_free() - * won't later try to re-encrypt it. - */ - free_page((unsigned long)hv_cpu->synic_event_page); - hv_cpu->synic_event_page = NULL; + if (vmbus_is_confidential()) { + ret = hv_alloc_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); + if (ret) goto err; - } - - ret = set_memory_decrypted((unsigned long) - hv_cpu->synic_event_page, 1); - if (ret) { - pr_err("Failed to decrypt SYNIC event page: %d\n", ret); - hv_cpu->synic_event_page = NULL; + ret = hv_alloc_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + if (ret) goto err; - } - - memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); - memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); } } @@ -207,70 +243,46 @@ err: void hv_synic_free(void) { - int cpu, ret; + int cpu; + const bool encrypt = !vmbus_is_confidential(); for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); - /* It's better to leak the page if the encryption fails. 
*/ - if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { - if (hv_cpu->post_msg_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->post_msg_page, 1); - if (ret) { - pr_err("Failed to encrypt post msg page: %d\n", ret); - hv_cpu->post_msg_page = NULL; - } - } + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) + hv_free_page(&hv_cpu->post_msg_page, + encrypt, "post msg"); + if (!ms_hyperv.paravisor_present && !hv_root_partition()) { + hv_free_page(&hv_cpu->hyp_synic_event_page, + encrypt, "hypervisor SynIC event"); + hv_free_page(&hv_cpu->hyp_synic_message_page, + encrypt, "hypervisor SynIC msg"); } - - if (!ms_hyperv.paravisor_present && - (hv_isolation_type_snp() || hv_isolation_type_tdx())) { - if (hv_cpu->synic_message_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->synic_message_page, 1); - if (ret) { - pr_err("Failed to encrypt SYNIC msg page: %d\n", ret); - hv_cpu->synic_message_page = NULL; - } - } - - if (hv_cpu->synic_event_page) { - ret = set_memory_encrypted((unsigned long) - hv_cpu->synic_event_page, 1); - if (ret) { - pr_err("Failed to encrypt SYNIC event page: %d\n", ret); - hv_cpu->synic_event_page = NULL; - } - } + if (vmbus_is_confidential()) { + hv_free_page(&hv_cpu->para_synic_event_page, + false, "paravisor SynIC event"); + hv_free_page(&hv_cpu->para_synic_message_page, + false, "paravisor SynIC msg"); } - - free_page((unsigned long)hv_cpu->post_msg_page); - free_page((unsigned long)hv_cpu->synic_event_page); - free_page((unsigned long)hv_cpu->synic_message_page); } kfree(hv_context.hv_numa_map); } /* - * hv_synic_init - Initialize the Synthetic Interrupt Controller. - * - * If it is already initialized by another entity (ie x2v shim), we need to - * retrieve the initialized message and event pages. Otherwise, we create and - * initialize the message and event pages. + * hv_hyp_synic_enable_regs - Initialize the Synthetic Interrupt Controller + * with the hypervisor. */ -void hv_synic_enable_regs(unsigned int cpu) +void hv_hyp_synic_enable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_simp simp; union hv_synic_siefp siefp; union hv_synic_sint shared_sint; - union hv_synic_scontrol sctrl; - /* Setup the Synic's message page */ + /* Setup the Synic's message page with the hypervisor. */ simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); simp.simp_enabled = 1; @@ -278,18 +290,18 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_message_page = + hv_cpu->hyp_synic_message_page = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); - if (!hv_cpu->synic_message_page) + if (!hv_cpu->hyp_synic_message_page) pr_err("Fail to map synic message page.\n"); } else { - simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) + simp.base_simp_gpa = virt_to_phys(hv_cpu->hyp_synic_message_page) >> HV_HYP_PAGE_SHIFT; } hv_set_msr(HV_MSR_SIMP, simp.as_uint64); - /* Setup the Synic's event page */ + /* Setup the Synic's event page with the hypervisor. */ siefp.as_uint64 = hv_get_msr(HV_MSR_SIEFP); siefp.siefp_enabled = 1; @@ -297,16 +309,17 @@ void hv_synic_enable_regs(unsigned int cpu) /* Mask out vTOM bit. 
ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; - hv_cpu->synic_event_page = + hv_cpu->hyp_synic_event_page = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); - if (!hv_cpu->synic_event_page) + if (!hv_cpu->hyp_synic_event_page) pr_err("Fail to map synic event page.\n"); } else { - siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->hyp_synic_event_page) >> HV_HYP_PAGE_SHIFT; } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, true); /* Setup the shared SINT. */ if (vmbus_irq != -1) @@ -317,6 +330,11 @@ void hv_synic_enable_regs(unsigned int cpu) shared_sint.masked = false; shared_sint.auto_eoi = hv_recommend_using_aeoi(); hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); +} + +static void hv_hyp_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Enable the global synic bit */ sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); @@ -325,23 +343,72 @@ void hv_synic_enable_regs(unsigned int cpu) hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); } +static void hv_para_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + struct hv_per_cpu_context *hv_cpu + = per_cpu_ptr(hv_context.cpu_context, cpu); + + /* Setup the Synic's message page with the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 1; + simp.base_simp_gpa = virt_to_phys(hv_cpu->para_synic_message_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Setup the Synic's event page with the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 1; + siefp.base_siefp_gpa = virt_to_phys(hv_cpu->para_synic_event_page) + >> HV_HYP_PAGE_SHIFT; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_enable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Enable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 1; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); +} + int hv_synic_init(unsigned int cpu) { - hv_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_regs(cpu); + + /* + * The SINT is set in hv_hyp_synic_enable_regs() by calling + * hv_set_msr(). hv_set_msr() in turn has special case code for the + * SINT MSRs that write to the hypervisor version of the MSR *and* + * the paravisor version of the MSR (but *without* the proxy bit when + * VMBus is confidential). + * + * Then enable interrupts via the paravisor if VMBus is confidential, + * and otherwise via the hypervisor. + */ + + hv_hyp_synic_enable_regs(cpu); + if (vmbus_is_confidential()) + hv_para_synic_enable_interrupts(); + else + hv_hyp_synic_enable_interrupts(); hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT); return 0; } -void hv_synic_disable_regs(unsigned int cpu) +void hv_hyp_synic_disable_regs(unsigned int cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); union hv_synic_sint shared_sint; union hv_synic_simp simp; union hv_synic_siefp siefp; - union hv_synic_scontrol sctrl; shared_sint.as_uint64 = hv_get_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT); @@ -350,18 +417,21 @@ void hv_synic_disable_regs(unsigned int cpu) /* Need to correctly cleanup in the case of SMP!!! 
*/ /* Disable the interrupt */ hv_set_msr(HV_MSR_SINT0 + VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_enable_coco_interrupt(cpu, vmbus_interrupt, false); simp.as_uint64 = hv_get_msr(HV_MSR_SIMP); /* - * In Isolation VM, sim and sief pages are allocated by + * In Isolation VM, simp and sief pages are allocated by * paravisor. These pages also will be used by kdump * kernel. So just reset enable bit here and keep page * addresses. */ simp.simp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { - iounmap(hv_cpu->synic_message_page); - hv_cpu->synic_message_page = NULL; + if (hv_cpu->hyp_synic_message_page) { + iounmap(hv_cpu->hyp_synic_message_page); + hv_cpu->hyp_synic_message_page = NULL; + } } else { simp.base_simp_gpa = 0; } @@ -372,21 +442,51 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.siefp_enabled = 0; if (ms_hyperv.paravisor_present || hv_root_partition()) { - iounmap(hv_cpu->synic_event_page); - hv_cpu->synic_event_page = NULL; + if (hv_cpu->hyp_synic_event_page) { + iounmap(hv_cpu->hyp_synic_event_page); + hv_cpu->hyp_synic_event_page = NULL; + } } else { siefp.base_siefp_gpa = 0; } hv_set_msr(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_hyp_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; /* Disable the global synic bit */ sctrl.as_uint64 = hv_get_msr(HV_MSR_SCONTROL); sctrl.enable = 0; hv_set_msr(HV_MSR_SCONTROL, sctrl.as_uint64); +} - if (vmbus_irq != -1) - disable_percpu_irq(vmbus_irq); +static void hv_para_synic_disable_regs(unsigned int cpu) +{ + union hv_synic_simp simp; + union hv_synic_siefp siefp; + + /* Disable SynIC's message page in the paravisor. */ + simp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIMP); + simp.simp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIMP, simp.as_uint64); + + /* Disable SynIC's event page in the paravisor. */ + siefp.as_uint64 = hv_para_get_synic_register(HV_MSR_SIEFP); + siefp.siefp_enabled = 0; + hv_para_set_synic_register(HV_MSR_SIEFP, siefp.as_uint64); +} + +static void hv_para_synic_disable_interrupts(void) +{ + union hv_synic_scontrol sctrl; + + /* Disable the global synic bit */ + sctrl.as_uint64 = hv_para_get_synic_register(HV_MSR_SCONTROL); + sctrl.enable = 0; + hv_para_set_synic_register(HV_MSR_SCONTROL, sctrl.as_uint64); } #define HV_MAX_TRIES 3 @@ -399,16 +499,18 @@ void hv_synic_disable_regs(unsigned int cpu) * that the normal interrupt handling mechanism will find and process the channel interrupt * "very soon", and in the process clear the bit. 
*/ -static bool hv_synic_event_pending(void) +static bool __hv_synic_event_pending(union hv_synic_event_flags *event, int sint) { - struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); - union hv_synic_event_flags *event = - (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; - unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ + unsigned long *recv_int_page; bool pending; u32 relid; int tries = 0; + if (!event) + return false; + + event += sint; + recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ retry: pending = false; for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { @@ -425,6 +527,17 @@ retry: return pending; } +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *hyp_synic_event_page = hv_cpu->hyp_synic_event_page; + union hv_synic_event_flags *para_synic_event_page = hv_cpu->para_synic_event_page; + + return + __hv_synic_event_pending(hyp_synic_event_page, VMBUS_MESSAGE_SINT) || + __hv_synic_event_pending(para_synic_event_page, VMBUS_MESSAGE_SINT); +} + static int hv_pick_new_cpu(struct vmbus_channel *channel) { int ret = -EBUSY; @@ -517,7 +630,27 @@ int hv_synic_cleanup(unsigned int cpu) always_cleanup: hv_stimer_legacy_cleanup(cpu); - hv_synic_disable_regs(cpu); + /* + * First, disable the event and message pages + * used for communicating with the host, and then + * disable the host interrupts if VMBus is not + * confidential. + */ + hv_hyp_synic_disable_regs(cpu); + if (!vmbus_is_confidential()) + hv_hyp_synic_disable_interrupts(); + + /* + * Perform the same steps for the Confidential VMBus. + * The sequencing provides the guarantee that no data + * may be posted for processing before disabling interrupts. 
+ */ + if (vmbus_is_confidential()) { + hv_para_synic_disable_regs(cpu); + hv_para_synic_disable_interrupts(); + } + if (vmbus_irq != -1) + disable_percpu_irq(vmbus_irq); return ret; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index e109a620c83f..0a3ab7efed46 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -315,9 +315,9 @@ int __init hv_common_init(void) int i; union hv_hypervisor_version_info version; - /* Get information about the Hyper-V host version */ + /* Get information about the Microsoft Hypervisor version */ if (!hv_get_hypervisor_version(&version)) - pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n", + pr_info("Hyper-V: Hypervisor Build %d.%d.%d.%d-%d-%d\n", version.major_version, version.minor_version, version.build_number, version.service_number, version.service_pack, version.service_branch); @@ -487,7 +487,7 @@ int hv_common_cpu_init(unsigned int cpu) * online and then taken offline */ if (!*inputarg) { - mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + mem = kmalloc_array(pgcount, HV_HYP_PAGE_SIZE, flags); if (!mem) return -ENOMEM; @@ -716,6 +716,27 @@ u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) } EXPORT_SYMBOL_GPL(hv_tdx_hypercall); +void __weak hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ +} +EXPORT_SYMBOL_GPL(hv_enable_coco_interrupt); + +void __weak hv_para_set_sint_proxy(bool enable) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_sint_proxy); + +u64 __weak hv_para_get_synic_register(unsigned int reg) +{ + return ~0ULL; +} +EXPORT_SYMBOL_GPL(hv_para_get_synic_register); + +void __weak hv_para_set_synic_register(unsigned int reg, u64 val) +{ +} +EXPORT_SYMBOL_GPL(hv_para_set_synic_register); + void hv_identify_partition_type(void) { /* Assume guest role */ diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 36ee89c0358b..7e9c8e169c66 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -586,7 +586,7 @@ static int util_probe(struct hv_device *dev, (struct hv_util_service *)dev_id->driver_data; int ret; - srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL); + srv->recv_buffer = kmalloc_array(4, HV_HYP_PAGE_SIZE, GFP_KERNEL); if (!srv->recv_buffer) return -ENOMEM; srv->channel = dev->channel; diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 0b450e53161e..b2862e0a317a 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -15,6 +15,7 @@ #include <linux/list.h> #include <linux/bitops.h> #include <asm/sync_bitops.h> +#include <asm/mshyperv.h> #include <linux/atomic.h> #include <linux/hyperv.h> #include <linux/interrupt.h> @@ -32,6 +33,7 @@ */ #define HV_UTIL_NEGO_TIMEOUT 55 +void vmbus_isr(void); /* Definitions for the monitored notification facility */ union hv_monitor_trigger_group { @@ -120,8 +122,26 @@ enum { * Per cpu state for channel handling */ struct hv_per_cpu_context { - void *synic_message_page; - void *synic_event_page; + /* + * SynIC pages for communicating with the host. + * + * These pages are accessible to the host partition and the hypervisor. + * They may be used for exchanging data with the host partition and the + * hypervisor even when they aren't trusted yet the guest partition + * must be prepared to handle the malicious behavior. + */ + void *hyp_synic_message_page; + void *hyp_synic_event_page; + /* + * SynIC pages for communicating with the paravisor. + * + * These pages may be accessed from within the guest partition only in + * CoCo VMs. 
Neither the host partition nor the hypervisor can access + * these pages in that case; they are used for exchanging data with the + * paravisor. + */ + void *para_synic_message_page; + void *para_synic_event_page; /* * The page is only used in hv_post_message() for a TDX VM (with the @@ -171,10 +191,10 @@ extern int hv_synic_alloc(void); extern void hv_synic_free(void); -extern void hv_synic_enable_regs(unsigned int cpu); +extern void hv_hyp_synic_enable_regs(unsigned int cpu); extern int hv_synic_init(unsigned int cpu); -extern void hv_synic_disable_regs(unsigned int cpu); +extern void hv_hyp_synic_disable_regs(unsigned int cpu); extern int hv_synic_cleanup(unsigned int cpu); /* Interface */ @@ -182,7 +202,8 @@ extern int hv_synic_cleanup(unsigned int cpu); void hv_ringbuffer_pre_init(struct vmbus_channel *channel); int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, - struct page *pages, u32 pagecnt, u32 max_pkt_size); + struct page *pages, u32 pagecnt, u32 max_pkt_size, + bool confidential); void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info); @@ -333,6 +354,51 @@ extern const struct vmbus_channel_message_table_entry /* General vmbus interface */ +bool vmbus_is_confidential(void); + +#if IS_ENABLED(CONFIG_HYPERV_VMBUS) +/* Free the message slot and signal end-of-message if required */ +static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) +{ + /* + * On crash we're reading some other CPU's message page and we need + * to be careful: this other CPU may already had cleared the header + * and the host may already had delivered some other message there. + * In case we blindly write msg->header.message_type we're going + * to lose it. We can still lose a message of the same type but + * we count on the fact that there can only be one + * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages + * on crash. + */ + if (cmpxchg(&msg->header.message_type, old_msg_type, + HVMSG_NONE) != old_msg_type) + return; + + /* + * The cmxchg() above does an implicit memory barrier to + * ensure the write to MessageType (ie set to + * HVMSG_NONE) happens before we read the + * MessagePending and EOMing. Otherwise, the EOMing + * will not deliver any more messages since there is + * no empty slot + */ + if (msg->header.message_flags.msg_pending) { + /* + * This will cause message queue rescan to + * possibly deliver another msg from the + * hypervisor + */ + if (vmbus_is_confidential()) + hv_para_set_synic_register(HV_MSR_EOM, 0); + else + hv_set_msr(HV_MSR_EOM, 0); + } +} + +extern int vmbus_interrupt; +extern int vmbus_irq; +#endif /* CONFIG_HYPERV_VMBUS */ + struct hv_device *vmbus_device_create(const guid_t *type, const guid_t *instance, struct vmbus_channel *channel); diff --git a/drivers/hv/mshv_common.c b/drivers/hv/mshv_common.c index aa2be51979fd..58027b23c206 100644 --- a/drivers/hv/mshv_common.c +++ b/drivers/hv/mshv_common.c @@ -14,6 +14,9 @@ #include <asm/mshyperv.h> #include <linux/resume_user_mode.h> #include <linux/export.h> +#include <linux/acpi.h> +#include <linux/notifier.h> +#include <linux/reboot.h> #include "mshv.h" @@ -138,3 +141,99 @@ int hv_call_get_partition_property(u64 partition_id, return 0; } EXPORT_SYMBOL_GPL(hv_call_get_partition_property); + +/* + * Corresponding sleep states have to be initialized in order for a subsequent + * HVCALL_ENTER_SLEEP_STATE call to succeed. Currently only S5 state as per + * ACPI 6.4 chapter 7.4.2 is relevant, while S1, S2 and S3 can be supported. 
+ * + * In order to pass proper PM values to mshv, ACPI should be initialized and + * should support S5 sleep state when this method is invoked. + */ +static int hv_initialize_sleep_states(void) +{ + u64 status; + unsigned long flags; + struct hv_input_set_system_property *in; + acpi_status acpi_status; + u8 sleep_type_a, sleep_type_b; + + if (!acpi_sleep_state_supported(ACPI_STATE_S5)) { + pr_err("%s: S5 sleep state not supported.\n", __func__); + return -ENODEV; + } + + acpi_status = acpi_get_sleep_type_data(ACPI_STATE_S5, &sleep_type_a, + &sleep_type_b); + if (ACPI_FAILURE(acpi_status)) + return -ENODEV; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(in, 0, sizeof(*in)); + + in->property_id = HV_SYSTEM_PROPERTY_SLEEP_STATE; + in->set_sleep_state_info.sleep_state = HV_SLEEP_STATE_S5; + in->set_sleep_state_info.pm1a_slp_typ = sleep_type_a; + in->set_sleep_state_info.pm1b_slp_typ = sleep_type_b; + + status = hv_do_hypercall(HVCALL_SET_SYSTEM_PROPERTY, in, NULL); + local_irq_restore(flags); + + if (!hv_result_success(status)) { + hv_status_err(status, "\n"); + return hv_result_to_errno(status); + } + + return 0; +} + +/* + * This notifier initializes sleep states in mshv hypervisor which will be + * used during power off. + */ +static int hv_reboot_notifier_handler(struct notifier_block *this, + unsigned long code, void *another) +{ + int ret = 0; + + if (code == SYS_HALT || code == SYS_POWER_OFF) + ret = hv_initialize_sleep_states(); + + return ret ? NOTIFY_DONE : NOTIFY_OK; +} + +static struct notifier_block hv_reboot_notifier = { + .notifier_call = hv_reboot_notifier_handler, +}; + +void hv_sleep_notifiers_register(void) +{ + int ret; + + ret = register_reboot_notifier(&hv_reboot_notifier); + if (ret) + pr_err("%s: cannot register reboot notifier %d\n", __func__, + ret); +} + +/* + * Power off the machine by entering S5 sleep state via Hyper-V hypercall. + * This call does not return if successful. 
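(An aside, not part of the patch: hv_machine_power_off(), defined just below, is intended to be installed as the kernel's power-off hook by init code outside this hunk. A hypothetical wiring sketch; the function name here is illustrative:)

static int __init example_mshv_poweroff_setup(void)
{
	/* Arms S5 initialization via the reboot notifier registered above. */
	hv_sleep_notifiers_register();
	/* pm_power_off is the kernel's global power-off hook. */
	pm_power_off = hv_machine_power_off;
	return 0;
}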
+ */ +void hv_machine_power_off(void) +{ + unsigned long flags; + struct hv_input_enter_sleep_state *in; + + local_irq_save(flags); + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + in->sleep_state = HV_SLEEP_STATE_S5; + + (void)hv_do_hypercall(HVCALL_ENTER_SLEEP_STATE, in, NULL); + local_irq_restore(flags); + + /* should never reach here */ + BUG(); + +} diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c index 806674722868..d93a18f09c76 100644 --- a/drivers/hv/mshv_eventfd.c +++ b/drivers/hv/mshv_eventfd.c @@ -163,8 +163,10 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd) if (hv_scheduler_type != HV_SCHEDULER_TYPE_ROOT) return -EOPNOTSUPP; +#if IS_ENABLED(CONFIG_X86) if (irq->lapic_control.logical_dest_mode) return -EOPNOTSUPP; +#endif vp = partition->pt_vp_array[irq->lapic_apic_id]; @@ -196,8 +198,10 @@ static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd) unsigned int seq; int idx; +#if IS_ENABLED(CONFIG_X86) WARN_ON(irqfd->irqfd_resampler && !irq->lapic_control.level_triggered); +#endif idx = srcu_read_lock(&partition->pt_irq_srcu); if (irqfd->irqfd_girq_ent.guest_irq_num) { @@ -469,6 +473,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc); spin_lock_irq(&pt->pt_irqfds_lock); +#if IS_ENABLED(CONFIG_X86) if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) && !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) { /* @@ -479,6 +484,7 @@ static int mshv_irqfd_assign(struct mshv_partition *pt, ret = -EINVAL; goto fail; } +#endif ret = 0; hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) { if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx) @@ -592,7 +598,7 @@ static void mshv_irqfd_release(struct mshv_partition *pt) int mshv_irqfd_wq_init(void) { - irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", 0, 0); + irqfd_cleanup_wq = alloc_workqueue("mshv-irqfd-cleanup", WQ_PERCPU, 0); if (!irqfd_cleanup_wq) return -ENOMEM; diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c index d0fb9ef734f4..798e7e1ab06e 100644 --- a/drivers/hv/mshv_irq.c +++ b/drivers/hv/mshv_irq.c @@ -119,6 +119,10 @@ void mshv_copy_girq_info(struct mshv_guest_irq_ent *ent, lirq->lapic_vector = ent->girq_irq_data & 0xFF; lirq->lapic_apic_id = (ent->girq_addr_lo >> 12) & 0xFF; lirq->lapic_control.interrupt_type = (ent->girq_irq_data & 0x700) >> 8; +#if IS_ENABLED(CONFIG_X86) lirq->lapic_control.level_triggered = (ent->girq_irq_data >> 15) & 0x1; lirq->lapic_control.logical_dest_mode = (ent->girq_addr_lo >> 2) & 0x1; +#elif IS_ENABLED(CONFIG_ARM64) + lirq->lapic_control.asserted = 1; +#endif } diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c new file mode 100644 index 000000000000..202b9d551e39 --- /dev/null +++ b/drivers/hv/mshv_regions.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Microsoft Corporation. + * + * Memory region management for mshv_root module. + * + * Authors: Microsoft Linux virtualization team + */ + +#include <linux/hmm.h> +#include <linux/hyperv.h> +#include <linux/kref.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <asm/mshyperv.h> + +#include "mshv_root.h" + +#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD + +/** + * mshv_region_process_chunk - Processes a contiguous chunk of memory pages + * in a region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. 
+ * @page_count : Number of pages to process. + * @handler : Callback function to handle the chunk. + * + * This function scans the region's pages starting from @page_offset, + * checking for contiguous present pages of the same size (normal or huge). + * It invokes @handler for the chunk of contiguous pages found. Returns the + * number of pages handled, or a negative error code if the first page is + * not present or the handler fails. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Return: Number of pages handled, or negative error code. + */ +static long mshv_region_process_chunk(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + u64 count, stride; + unsigned int page_order; + struct page *page; + int ret; + + page = region->pages[page_offset]; + if (!page) + return -EINVAL; + + page_order = folio_order(page_folio(page)); + /* The hypervisor only supports 4K and 2M page sizes */ + if (page_order && page_order != HPAGE_PMD_ORDER) + return -EINVAL; + + stride = 1 << page_order; + + /* Start at stride since the first page is validated */ + for (count = stride; count < page_count; count += stride) { + page = region->pages[page_offset + count]; + + /* Break if current page is not present */ + if (!page) + break; + + /* Break if page size changes */ + if (page_order != folio_order(page_folio(page))) + break; + } + + ret = handler(region, flags, page_offset, count); + if (ret) + return ret; + + return count; +} + +/** + * mshv_region_process_range - Processes a range of memory pages in a + * region. + * @region : Pointer to the memory region structure. + * @flags : Flags to pass to the handler. + * @page_offset: Offset into the region's pages array to start processing. + * @page_count : Number of pages to process. + * @handler : Callback function to handle each chunk of contiguous + * pages. + * + * Iterates over the specified range of pages in @region, skipping + * non-present pages. For each contiguous chunk of present pages, invokes + * @handler via mshv_region_process_chunk. + * + * Note: The @handler callback must be able to handle both normal and huge + * pages. + * + * Returns 0 on success, or a negative error code on failure. 
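(An aside, not part of the patch: a handler passed to mshv_region_process_range() receives one contiguous run of same-order pages per invocation. A hypothetical handler that only logs, for illustration; real handlers such as mshv_region_chunk_share() issue a hypercall per run:)

static int example_chunk_handler(struct mshv_mem_region *region, u32 flags,
				 u64 page_offset, u64 page_count)
{
	pr_debug("region gfn %#llx: chunk at offset %llu spans %llu pages\n",
		 region->start_gfn, page_offset, page_count);
	return 0;	/* any non-zero return aborts the walk */
}

/* Walk every present page in the region: */
/* ret = mshv_region_process_range(region, 0, 0, region->nr_pages, example_chunk_handler); */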
+ */ +static int mshv_region_process_range(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count, + int (*handler)(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, + u64 page_count)) +{ + long ret; + + if (page_offset + page_count > region->nr_pages) + return -EINVAL; + + while (page_count) { + /* Skip non-present pages */ + if (!region->pages[page_offset]) { + page_offset++; + page_count--; + continue; + } + + ret = mshv_region_process_chunk(region, flags, + page_offset, + page_count, + handler); + if (ret < 0) + return ret; + + page_offset += ret; + page_count -= ret; + } + + return 0; +} + +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags) +{ + struct mshv_mem_region *region; + + region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); + if (!region) + return ERR_PTR(-ENOMEM); + + region->nr_pages = nr_pages; + region->start_gfn = guest_pfn; + region->start_uaddr = uaddr; + region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) + region->hv_map_flags |= HV_MAP_GPA_WRITABLE; + if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) + region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + + kref_init(®ion->refcount); + + return region; +} + +static int mshv_region_chunk_share(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, + HV_MAP_GPA_READABLE | + HV_MAP_GPA_WRITABLE, + flags, true); +} + +int mshv_region_share(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_share); +} + +static int mshv_region_chunk_unshare(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; + + return hv_call_modify_spa_host_access(region->partition->pt_id, + region->pages + page_offset, + page_count, 0, + flags, false); +} + +int mshv_region_unshare(struct mshv_mem_region *region) +{ + u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; + + return mshv_region_process_range(region, flags, + 0, region->nr_pages, + mshv_region_chunk_unshare); +} + +static int mshv_region_chunk_remap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_MAP_GPA_LARGE_PAGE; + + return hv_call_map_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags, + region->pages + page_offset); +} + +static int mshv_region_remap_pages(struct mshv_mem_region *region, + u32 map_flags, + u64 page_offset, u64 page_count) +{ + return mshv_region_process_range(region, map_flags, + page_offset, page_count, + mshv_region_chunk_remap); +} + +int mshv_region_map(struct mshv_mem_region *region) +{ + u32 map_flags = region->hv_map_flags; + + return mshv_region_remap_pages(region, map_flags, + 0, region->nr_pages); +} + +static void mshv_region_invalidate_pages(struct mshv_mem_region *region, + u64 page_offset, u64 
page_count) +{ + if (region->type == MSHV_REGION_TYPE_MEM_PINNED) + unpin_user_pages(region->pages + page_offset, page_count); + + memset(region->pages + page_offset, 0, + page_count * sizeof(struct page *)); +} + +void mshv_region_invalidate(struct mshv_mem_region *region) +{ + mshv_region_invalidate_pages(region, 0, region->nr_pages); +} + +int mshv_region_pin(struct mshv_mem_region *region) +{ + u64 done_count, nr_pages; + struct page **pages; + __u64 userspace_addr; + int ret; + + for (done_count = 0; done_count < region->nr_pages; done_count += ret) { + pages = region->pages + done_count; + userspace_addr = region->start_uaddr + + done_count * HV_HYP_PAGE_SIZE; + nr_pages = min(region->nr_pages - done_count, + MSHV_PIN_PAGES_BATCH_SIZE); + + /* + * Pinning assuming 4k pages works for large pages too. + * All page structs within the large page are returned. + * + * Pin requests are batched because pin_user_pages_fast + * with the FOLL_LONGTERM flag does a large temporary + * allocation of contiguous memory. + */ + ret = pin_user_pages_fast(userspace_addr, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages); + if (ret < 0) + goto release_pages; + } + + return 0; + +release_pages: + mshv_region_invalidate_pages(region, 0, done_count); + return ret; +} + +static int mshv_region_chunk_unmap(struct mshv_mem_region *region, + u32 flags, + u64 page_offset, u64 page_count) +{ + struct page *page = region->pages[page_offset]; + + if (PageHuge(page) || PageTransCompound(page)) + flags |= HV_UNMAP_GPA_LARGE_PAGE; + + return hv_call_unmap_gpa_pages(region->partition->pt_id, + region->start_gfn + page_offset, + page_count, flags); +} + +static int mshv_region_unmap(struct mshv_mem_region *region) +{ + return mshv_region_process_range(region, 0, + 0, region->nr_pages, + mshv_region_chunk_unmap); +} + +static void mshv_region_destroy(struct kref *ref) +{ + struct mshv_mem_region *region = + container_of(ref, struct mshv_mem_region, refcount); + struct mshv_partition *partition = region->partition; + int ret; + + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + mshv_region_movable_fini(region); + + if (mshv_partition_encrypted(partition)) { + ret = mshv_region_share(region); + if (ret) { + pt_err(partition, + "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", + ret); + return; + } + } + + mshv_region_unmap(region); + + mshv_region_invalidate(region); + + vfree(region); +} + +void mshv_region_put(struct mshv_mem_region *region) +{ + kref_put(®ion->refcount, mshv_region_destroy); +} + +int mshv_region_get(struct mshv_mem_region *region) +{ + return kref_get_unless_zero(®ion->refcount); +} + +/** + * mshv_region_hmm_fault_and_lock - Handle HMM faults and lock the memory region + * @region: Pointer to the memory region structure + * @range: Pointer to the HMM range structure + * + * This function performs the following steps: + * 1. Reads the notifier sequence for the HMM range. + * 2. Acquires a read lock on the memory map. + * 3. Handles HMM faults for the specified range. + * 4. Releases the read lock on the memory map. + * 5. If successful, locks the memory region mutex. + * 6. Verifies if the notifier sequence has changed during the operation. + * If it has, releases the mutex and returns -EBUSY to match with + * hmm_range_fault() return code for repeating. + * + * Return: 0 on success, a negative error code otherwise. 
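(An aside, not part of the patch, on the refcounting helpers above: a lookup path is expected to take a reference only while the region is still live and drop it when done. example_lookup_region() is hypothetical:)

struct mshv_mem_region *region = example_lookup_region(partition, gfn);

/* mshv_region_get() is kref_get_unless_zero(): it fails once teardown has begun. */
if (region && mshv_region_get(region)) {
	mshv_region_handle_gfn_fault(region, gfn);
	mshv_region_put(region);	/* the final put calls mshv_region_destroy() */
}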
+ */ +static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region, + struct hmm_range *range) +{ + int ret; + + range->notifier_seq = mmu_interval_read_begin(range->notifier); + mmap_read_lock(region->mni.mm); + ret = hmm_range_fault(range); + mmap_read_unlock(region->mni.mm); + if (ret) + return ret; + + mutex_lock(®ion->mutex); + + if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) { + mutex_unlock(®ion->mutex); + cond_resched(); + return -EBUSY; + } + + return 0; +} + +/** + * mshv_region_range_fault - Handle memory range faults for a given region. + * @region: Pointer to the memory region structure. + * @page_offset: Offset of the page within the region. + * @page_count: Number of pages to handle. + * + * This function resolves memory faults for a specified range of pages + * within a memory region. It uses HMM (Heterogeneous Memory Management) + * to fault in the required pages and updates the region's page array. + * + * Return: 0 on success, negative error code on failure. + */ +static int mshv_region_range_fault(struct mshv_mem_region *region, + u64 page_offset, u64 page_count) +{ + struct hmm_range range = { + .notifier = ®ion->mni, + .default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, + }; + unsigned long *pfns; + int ret; + u64 i; + + pfns = kmalloc_array(page_count, sizeof(*pfns), GFP_KERNEL); + if (!pfns) + return -ENOMEM; + + range.hmm_pfns = pfns; + range.start = region->start_uaddr + page_offset * HV_HYP_PAGE_SIZE; + range.end = range.start + page_count * HV_HYP_PAGE_SIZE; + + do { + ret = mshv_region_hmm_fault_and_lock(region, &range); + } while (ret == -EBUSY); + + if (ret) + goto out; + + for (i = 0; i < page_count; i++) + region->pages[page_offset + i] = hmm_pfn_to_page(pfns[i]); + + ret = mshv_region_remap_pages(region, region->hv_map_flags, + page_offset, page_count); + + mutex_unlock(®ion->mutex); +out: + kfree(pfns); + return ret; +} + +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn) +{ + u64 page_offset, page_count; + int ret; + + /* Align the page offset to the nearest MSHV_MAP_FAULT_IN_PAGES. */ + page_offset = ALIGN_DOWN(gfn - region->start_gfn, + MSHV_MAP_FAULT_IN_PAGES); + + /* Map more pages than requested to reduce the number of faults. */ + page_count = min(region->nr_pages - page_offset, + MSHV_MAP_FAULT_IN_PAGES); + + ret = mshv_region_range_fault(region, page_offset, page_count); + + WARN_ONCE(ret, + "p%llu: GPA intercept failed: region %#llx-%#llx, gfn %#llx, page_offset %llu, page_count %llu\n", + region->partition->pt_id, region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + gfn, page_offset, page_count); + + return !ret; +} + +/** + * mshv_region_interval_invalidate - Invalidate a range of memory region + * @mni: Pointer to the mmu_interval_notifier structure + * @range: Pointer to the mmu_notifier_range structure + * @cur_seq: Current sequence number for the interval notifier + * + * This function invalidates a memory region by remapping its pages with + * no access permissions. It locks the region's mutex to ensure thread safety + * and updates the sequence number for the interval notifier. If the range + * is blockable, it uses a blocking lock; otherwise, it attempts a non-blocking + * lock and returns false if unsuccessful. + * + * NOTE: Failure to invalidate a region is a serious error, as the pages will + * be considered freed while they are still mapped by the hypervisor. + * Any attempt to access such pages will likely crash the system. 
+ * + * Return: true if the region was successfully invalidated, false otherwise. + */ +static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct mshv_mem_region *region = container_of(mni, + struct mshv_mem_region, + mni); + u64 page_offset, page_count; + unsigned long mstart, mend; + int ret = -EPERM; + + if (mmu_notifier_range_blockable(range)) + mutex_lock(®ion->mutex); + else if (!mutex_trylock(®ion->mutex)) + goto out_fail; + + mmu_interval_set_seq(mni, cur_seq); + + mstart = max(range->start, region->start_uaddr); + mend = min(range->end, region->start_uaddr + + (region->nr_pages << HV_HYP_PAGE_SHIFT)); + + page_offset = HVPFN_DOWN(mstart - region->start_uaddr); + page_count = HVPFN_DOWN(mend - mstart); + + ret = mshv_region_remap_pages(region, HV_MAP_GPA_NO_ACCESS, + page_offset, page_count); + if (ret) + goto out_fail; + + mshv_region_invalidate_pages(region, page_offset, page_count); + + mutex_unlock(®ion->mutex); + + return true; + +out_fail: + WARN_ONCE(ret, + "Failed to invalidate region %#llx-%#llx (range %#lx-%#lx, event: %u, pages %#llx-%#llx, mm: %#llx): %d\n", + region->start_uaddr, + region->start_uaddr + (region->nr_pages << HV_HYP_PAGE_SHIFT), + range->start, range->end, range->event, + page_offset, page_offset + page_count - 1, (u64)range->mm, ret); + return false; +} + +static const struct mmu_interval_notifier_ops mshv_region_mni_ops = { + .invalidate = mshv_region_interval_invalidate, +}; + +void mshv_region_movable_fini(struct mshv_mem_region *region) +{ + mmu_interval_notifier_remove(®ion->mni); +} + +bool mshv_region_movable_init(struct mshv_mem_region *region) +{ + int ret; + + ret = mmu_interval_notifier_insert(®ion->mni, current->mm, + region->start_uaddr, + region->nr_pages << HV_HYP_PAGE_SHIFT, + &mshv_region_mni_ops); + if (ret) + return false; + + mutex_init(®ion->mutex); + + return true; +} diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h index e3931b0f1269..3c1d88b36741 100644 --- a/drivers/hv/mshv_root.h +++ b/drivers/hv/mshv_root.h @@ -15,6 +15,7 @@ #include <linux/hashtable.h> #include <linux/dev_printk.h> #include <linux/build_bug.h> +#include <linux/mmu_notifier.h> #include <uapi/linux/mshv.h> /* @@ -70,18 +71,23 @@ do { \ #define vp_info(v, fmt, ...) vp_devprintk(info, v, fmt, ##__VA_ARGS__) #define vp_dbg(v, fmt, ...) 
vp_devprintk(dbg, v, fmt, ##__VA_ARGS__) +enum mshv_region_type { + MSHV_REGION_TYPE_MEM_PINNED, + MSHV_REGION_TYPE_MEM_MOVABLE, + MSHV_REGION_TYPE_MMIO +}; + struct mshv_mem_region { struct hlist_node hnode; + struct kref refcount; u64 nr_pages; u64 start_gfn; u64 start_uaddr; u32 hv_map_flags; - struct { - u64 large_pages: 1; /* 2MiB */ - u64 range_pinned: 1; - u64 reserved: 62; - } flags; struct mshv_partition *partition; + enum mshv_region_type type; + struct mmu_interval_notifier mni; + struct mutex mutex; /* protects region pages remapping */ struct page *pages[]; }; @@ -98,6 +104,8 @@ struct mshv_partition { u64 pt_id; refcount_t pt_ref_count; struct mutex pt_mutex; + + spinlock_t pt_mem_regions_lock; struct hlist_head pt_mem_regions; // not ordered u32 pt_vp_count; @@ -169,7 +177,7 @@ struct mshv_girq_routing_table { }; struct hv_synic_pages { - struct hv_message_page *synic_message_page; + struct hv_message_page *hyp_synic_message_page; struct hv_synic_event_flags_page *synic_event_flags_page; struct hv_synic_event_ring_page *synic_event_ring_page; }; @@ -178,6 +186,7 @@ struct mshv_root { struct hv_synic_pages __percpu *synic_pages; spinlock_t pt_ht_lock; DECLARE_HASHTABLE(pt_htable, MSHV_PARTITIONS_HASH_BITS); + struct hv_partition_property_vmm_capabilities vmm_caps; }; /* @@ -278,11 +287,12 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id, /* Choose between pages and bytes */ struct hv_vp_state_data state_data, u64 page_count, struct page **pages, u32 num_bytes, u8 *bytes); -int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl, - struct page **state_page); -int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl); +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page); +int hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, + union hv_input_vtl input_vtl); int hv_call_create_port(u64 port_partition_id, union hv_port_id port_id, u64 connection_partition_id, struct hv_port_info *port_info, u8 port_vtl, u8 min_connection_vtl, int node); @@ -295,17 +305,32 @@ int hv_call_connect_port(u64 port_partition_id, union hv_port_id port_id, int hv_call_disconnect_port(u64 connection_partition_id, union hv_connection_id connection_id); int hv_call_notify_port_ring_empty(u32 sint_index); -int hv_call_map_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr); -int hv_call_unmap_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity); +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr); +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity); int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire); +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, u64 arg, + void *property_value, size_t property_value_sz); extern struct mshv_root mshv_root; extern enum hv_scheduler_type hv_scheduler_type; extern u8 * __percpu *hv_synic_eventring_tail; +struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages, + u64 uaddr, u32 flags); +int mshv_region_share(struct mshv_mem_region *region); +int mshv_region_unshare(struct mshv_mem_region *region); +int 
mshv_region_map(struct mshv_mem_region *region); +void mshv_region_invalidate(struct mshv_mem_region *region); +int mshv_region_pin(struct mshv_mem_region *region); +void mshv_region_put(struct mshv_mem_region *region); +int mshv_region_get(struct mshv_mem_region *region); +bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn); +void mshv_region_movable_fini(struct mshv_mem_region *region); +bool mshv_region_movable_init(struct mshv_mem_region *region); + #endif /* _MSHV_ROOT_H_ */ diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c index c9c274f29c3c..598eaff4ff29 100644 --- a/drivers/hv/mshv_root_hv_call.c +++ b/drivers/hv/mshv_root_hv_call.c @@ -388,7 +388,13 @@ int hv_call_assert_virtual_interrupt(u64 partition_id, u32 vector, memset(input, 0, sizeof(*input)); input->partition_id = partition_id; input->vector = vector; + /* + * NOTE: dest_addr only needs to be provided while asserting an + * interrupt on x86 platform + */ +#if IS_ENABLED(CONFIG_X86) input->dest_addr = dest_addr; +#endif input->control = control; status = hv_do_hypercall(HVCALL_ASSERT_VIRTUAL_INTERRUPT, input, NULL); local_irq_restore(flags); @@ -526,9 +532,9 @@ int hv_call_set_vp_state(u32 vp_index, u64 partition_id, return ret; } -int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl, - struct page **state_page) +static int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) { struct hv_input_map_vp_state_page *input; struct hv_output_map_vp_state_page *output; @@ -542,12 +548,20 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, input = *this_cpu_ptr(hyperv_pcpu_input_arg); output = *this_cpu_ptr(hyperv_pcpu_output_arg); + memset(input, 0, sizeof(*input)); input->partition_id = partition_id; input->vp_index = vp_index; input->type = type; input->input_vtl = input_vtl; - status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, output); + if (*state_page) { + input->flags.map_location_provided = 1; + input->requested_map_location = + page_to_pfn(*state_page); + } + + status = hv_do_hypercall(HVCALL_MAP_VP_STATE_PAGE, input, + output); if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { if (hv_result_success(status)) @@ -565,8 +579,41 @@ int hv_call_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, return ret; } -int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, - union hv_input_vtl input_vtl) +static bool mshv_use_overlay_gpfn(void) +{ + return hv_l1vh_partition() && + mshv_root.vmm_caps.vmm_can_provide_overlay_gpfn; +} + +int hv_map_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl, + struct page **state_page) +{ + int ret = 0; + struct page *allocated_page = NULL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + *state_page = allocated_page; + } else { + *state_page = NULL; + } + + ret = hv_call_map_vp_state_page(partition_id, vp_index, type, input_vtl, + state_page); + + if (ret && allocated_page) { + __free_page(allocated_page); + *state_page = NULL; + } + + return ret; +} + +static int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + union hv_input_vtl input_vtl) { unsigned long flags; u64 status; @@ -590,6 +637,48 @@ int hv_call_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, return hv_result_to_errno(status); } +int 
hv_unmap_vp_state_page(u64 partition_id, u32 vp_index, u32 type, + struct page *state_page, union hv_input_vtl input_vtl) +{ + int ret = hv_call_unmap_vp_state_page(partition_id, vp_index, type, input_vtl); + + if (mshv_use_overlay_gpfn() && state_page) + __free_page(state_page); + + return ret; +} + +int hv_call_get_partition_property_ex(u64 partition_id, u64 property_code, + u64 arg, void *property_value, + size_t property_value_sz) +{ + u64 status; + unsigned long flags; + struct hv_input_get_partition_property_ex *input; + struct hv_output_get_partition_property_ex *output; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->partition_id = partition_id; + input->property_code = property_code; + input->arg = arg; + status = hv_do_hypercall(HVCALL_GET_PARTITION_PROPERTY_EX, input, output); + + if (!hv_result_success(status)) { + local_irq_restore(flags); + hv_status_debug(status, "\n"); + return hv_result_to_errno(status); + } + memcpy(property_value, &output->property_value, property_value_sz); + + local_irq_restore(flags); + + return 0; +} + int hv_call_clear_virtual_interrupt(u64 partition_id) { @@ -724,9 +813,51 @@ hv_call_notify_port_ring_empty(u32 sint_index) return hv_result_to_errno(status); } -int hv_call_map_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity, - void **addr) +static int hv_call_map_stats_page2(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + u64 map_location) +{ + unsigned long flags; + struct hv_input_map_stats_page2 *input; + u64 status; + int ret; + + if (!map_location || !mshv_use_overlay_gpfn()) + return -EINVAL; + + do { + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + input->type = type; + input->identity = *identity; + input->map_location = map_location; + + status = hv_do_hypercall(HVCALL_MAP_STATS_PAGE2, input, NULL); + + local_irq_restore(flags); + + ret = hv_result_to_errno(status); + + if (!ret) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { + hv_status_debug(status, "\n"); + break; + } + + ret = hv_call_deposit_pages(NUMA_NO_NODE, + hv_current_partition_id, 1); + } while (!ret); + + return ret; +} + +static int hv_call_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) { unsigned long flags; struct hv_input_map_stats_page *input; @@ -765,8 +896,38 @@ int hv_call_map_stat_page(enum hv_stats_object_type type, return ret; } -int hv_call_unmap_stat_page(enum hv_stats_object_type type, - const union hv_stats_object_identity *identity) +int hv_map_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity, + void **addr) +{ + int ret; + struct page *allocated_page = NULL; + + if (!addr) + return -EINVAL; + + if (mshv_use_overlay_gpfn()) { + allocated_page = alloc_page(GFP_KERNEL); + if (!allocated_page) + return -ENOMEM; + + ret = hv_call_map_stats_page2(type, identity, + page_to_pfn(allocated_page)); + *addr = page_address(allocated_page); + } else { + ret = hv_call_map_stats_page(type, identity, addr); + } + + if (ret && allocated_page) { + __free_page(allocated_page); + *addr = NULL; + } + + return ret; +} + +static int hv_call_unmap_stats_page(enum hv_stats_object_type type, + const union hv_stats_object_identity *identity) { unsigned long flags; struct hv_input_unmap_stats_page 
*input; @@ -785,6 +946,19 @@ int hv_call_unmap_stat_page(enum hv_stats_object_type type, return hv_result_to_errno(status); } +int hv_unmap_stats_page(enum hv_stats_object_type type, void *page_addr, + const union hv_stats_object_identity *identity) +{ + int ret; + + ret = hv_call_unmap_stats_page(type, identity); + + if (mshv_use_overlay_gpfn() && page_addr) + __free_page(virt_to_page(page_addr)); + + return ret; +} + int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages, u64 page_struct_count, u32 host_access, u32 flags, u8 acquire) diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c index 1d8d8d00e4e0..1134a82c7881 100644 --- a/drivers/hv/mshv_root_main.c +++ b/drivers/hv/mshv_root_main.c @@ -42,7 +42,7 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv"); /* TODO move this to another file when debugfs code is added */ enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */ #if defined(CONFIG_X86) - VpRootDispatchThreadBlocked = 201, + VpRootDispatchThreadBlocked = 202, #elif defined(CONFIG_ARM64) VpRootDispatchThreadBlocked = 94, #endif @@ -123,6 +123,7 @@ static struct miscdevice mshv_dev = { */ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, HVCALL_SET_PARTITION_PROPERTY, HVCALL_INSTALL_INTERCEPT, HVCALL_GET_VP_REGISTERS, @@ -137,6 +138,16 @@ static u16 mshv_passthru_hvcalls[] = { HVCALL_GET_VP_CPUID_VALUES, }; +/* + * Only allow hypercalls that are safe to be called by the VMM with the host + * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a + * hypercall cannot be misused by the VMM before adding it to this list. + */ +static u16 mshv_self_passthru_hvcalls[] = { + HVCALL_GET_PARTITION_PROPERTY, + HVCALL_GET_PARTITION_PROPERTY_EX, +}; + static bool mshv_hvcall_is_async(u16 code) { switch (code) { @@ -148,18 +159,38 @@ static bool mshv_hvcall_is_async(u16 code) return false; } +static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id) +{ + int i; + int n = ARRAY_SIZE(mshv_passthru_hvcalls); + u16 *allowed_hvcalls = mshv_passthru_hvcalls; + + if (pt_id == HV_PARTITION_ID_SELF) { + n = ARRAY_SIZE(mshv_self_passthru_hvcalls); + allowed_hvcalls = mshv_self_passthru_hvcalls; + } + + for (i = 0; i < n; ++i) + if (allowed_hvcalls[i] == code) + return true; + + return false; +} + static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, bool partition_locked, void __user *user_args) { u64 status; - int ret = 0, i; + int ret = 0; bool is_async; struct mshv_root_hvcall args; struct page *page; unsigned int pages_order; void *input_pg = NULL; void *output_pg = NULL; + u16 reps_completed; + u64 pt_id = partition ? 
partition->pt_id : HV_PARTITION_ID_SELF; if (copy_from_user(&args, user_args, sizeof(args))) return -EFAULT; @@ -171,17 +202,13 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE)) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i) - if (args.code == mshv_passthru_hvcalls[i]) - break; - - if (i >= ARRAY_SIZE(mshv_passthru_hvcalls)) + if (!mshv_passthru_hvcall_allowed(args.code, pt_id)) return -EINVAL; is_async = mshv_hvcall_is_async(args.code); if (is_async) { /* async hypercalls can only be called from partition fd */ - if (!partition_locked) + if (!partition || !partition_locked) return -EINVAL; ret = mshv_init_async_handler(partition); if (ret) @@ -209,43 +236,44 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition, * NOTE: This only works because all the allowed hypercalls' input * structs begin with a u64 partition_id field. */ - *(u64 *)input_pg = partition->pt_id; + *(u64 *)input_pg = pt_id; - if (args.reps) - status = hv_do_rep_hypercall(args.code, args.reps, 0, - input_pg, output_pg); - else - status = hv_do_hypercall(args.code, input_pg, output_pg); - - if (hv_result(status) == HV_STATUS_CALL_PENDING) { - if (is_async) { - mshv_async_hvcall_handler(partition, &status); - } else { /* Paranoia check. This shouldn't happen! */ - ret = -EBADFD; - goto free_pages_out; + reps_completed = 0; + do { + if (args.reps) { + status = hv_do_rep_hypercall_ex(args.code, args.reps, + 0, reps_completed, + input_pg, output_pg); + reps_completed = hv_repcomp(status); + } else { + status = hv_do_hypercall(args.code, input_pg, output_pg); } - } - if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) { - ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1); - if (!ret) - ret = -EAGAIN; - } else if (!hv_result_success(status)) { - ret = hv_result_to_errno(status); - } + if (hv_result(status) == HV_STATUS_CALL_PENDING) { + if (is_async) { + mshv_async_hvcall_handler(partition, &status); + } else { /* Paranoia check. This shouldn't happen! */ + ret = -EBADFD; + goto free_pages_out; + } + } + + if (hv_result_success(status)) + break; + + if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) + ret = hv_result_to_errno(status); + else + ret = hv_call_deposit_pages(NUMA_NO_NODE, + pt_id, 1); + } while (!ret); - /* - * Always return the status and output data regardless of result. - * The VMM may need it to determine how to proceed. E.g. the status may - * contain the number of reps completed if a rep hypercall partially - * succeeded. - */ args.status = hv_result(status); - args.reps = args.reps ? 
hv_repcomp(status) : 0; + args.reps = reps_completed; if (copy_to_user(user_args, &args, sizeof(args))) ret = -EFAULT; - if (output_pg && + if (!ret && output_pg && copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz)) ret = -EFAULT; @@ -569,14 +597,98 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp) static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ, "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ"); +static struct mshv_mem_region * +mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) +{ + struct mshv_mem_region *region; + + hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { + if (gfn >= region->start_gfn && + gfn < region->start_gfn + region->nr_pages) + return region; + } + + return NULL; +} + +#ifdef CONFIG_X86_64 +static struct mshv_mem_region * +mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn) +{ + struct mshv_mem_region *region; + + spin_lock(&p->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(p, gfn); + if (!region || !mshv_region_get(region)) { + spin_unlock(&p->pt_mem_regions_lock); + return NULL; + } + spin_unlock(&p->pt_mem_regions_lock); + + return region; +} + +/** + * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts. + * @vp: Pointer to the virtual processor structure. + * + * This function processes GPA intercepts by identifying the memory region + * corresponding to the intercepted GPA, aligning the page offset, and + * mapping the required pages. It ensures that the region is valid and + * handles faults efficiently by mapping multiple pages at once. + * + * Return: true if the intercept was handled successfully, false otherwise. + */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) +{ + struct mshv_partition *p = vp->vp_partition; + struct mshv_mem_region *region; + struct hv_x64_memory_intercept_message *msg; + bool ret; + u64 gfn; + + msg = (struct hv_x64_memory_intercept_message *) + vp->vp_intercept_msg_page->u.payload; + + gfn = HVPFN_DOWN(msg->guest_physical_address); + + region = mshv_partition_region_by_gfn_get(p, gfn); + if (!region) + return false; + + /* Only movable memory ranges are supported for GPA intercepts */ + if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE) + ret = mshv_region_handle_gfn_fault(region, gfn); + else + ret = false; + + mshv_region_put(region); + + return ret; +} +#else /* CONFIG_X86_64 */ +static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; } +#endif /* CONFIG_X86_64 */ + +static bool mshv_vp_handle_intercept(struct mshv_vp *vp) +{ + switch (vp->vp_intercept_msg_page->header.message_type) { + case HVMSG_GPA_INTERCEPT: + return mshv_handle_gpa_intercept(vp); + } + return false; +} + static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg) { long rc; - if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) - rc = mshv_run_vp_with_root_scheduler(vp); - else - rc = mshv_run_vp_with_hyp_scheduler(vp); + do { + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + rc = mshv_run_vp_with_root_scheduler(vp); + else + rc = mshv_run_vp_with_hyp_scheduler(vp); + } while (rc == 0 && mshv_vp_handle_intercept(vp)); if (rc) return rc; @@ -844,7 +956,8 @@ mshv_vp_release(struct inode *inode, struct file *filp) return 0; } -static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) +static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index, + void *stats_pages[]) { union hv_stats_object_identity identity = { .vp.partition_id = partition_id, @@ -852,10 
+965,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index) }; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); } static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, @@ -868,14 +981,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, int err; identity.vp.stats_area_type = HV_STATS_AREA_SELF; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_SELF]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_SELF]); if (err) return err; identity.vp.stats_area_type = HV_STATS_AREA_PARENT; - err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity, - &stats_pages[HV_STATS_AREA_PARENT]); + err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity, + &stats_pages[HV_STATS_AREA_PARENT]); if (err) goto unmap_self; @@ -883,7 +996,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index, unmap_self: identity.vp.stats_area_type = HV_STATS_AREA_SELF; - hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity); + hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity); return err; } @@ -893,7 +1006,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, { struct mshv_create_vp args; struct mshv_vp *vp; - struct page *intercept_message_page, *register_page, *ghcb_page; + struct page *intercept_msg_page, *register_page, *ghcb_page; void *stats_pages[2]; long ret; @@ -911,33 +1024,34 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition, if (ret) return ret; - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero, - &intercept_message_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + input_vtl_zero, &intercept_msg_page); if (ret) goto destroy_vp; if (!mshv_partition_encrypted(partition)) { - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero, - &register_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + input_vtl_zero, &register_page); if (ret) goto unmap_intercept_message_page; } if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { - ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal, - &ghcb_page); + ret = hv_map_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, + input_vtl_normal, &ghcb_page); if (ret) goto unmap_register_page; } - if (hv_parent_partition()) { + /* + * This mapping of the stats page is for detecting if the dispatch + * thread is blocked - only relevant for the root scheduler + */ + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) { ret = mshv_vp_stats_map(partition->pt_id, args.vp_index, stats_pages); if (ret) @@ -959,14 +1073,14 @@ atomic64_set(&vp->run.vp_signaled_count, 0); vp->vp_index = args.vp_index; - vp->vp_intercept_msg_page = page_to_virt(intercept_message_page); + vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page); if (!mshv_partition_encrypted(partition)) vp->vp_register_page = page_to_virt(register_page); if (mshv_partition_encrypted(partition) && 
is_ghcb_mapping_available()) vp->vp_ghcb_page = page_to_virt(ghcb_page); - if (hv_parent_partition()) + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages)); /* @@ -989,24 +1103,22 @@ put_partition: free_vp: kfree(vp); unmap_stats_pages: - if (hv_parent_partition()) - mshv_vp_stats_unmap(partition->pt_id, args.vp_index); + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages); unmap_ghcb_page: - if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) { - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal); - } + if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_GHCB, ghcb_page, + input_vtl_normal); unmap_register_page: - if (!mshv_partition_encrypted(partition)) { - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero); - } + if (!mshv_partition_encrypted(partition)) + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_REGISTERS, + register_page, input_vtl_zero); unmap_intercept_message_page: - hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero); + hv_unmap_vp_state_page(partition->pt_id, args.vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + intercept_msg_page, input_vtl_zero); destroy_vp: hv_call_delete_vp(partition->pt_id, args.vp_index); return ret; @@ -1034,162 +1146,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status) *status = partition->async_hypercall_status; } -static int -mshv_partition_region_share(struct mshv_mem_region *region) -{ - u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED; - - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; - - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE, - flags, true); -} - -static int -mshv_partition_region_unshare(struct mshv_mem_region *region) -{ - u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE; - - if (region->flags.large_pages) - flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE; - - return hv_call_modify_spa_host_access(region->partition->pt_id, - region->pages, region->nr_pages, - 0, - flags, false); -} - -static int -mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags, - u64 page_offset, u64 page_count) -{ - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - if (region->flags.large_pages) - map_flags |= HV_MAP_GPA_LARGE_PAGE; - - /* ask the hypervisor to map guest ram */ - return hv_call_map_gpa_pages(region->partition->pt_id, - region->start_gfn + page_offset, - page_count, map_flags, - region->pages + page_offset); -} - -static int -mshv_region_map(struct mshv_mem_region *region) -{ - u32 map_flags = region->hv_map_flags; - - return mshv_region_remap_pages(region, map_flags, - 0, region->nr_pages); -} - -static void -mshv_region_evict_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) -{ - if (region->flags.range_pinned) - unpin_user_pages(region->pages + page_offset, page_count); - - memset(region->pages + page_offset, 0, - page_count * sizeof(struct page *)); -} - -static void -mshv_region_evict(struct mshv_mem_region *region) -{ - mshv_region_evict_pages(region, 0, region->nr_pages); 
-} - -static int -mshv_region_populate_pages(struct mshv_mem_region *region, - u64 page_offset, u64 page_count) -{ - u64 done_count, nr_pages; - struct page **pages; - __u64 userspace_addr; - int ret; - - if (page_offset + page_count > region->nr_pages) - return -EINVAL; - - for (done_count = 0; done_count < page_count; done_count += ret) { - pages = region->pages + page_offset + done_count; - userspace_addr = region->start_uaddr + - (page_offset + done_count) * - HV_HYP_PAGE_SIZE; - nr_pages = min(page_count - done_count, - MSHV_PIN_PAGES_BATCH_SIZE); - - /* - * Pinning assuming 4k pages works for large pages too. - * All page structs within the large page are returned. - * - * Pin requests are batched because pin_user_pages_fast - * with the FOLL_LONGTERM flag does a large temporary - * allocation of contiguous memory. - */ - if (region->flags.range_pinned) - ret = pin_user_pages_fast(userspace_addr, - nr_pages, - FOLL_WRITE | FOLL_LONGTERM, - pages); - else - ret = -EOPNOTSUPP; - - if (ret < 0) - goto release_pages; - } - - if (PageHuge(region->pages[page_offset])) - region->flags.large_pages = true; - - return 0; - -release_pages: - mshv_region_evict_pages(region, page_offset, done_count); - return ret; -} - -static int -mshv_region_populate(struct mshv_mem_region *region) -{ - return mshv_region_populate_pages(region, 0, region->nr_pages); -} - -static struct mshv_mem_region * -mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn) -{ - struct mshv_mem_region *region; - - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { - if (gfn >= region->start_gfn && - gfn < region->start_gfn + region->nr_pages) - return region; - } - - return NULL; -} - -static struct mshv_mem_region * -mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr) -{ - struct mshv_mem_region *region; - - hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) { - if (uaddr >= region->start_uaddr && - uaddr < region->start_uaddr + - (region->nr_pages << HV_HYP_PAGE_SHIFT)) - return region; - } - - return NULL; -} - /* * NB: caller checks and makes sure mem->size is page aligned * Returns: 0 with regionpp updated on success, or -errno @@ -1199,53 +1155,61 @@ static int mshv_partition_create_region(struct mshv_partition *partition, struct mshv_mem_region **regionpp, bool is_mmio) { - struct mshv_mem_region *region; + struct mshv_mem_region *rg; u64 nr_pages = HVPFN_DOWN(mem->size); /* Reject overlapping regions */ - if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) || - mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) || - mshv_partition_region_by_uaddr(partition, mem->userspace_addr) || - mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1)) + spin_lock(&partition->pt_mem_regions_lock); + hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) { + if (mem->guest_pfn + nr_pages <= rg->start_gfn || + rg->start_gfn + rg->nr_pages <= mem->guest_pfn) + continue; + spin_unlock(&partition->pt_mem_regions_lock); return -EEXIST; + } + spin_unlock(&partition->pt_mem_regions_lock); - region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages); - if (!region) - return -ENOMEM; - - region->nr_pages = nr_pages; - region->start_gfn = mem->guest_pfn; - region->start_uaddr = mem->userspace_addr; - region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE; - if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE)) - region->hv_map_flags |= HV_MAP_GPA_WRITABLE; - if (mem->flags & 
BIT(MSHV_SET_MEM_BIT_EXECUTABLE)) - region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE; + rg = mshv_region_create(mem->guest_pfn, nr_pages, + mem->userspace_addr, mem->flags); + if (IS_ERR(rg)) + return PTR_ERR(rg); - /* Note: large_pages flag populated when we pin the pages */ - if (!is_mmio) - region->flags.range_pinned = true; + if (is_mmio) + rg->type = MSHV_REGION_TYPE_MMIO; + else if (mshv_partition_encrypted(partition) || + !mshv_region_movable_init(rg)) + rg->type = MSHV_REGION_TYPE_MEM_PINNED; + else + rg->type = MSHV_REGION_TYPE_MEM_MOVABLE; - region->partition = partition; + rg->partition = partition; - *regionpp = region; + *regionpp = rg; return 0; } -/* - * Map guest ram. if snp, make sure to release that from the host first - * Side Effects: In case of failure, pages are unpinned when feasible. +/** + * mshv_prepare_pinned_region - Pin and map memory regions + * @region: Pointer to the memory region structure + * + * This function processes memory regions that are explicitly marked as pinned. + * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based + * population. The function ensures the region is properly populated, handles + * encryption requirements for SNP partitions if applicable, maps the region, + * and performs necessary sharing or eviction operations based on the mapping + * result. + * + * Return: 0 on success, negative error code on failure. */ -static int -mshv_partition_mem_region_map(struct mshv_mem_region *region) +static int mshv_prepare_pinned_region(struct mshv_mem_region *region) { struct mshv_partition *partition = region->partition; int ret; - ret = mshv_region_populate(region); + ret = mshv_region_pin(region); if (ret) { - pt_err(partition, "Failed to populate memory region: %d\n", + pt_err(partition, "Failed to pin memory region: %d\n", ret); goto err_out; } @@ -1258,12 +1222,12 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) * access to guest memory regions. */ if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_unshare(region); + ret = mshv_region_unshare(region); if (ret) { pt_err(partition, "Failed to unshare memory region (guest_pfn: %llu): %d\n", region->start_gfn, ret); - goto evict_region; + goto invalidate_region; } } @@ -1271,9 +1235,9 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) if (ret && mshv_partition_encrypted(partition)) { int shrc; - shrc = mshv_partition_region_share(region); + shrc = mshv_region_share(region); if (!shrc) - goto evict_region; + goto invalidate_region; pt_err(partition, "Failed to share memory region (guest_pfn: %llu): %d\n", @@ -1287,8 +1251,8 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region) return 0; -evict_region: - mshv_region_evict(region); +invalidate_region: + mshv_region_invalidate(region); err_out: return ret; } @@ -1333,17 +1297,35 @@ mshv_map_user_memory(struct mshv_partition *partition, if (ret) return ret; - if (is_mmio) - ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn, - mmio_pfn, HVPFN_DOWN(mem.size)); - else - ret = mshv_partition_mem_region_map(region); + switch (region->type) { + case MSHV_REGION_TYPE_MEM_PINNED: + ret = mshv_prepare_pinned_region(region); + break; + case MSHV_REGION_TYPE_MEM_MOVABLE: + /* + * For movable memory regions, remap with no access to let + * the hypervisor track dirty pages, enabling pre-copy live + * migration. 
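+ * Pages are then faulted in on demand: a guest access raises a GPA
+ * intercept, and mshv_handle_gpa_intercept() populates and maps the
+ * faulting range via mshv_region_handle_gfn_fault().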
+ */ + ret = hv_call_map_gpa_pages(partition->pt_id, + region->start_gfn, + region->nr_pages, + HV_MAP_GPA_NO_ACCESS, NULL); + break; + case MSHV_REGION_TYPE_MMIO: + ret = hv_call_map_mmio_pages(partition->pt_id, + region->start_gfn, + mmio_pfn, + region->nr_pages); + break; + } if (ret) goto errout; - /* Install the new region */ + spin_lock(&partition->pt_mem_regions_lock); hlist_add_head(&region->hnode, &partition->pt_mem_regions); + spin_unlock(&partition->pt_mem_regions_lock); return 0; @@ -1358,33 +1340,32 @@ mshv_unmap_user_memory(struct mshv_partition *partition, struct mshv_user_mem_region mem) { struct mshv_mem_region *region; - u32 unmap_flags = 0; if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP))) return -EINVAL; + spin_lock(&partition->pt_mem_regions_lock); + region = mshv_partition_region_by_gfn(partition, mem.guest_pfn); - if (!region) - return -EINVAL; + if (!region) { + spin_unlock(&partition->pt_mem_regions_lock); + return -ENOENT; + } /* Paranoia check */ if (region->start_uaddr != mem.userspace_addr || region->start_gfn != mem.guest_pfn || - region->nr_pages != HVPFN_DOWN(mem.size)) + region->nr_pages != HVPFN_DOWN(mem.size)) { + spin_unlock(&partition->pt_mem_regions_lock); + return -EINVAL; + } hlist_del(&region->hnode); - if (region->flags.large_pages) - unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE; - - /* ignore unmap failures and continue as process may be exiting */ - hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn, - region->nr_pages, unmap_flags); + spin_unlock(&partition->pt_mem_regions_lock); - mshv_region_evict(region); + mshv_region_put(region); - vfree(region); return 0; } @@ -1720,8 +1701,8 @@ static void destroy_partition(struct mshv_partition *partition) { struct mshv_vp *vp; struct mshv_mem_region *region; - int i, ret; struct hlist_node *n; + int i; if (refcount_read(&partition->pt_ref_count)) { pt_err(partition, @@ -1743,28 +1724,32 @@ static void destroy_partition(struct mshv_partition *partition) if (!vp) continue; - if (hv_parent_partition()) - mshv_vp_stats_unmap(partition->pt_id, vp->vp_index); + if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) + mshv_vp_stats_unmap(partition->pt_id, vp->vp_index, + (void **)vp->vp_stats_pages); if (vp->vp_register_page) { - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_REGISTERS, - input_vtl_zero); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_REGISTERS, + virt_to_page(vp->vp_register_page), + input_vtl_zero); vp->vp_register_page = NULL; } - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, - input_vtl_zero); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_INTERCEPT_MESSAGE, + virt_to_page(vp->vp_intercept_msg_page), + input_vtl_zero); vp->vp_intercept_msg_page = NULL; if (vp->vp_ghcb_page) { - (void)hv_call_unmap_vp_state_page(partition->pt_id, - vp->vp_index, - HV_VP_STATE_PAGE_GHCB, - input_vtl_normal); + (void)hv_unmap_vp_state_page(partition->pt_id, + vp->vp_index, + HV_VP_STATE_PAGE_GHCB, + virt_to_page(vp->vp_ghcb_page), + input_vtl_normal); vp->vp_ghcb_page = NULL; } @@ -1781,24 +1766,10 @@ static void destroy_partition(struct mshv_partition *partition) remove_partition(partition); - /* Remove regions, regain access to the memory and unpin the pages */ hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions, hnode) { hlist_del(&region->hnode); - - if (mshv_partition_encrypted(partition)) { - ret = mshv_partition_region_share(region); - 
if (ret) { - pt_err(partition, - "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n", - ret); - return; - } - } - - mshv_region_evict(region); - - vfree(region); + mshv_region_put(region); } /* Withdraw and free all pages we deposited */ @@ -1865,41 +1836,117 @@ add_partition(struct mshv_partition *partition) return 0; } -static long -mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +static_assert(MSHV_NUM_CPU_FEATURES_BANKS == + HV_PARTITION_PROCESSOR_FEATURES_BANKS); + +static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags, + struct hv_partition_creation_properties *cr_props, + union hv_partition_isolation_properties *isol_props) { - struct mshv_create_partition args; - u64 creation_flags; - struct hv_partition_creation_properties creation_properties = {}; - union hv_partition_isolation_properties isolation_properties = {}; - struct mshv_partition *partition; - long ret; + int i; + struct mshv_create_partition_v2 args; + union hv_partition_processor_features *disabled_procs; + union hv_partition_processor_xsave_features *disabled_xsave; - if (copy_from_user(&args, user_arg, sizeof(args))) + /* First, copy v1 struct in case user is on previous versions */ + if (copy_from_user(&args, user_arg, + sizeof(struct mshv_create_partition))) return -EFAULT; if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) return -EINVAL; + disabled_procs = &cr_props->disabled_processor_features; + disabled_xsave = &cr_props->disabled_processor_xsave_features; + + /* Check if user provided newer struct with feature fields */ + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) { + if (copy_from_user(&args, user_arg, sizeof(args))) + return -EFAULT; + + /* Re-validate v1 fields after second copy_from_user() */ + if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) || + args.pt_isolation >= MSHV_PT_ISOLATION_COUNT) + return -EINVAL; + + if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS || + mshv_field_nonzero(args, pt_rsvd) || + mshv_field_nonzero(args, pt_rsvd1)) + return -EINVAL; + + /* + * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never + * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS + * (i.e. 2). + * + * Further banks (index >= 2) will be modifiable as 'early' + * properties via the set partition property hypercall. + */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i]; + +#if IS_ENABLED(CONFIG_X86_64) + disabled_xsave->as_uint64 = args.pt_disabled_xsave; +#else + /* + * In practice this field is ignored on arm64, but safer to + * zero it in case it is ever used. + */ + disabled_xsave->as_uint64 = 0; + + if (mshv_field_nonzero(args, pt_rsvd2)) + return -EINVAL; +#endif + } else { + /* + * v1 behavior: try to enable everything. The hypervisor will + * disable features that are not supported. The banks can be + * queried via the get partition property hypercall. 
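+ *
+ * For contrast, a VMM opting in to the v2 fields sets
+ * MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES and passes the larger struct.
+ * An illustrative userspace sketch (error handling omitted; mshv_fd
+ * is assumed to be an open /dev/mshv descriptor, and the all-zero
+ * feature banks again mean "disable nothing"):
+ *
+ *	struct mshv_create_partition_v2 args = {};
+ *
+ *	args.pt_flags |= BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES);
+ *	args.pt_num_cpu_fbanks = MSHV_NUM_CPU_FEATURES_BANKS;
+ *	ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);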
+ */ + for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++) + disabled_procs->as_uint64[i] = 0; + + disabled_xsave->as_uint64 = 0; + } + /* Only support EXO partitions */ - creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | - HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION | + HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED; + + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; + if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES)) + *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED; - if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC)) - creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE; - if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES)) - creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED; + isol_props->as_uint64 = 0; switch (args.pt_isolation) { case MSHV_PT_ISOLATION_NONE: - isolation_properties.isolation_type = - HV_PARTITION_ISOLATION_TYPE_NONE; + isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE; break; } + return 0; +} + +static long +mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) +{ + u64 creation_flags; + struct hv_partition_creation_properties creation_properties; + union hv_partition_isolation_properties isolation_properties; + struct mshv_partition *partition; + long ret; + + ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags, + &creation_properties, + &isolation_properties); + if (ret) + return ret; + partition = kzalloc(sizeof(*partition), GFP_KERNEL); if (!partition) return -ENOMEM; @@ -1919,6 +1966,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev) INIT_HLIST_HEAD(&partition->pt_devices); + spin_lock_init(&partition->pt_mem_regions_lock); INIT_HLIST_HEAD(&partition->pt_mem_regions); mshv_eventfd_init(partition); @@ -1966,6 +2014,9 @@ static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl, case MSHV_CREATE_PARTITION: return mshv_ioctl_create_partition((void __user *)arg, misc->this_device); + case MSHV_ROOT_HVCALL: + return mshv_ioctl_passthru_hvcall(NULL, false, + (void __user *)arg); } return -ENOTTY; @@ -2182,6 +2233,22 @@ root_sched_deinit: return err; } +static void mshv_init_vmm_caps(struct device *dev) +{ + /* + * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or + * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that + * case it's valid to proceed as if all vmm_caps are disabled (zero). 
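+ * With all-zero caps, the optional paths keyed off them (such as the
+ * overlay-GPFN mapping variants gated by mshv_use_overlay_gpfn())
+ * simply stay disabled.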
+ */ + if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF, + HV_PARTITION_PROPERTY_VMM_CAPABILITIES, + 0, &mshv_root.vmm_caps, + sizeof(mshv_root.vmm_caps))) + dev_warn(dev, "Unable to get VMM capabilities\n"); + + dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]); +} + static int __init mshv_parent_partition_init(void) { int ret; @@ -2234,6 +2301,8 @@ static int __init mshv_parent_partition_init(void) if (ret) goto remove_cpu_state; + mshv_init_vmm_caps(dev); + ret = mshv_irqfd_wq_init(); if (ret) goto exit_partition; diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c index e6b6381b7c36..f8b0337cdc82 100644 --- a/drivers/hv/mshv_synic.c +++ b/drivers/hv/mshv_synic.c @@ -394,7 +394,7 @@ unlock_out: void mshv_isr(void) { struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); - struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; struct hv_message *msg; bool handled; @@ -456,7 +456,7 @@ int mshv_synic_init(unsigned int cpu) #endif union hv_synic_scontrol sctrl; struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); - struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; struct hv_synic_event_flags_page **event_flags_page = &spages->synic_event_flags_page; struct hv_synic_event_ring_page **event_ring_page = @@ -550,7 +550,7 @@ int mshv_synic_cleanup(unsigned int cpu) union hv_synic_sirbp sirbp; union hv_synic_scontrol sctrl; struct hv_synic_pages *spages = this_cpu_ptr(mshv_root.synic_pages); - struct hv_message_page **msg_page = &spages->synic_message_page; + struct hv_message_page **msg_page = &spages->hyp_synic_message_page; struct hv_synic_event_flags_page **event_flags_page = &spages->synic_event_flags_page; struct hv_synic_event_ring_page **event_ring_page = diff --git a/drivers/hv/mshv_vtl.h b/drivers/hv/mshv_vtl.h new file mode 100644 index 000000000000..a6eea52f7aa2 --- /dev/null +++ b/drivers/hv/mshv_vtl.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _MSHV_VTL_H +#define _MSHV_VTL_H + +#include <linux/mshv.h> +#include <linux/types.h> + +struct mshv_vtl_run { + u32 cancel; + u32 vtl_ret_action_size; + u32 pad[2]; + char exit_message[MSHV_MAX_RUN_MSG_SIZE]; + union { + struct mshv_vtl_cpu_context cpu_context; + + /* + * Reserving room for the cpu context to grow and to maintain compatibility + * with user mode. + */ + char reserved[1024]; + }; + char vtl_ret_actions[MSHV_MAX_RUN_MSG_SIZE]; +}; + +#endif /* _MSHV_VTL_H */ diff --git a/drivers/hv/mshv_vtl_main.c b/drivers/hv/mshv_vtl_main.c new file mode 100644 index 000000000000..2cebe9de5a5a --- /dev/null +++ b/drivers/hv/mshv_vtl_main.c @@ -0,0 +1,1392 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, Microsoft Corporation. 
+ * + * Author: + * Roman Kisel <romank@linux.microsoft.com> + * Saurabh Sengar <ssengar@linux.microsoft.com> + * Naman Jain <namjain@linux.microsoft.com> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/anon_inodes.h> +#include <linux/cpuhotplug.h> +#include <linux/count_zeros.h> +#include <linux/entry-virt.h> +#include <linux/eventfd.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/vmalloc.h> +#include <asm/debugreg.h> +#include <asm/mshyperv.h> +#include <trace/events/ipi.h> +#include <uapi/asm/mtrr.h> +#include <uapi/linux/mshv.h> +#include <hyperv/hvhdk.h> + +#include "../../kernel/fpu/legacy.h" +#include "mshv.h" +#include "mshv_vtl.h" +#include "hyperv_vmbus.h" + +MODULE_AUTHOR("Microsoft"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Microsoft Hyper-V VTL Driver"); + +#define MSHV_ENTRY_REASON_LOWER_VTL_CALL 0x1 +#define MSHV_ENTRY_REASON_INTERRUPT 0x2 +#define MSHV_ENTRY_REASON_INTERCEPT 0x3 + +#define MSHV_REAL_OFF_SHIFT 16 +#define MSHV_PG_OFF_CPU_MASK (BIT_ULL(MSHV_REAL_OFF_SHIFT) - 1) +#define MSHV_RUN_PAGE_OFFSET 0 +#define MSHV_REG_PAGE_OFFSET 1 +#define VTL2_VMBUS_SINT_INDEX 7 + +static struct device *mem_dev; + +static struct tasklet_struct msg_dpc; +static wait_queue_head_t fd_wait_queue; +static bool has_message; +static struct eventfd_ctx *flag_eventfds[HV_EVENT_FLAGS_COUNT]; +static DEFINE_MUTEX(flag_lock); +static bool __read_mostly mshv_has_reg_page; + +/* hvcall code is of type u16, allocate a bitmap of size (1 << 16) to accommodate it */ +#define MAX_BITMAP_SIZE ((U16_MAX + 1) / 8) + +struct mshv_vtl_hvcall_fd { + u8 allow_bitmap[MAX_BITMAP_SIZE]; + bool allow_map_initialized; + /* + * Used to protect hvcall setup in IOCTLs + */ + struct mutex init_mutex; + struct miscdevice *dev; +}; + +struct mshv_vtl_poll_file { + struct file *file; + wait_queue_entry_t wait; + wait_queue_head_t *wqh; + poll_table pt; + int cpu; +}; + +struct mshv_vtl { + struct device *module_dev; + u64 id; +}; + +struct mshv_vtl_per_cpu { + struct mshv_vtl_run *run; + struct page *reg_page; +}; + +/* SYNIC_OVERLAY_PAGE_MSR - internal, identical to hv_synic_simp */ +union hv_synic_overlay_page_msr { + u64 as_uint64; + struct { + u64 enabled: 1; + u64 reserved: 11; + u64 pfn: 52; + } __packed; +}; + +static struct mutex mshv_vtl_poll_file_lock; +static union hv_register_vsm_page_offsets mshv_vsm_page_offsets; +static union hv_register_vsm_capabilities mshv_vsm_capabilities; + +static DEFINE_PER_CPU(struct mshv_vtl_poll_file, mshv_vtl_poll_file); +static DEFINE_PER_CPU(unsigned long long, num_vtl0_transitions); +static DEFINE_PER_CPU(struct mshv_vtl_per_cpu, mshv_vtl_per_cpu); + +static const union hv_input_vtl input_vtl_zero; +static const union hv_input_vtl input_vtl_normal = { + .use_target_vtl = 1, +}; + +static const struct file_operations mshv_vtl_fops; + +static long +mshv_ioctl_create_vtl(void __user *user_arg, struct device *module_dev) +{ + struct mshv_vtl *vtl; + struct file *file; + int fd; + + vtl = kzalloc(sizeof(*vtl), GFP_KERNEL); + if (!vtl) + return -ENOMEM; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (fd < 0) { + kfree(vtl); + return fd; + } + file = anon_inode_getfile("mshv_vtl", &mshv_vtl_fops, + vtl, O_RDWR); + if (IS_ERR(file)) { + put_unused_fd(fd); + kfree(vtl); + return PTR_ERR(file); + } + vtl->module_dev = module_dev; + fd_install(fd, file); + + return fd; +} + +static long +mshv_ioctl_check_extension(void __user *user_arg) +{ + u32 arg; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; 
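+
+ /*
+ * Each capability below is reported via the ioctl return value. An
+ * illustrative userspace probe, with mshv_fd assumed to be an open
+ * /dev/mshv descriptor (error handling omitted):
+ *
+ *	__u32 cap = MSHV_CAP_REGISTER_PAGE;
+ *	int has_reg_page = ioctl(mshv_fd, MSHV_CHECK_EXTENSION, &cap);
+ */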
+ + switch (arg) { + case MSHV_CAP_CORE_API_STABLE: + return 0; + case MSHV_CAP_REGISTER_PAGE: + return mshv_has_reg_page; + case MSHV_CAP_VTL_RETURN_ACTION: + return mshv_vsm_capabilities.return_action_available; + case MSHV_CAP_DR6_SHARED: + return mshv_vsm_capabilities.dr6_shared; + } + + return -EOPNOTSUPP; +} + +static long +mshv_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) +{ + struct miscdevice *misc = filp->private_data; + + switch (ioctl) { + case MSHV_CHECK_EXTENSION: + return mshv_ioctl_check_extension((void __user *)arg); + case MSHV_CREATE_VTL: + return mshv_ioctl_create_vtl((void __user *)arg, misc->this_device); + } + + return -ENOTTY; +} + +static const struct file_operations mshv_dev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = mshv_dev_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice mshv_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "mshv", + .fops = &mshv_dev_fops, + .mode = 0600, +}; + +static struct mshv_vtl_run *mshv_vtl_this_run(void) +{ + return *this_cpu_ptr(&mshv_vtl_per_cpu.run); +} + +static struct mshv_vtl_run *mshv_vtl_cpu_run(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.run, cpu); +} + +static struct page *mshv_vtl_cpu_reg_page(int cpu) +{ + return *per_cpu_ptr(&mshv_vtl_per_cpu.reg_page, cpu); +} + +static void mshv_vtl_configure_reg_page(struct mshv_vtl_per_cpu *per_cpu) +{ + struct hv_register_assoc reg_assoc = {}; + union hv_synic_overlay_page_msr overlay = {}; + struct page *reg_page; + + reg_page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL); + if (!reg_page) { + WARN(1, "failed to allocate register page\n"); + return; + } + + overlay.enabled = 1; + overlay.pfn = page_to_hvpfn(reg_page); + reg_assoc.name = HV_X64_REGISTER_REG_PAGE; + reg_assoc.value.reg64 = overlay.as_uint64; + + if (hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, &reg_assoc)) { + WARN(1, "failed to setup register page\n"); + __free_page(reg_page); + return; + } + + per_cpu->reg_page = reg_page; + mshv_has_reg_page = true; +} + +static void mshv_vtl_synic_enable_regs(unsigned int cpu) +{ + union hv_synic_sint sint; + + sint.as_uint64 = 0; + sint.vector = HYPERVISOR_CALLBACK_VECTOR; + sint.masked = false; + sint.auto_eoi = hv_recommend_using_aeoi(); + + /* Enable intercepts */ + if (!mshv_vsm_capabilities.intercept_page_available) + hv_set_msr(HV_MSR_SINT0 + HV_SYNIC_INTERCEPTION_SINT_INDEX, + sint.as_uint64); + + /* VTL2 Host VSP SINT is (un)masked when the user mode requests that */ +} + +static int mshv_vtl_get_vsm_regs(void) +{ + struct hv_register_assoc registers[2]; + int ret, count = 2; + + registers[0].name = HV_REGISTER_VSM_CODE_PAGE_OFFSETS; + registers[1].name = HV_REGISTER_VSM_CAPABILITIES; + + ret = hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + count, input_vtl_zero, registers); + if (ret) + return ret; + + mshv_vsm_page_offsets.as_uint64 = registers[0].value.reg64; + mshv_vsm_capabilities.as_uint64 = registers[1].value.reg64; + + return ret; +} + +static int mshv_vtl_configure_vsm_partition(struct device *dev) +{ + union hv_register_vsm_partition_config config; + struct hv_register_assoc reg_assoc; + + config.as_uint64 = 0; + config.default_vtl_protection_mask = HV_MAP_GPA_PERMISSIONS_MASK; + config.enable_vtl_protection = 1; + config.zero_memory_on_reset = 1; + config.intercept_vp_startup = 1; + config.intercept_cpuid_unimplemented = 1; + + if (mshv_vsm_capabilities.intercept_page_available) { + dev_dbg(dev, "using intercept page\n"); + 
config.intercept_page = 1; + } + + reg_assoc.name = HV_REGISTER_VSM_PARTITION_CONFIG; + reg_assoc.value.reg64 = config.as_uint64; + + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_zero, &reg_assoc); +} + +static void mshv_vtl_vmbus_isr(void) +{ + struct hv_per_cpu_context *per_cpu; + struct hv_message *msg; + u32 message_type; + union hv_synic_event_flags *event_flags; + struct eventfd_ctx *eventfd; + u16 i; + + per_cpu = this_cpu_ptr(hv_context.cpu_context); + if (smp_processor_id() == 0) { + msg = (struct hv_message *)per_cpu->hyp_synic_message_page + VTL2_VMBUS_SINT_INDEX; + message_type = READ_ONCE(msg->header.message_type); + if (message_type != HVMSG_NONE) + tasklet_schedule(&msg_dpc); + } + + event_flags = (union hv_synic_event_flags *)per_cpu->hyp_synic_event_page + + VTL2_VMBUS_SINT_INDEX; + for_each_set_bit(i, event_flags->flags, HV_EVENT_FLAGS_COUNT) { + if (!sync_test_and_clear_bit(i, event_flags->flags)) + continue; + rcu_read_lock(); + eventfd = READ_ONCE(flag_eventfds[i]); + if (eventfd) + eventfd_signal(eventfd); + rcu_read_unlock(); + } + + vmbus_isr(); +} + +static int mshv_vtl_alloc_context(unsigned int cpu) +{ + struct mshv_vtl_per_cpu *per_cpu = this_cpu_ptr(&mshv_vtl_per_cpu); + + per_cpu->run = (struct mshv_vtl_run *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!per_cpu->run) + return -ENOMEM; + + if (mshv_vsm_capabilities.intercept_page_available) + mshv_vtl_configure_reg_page(per_cpu); + + mshv_vtl_synic_enable_regs(cpu); + + return 0; +} + +static int mshv_vtl_cpuhp_online; + +static int hv_vtl_setup_synic(void) +{ + int ret; + + /* Use our isr to first filter out packets destined for userspace */ + hv_setup_vmbus_handler(mshv_vtl_vmbus_isr); + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vtl:online", + mshv_vtl_alloc_context, NULL); + if (ret < 0) { + hv_setup_vmbus_handler(vmbus_isr); + return ret; + } + + mshv_vtl_cpuhp_online = ret; + + return 0; +} + +static void hv_vtl_remove_synic(void) +{ + cpuhp_remove_state(mshv_vtl_cpuhp_online); + hv_setup_vmbus_handler(vmbus_isr); +} + +static int vtl_get_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_get_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int vtl_set_vp_register(struct hv_register_assoc *reg) +{ + return hv_call_set_vp_registers(HV_VP_INDEX_SELF, HV_PARTITION_ID_SELF, + 1, input_vtl_normal, reg); +} + +static int mshv_vtl_ioctl_add_vtl0_mem(struct mshv_vtl *vtl, void __user *arg) +{ + struct mshv_vtl_ram_disposition vtl0_mem; + struct dev_pagemap *pgmap; + void *addr; + + if (copy_from_user(&vtl0_mem, arg, sizeof(vtl0_mem))) + return -EFAULT; + /* vtl0_mem.last_pfn is excluded in the pagemap range for VTL0 as per design */ + if (vtl0_mem.last_pfn <= vtl0_mem.start_pfn) { + dev_err(vtl->module_dev, "range start pfn (%llx) >= end pfn (%llx)\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn); + return -EFAULT; + } + + pgmap = kzalloc(sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->ranges[0].start = PFN_PHYS(vtl0_mem.start_pfn); + pgmap->ranges[0].end = PFN_PHYS(vtl0_mem.last_pfn) - 1; + pgmap->nr_range = 1; + pgmap->type = MEMORY_DEVICE_GENERIC; + + /* + * Determine the highest page order that can be used for the given memory range. + * This works best when the range is aligned; i.e. both the start and the length. 
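+ *
+ * Worked example with hypothetical values: start_pfn = 0x800 and
+ * last_pfn = 0x10800 give count_trailing_zeros(0x800 | 0x10800) = 11,
+ * i.e. 2048-page (8 MiB with 4 KiB pages) compound mappings.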
+ */ + pgmap->vmemmap_shift = count_trailing_zeros(vtl0_mem.start_pfn | vtl0_mem.last_pfn); + dev_dbg(vtl->module_dev, + "Add VTL0 memory: start: 0x%llx, end_pfn: 0x%llx, page order: %lu\n", + vtl0_mem.start_pfn, vtl0_mem.last_pfn, pgmap->vmemmap_shift); + + addr = devm_memremap_pages(mem_dev, pgmap); + if (IS_ERR(addr)) { + dev_err(vtl->module_dev, "devm_memremap_pages error: %ld\n", PTR_ERR(addr)); + kfree(pgmap); + return -EFAULT; + } + + /* Don't free pgmap, since it has to stick around until the memory + * is unmapped, which will never happen as there is no scenario + * where VTL0 can be released/shutdown without bringing down VTL2. + */ + return 0; +} + +static void mshv_vtl_cancel(int cpu) +{ + int here = get_cpu(); + + if (here != cpu) { + if (!xchg_relaxed(&mshv_vtl_cpu_run(cpu)->cancel, 1)) + smp_send_reschedule(cpu); + } else { + WRITE_ONCE(mshv_vtl_this_run()->cancel, 1); + } + put_cpu(); +} + +static int mshv_vtl_poll_file_wake(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) +{ + struct mshv_vtl_poll_file *poll_file = container_of(wait, struct mshv_vtl_poll_file, wait); + + mshv_vtl_cancel(poll_file->cpu); + + return 0; +} + +static void mshv_vtl_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh, poll_table *pt) +{ + struct mshv_vtl_poll_file *poll_file = container_of(pt, struct mshv_vtl_poll_file, pt); + + WARN_ON(poll_file->wqh); + poll_file->wqh = wqh; + add_wait_queue(wqh, &poll_file->wait); +} + +static int mshv_vtl_ioctl_set_poll_file(struct mshv_vtl_set_poll_file __user *user_input) +{ + struct file *file, *old_file; + struct mshv_vtl_poll_file *poll_file; + struct mshv_vtl_set_poll_file input; + + if (copy_from_user(&input, user_input, sizeof(input))) + return -EFAULT; + + if (input.cpu >= num_possible_cpus() || !cpu_online(input.cpu)) + return -EINVAL; + /* + * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists. + * CPU is expected to remain online after above cpu_online() check. 
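+ * Consistently with that, the cpuhp state registered in
+ * hv_vtl_setup_synic() installs no teardown callback.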
+ */ + + file = fget(input.fd); + if (!file) + return -EBADFD; + + poll_file = per_cpu_ptr(&mshv_vtl_poll_file, READ_ONCE(input.cpu)); + if (!poll_file) + return -EINVAL; + + mutex_lock(&mshv_vtl_poll_file_lock); + + if (poll_file->wqh) + remove_wait_queue(poll_file->wqh, &poll_file->wait); + poll_file->wqh = NULL; + + old_file = poll_file->file; + poll_file->file = file; + poll_file->cpu = input.cpu; + + if (file) { + init_waitqueue_func_entry(&poll_file->wait, mshv_vtl_poll_file_wake); + init_poll_funcptr(&poll_file->pt, mshv_vtl_ptable_queue_proc); + vfs_poll(file, &poll_file->pt); + } + + mutex_unlock(&mshv_vtl_poll_file_lock); + + if (old_file) + fput(old_file); + + return 0; +} + +/* Static table mapping register names to their corresponding actions */ +static const struct { + enum hv_register_name reg_name; + int debug_reg_num; /* -1 if not a debug register */ + u32 msr_addr; /* 0 if not an MSR */ +} reg_table[] = { + /* Debug registers */ + {HV_X64_REGISTER_DR0, 0, 0}, + {HV_X64_REGISTER_DR1, 1, 0}, + {HV_X64_REGISTER_DR2, 2, 0}, + {HV_X64_REGISTER_DR3, 3, 0}, + {HV_X64_REGISTER_DR6, 6, 0}, + /* MTRR MSRs */ + {HV_X64_REGISTER_MSR_MTRR_CAP, -1, MSR_MTRRcap}, + {HV_X64_REGISTER_MSR_MTRR_DEF_TYPE, -1, MSR_MTRRdefType}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0, -1, MTRRphysBase_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1, -1, MTRRphysBase_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2, -1, MTRRphysBase_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3, -1, MTRRphysBase_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4, -1, MTRRphysBase_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5, -1, MTRRphysBase_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6, -1, MTRRphysBase_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7, -1, MTRRphysBase_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8, -1, MTRRphysBase_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9, -1, MTRRphysBase_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA, -1, MTRRphysBase_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB, -1, MTRRphysBase_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC, -1, MTRRphysBase_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASED, -1, MTRRphysBase_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE, -1, MTRRphysBase_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF, -1, MTRRphysBase_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0, -1, MTRRphysMask_MSR(0)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1, -1, MTRRphysMask_MSR(1)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2, -1, MTRRphysMask_MSR(2)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3, -1, MTRRphysMask_MSR(3)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4, -1, MTRRphysMask_MSR(4)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5, -1, MTRRphysMask_MSR(5)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6, -1, MTRRphysMask_MSR(6)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7, -1, MTRRphysMask_MSR(7)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8, -1, MTRRphysMask_MSR(8)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9, -1, MTRRphysMask_MSR(9)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA, -1, MTRRphysMask_MSR(0xa)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB, -1, MTRRphysMask_MSR(0xb)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC, -1, MTRRphysMask_MSR(0xc)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD, -1, MTRRphysMask_MSR(0xd)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE, -1, MTRRphysMask_MSR(0xe)}, + {HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF, -1, MTRRphysMask_MSR(0xf)}, + {HV_X64_REGISTER_MSR_MTRR_FIX64K00000, -1, MSR_MTRRfix64K_00000}, + {HV_X64_REGISTER_MSR_MTRR_FIX16K80000, -1, MSR_MTRRfix16K_80000}, + 
+ {HV_X64_REGISTER_MSR_MTRR_FIX16KA0000, -1, MSR_MTRRfix16K_A0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC0000, -1, MSR_MTRRfix4K_C0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KC8000, -1, MSR_MTRRfix4K_C8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD0000, -1, MSR_MTRRfix4K_D0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KD8000, -1, MSR_MTRRfix4K_D8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE0000, -1, MSR_MTRRfix4K_E0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KE8000, -1, MSR_MTRRfix4K_E8000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF0000, -1, MSR_MTRRfix4K_F0000},
+ {HV_X64_REGISTER_MSR_MTRR_FIX4KF8000, -1, MSR_MTRRfix4K_F8000},
+};
+
+static int mshv_vtl_get_set_reg(struct hv_register_assoc *regs, bool set)
+{
+ u64 *reg64;
+ enum hv_register_name gpr_name;
+ int i;
+
+ gpr_name = regs->name;
+ reg64 = &regs->value.reg64;
+
+ /* Search for the register in the table */
+ for (i = 0; i < ARRAY_SIZE(reg_table); i++) {
+ if (reg_table[i].reg_name != gpr_name)
+ continue;
+ if (reg_table[i].debug_reg_num != -1) {
+ /* Handle debug registers */
+ if (gpr_name == HV_X64_REGISTER_DR6 &&
+ !mshv_vsm_capabilities.dr6_shared)
+ goto hypercall;
+ if (set)
+ native_set_debugreg(reg_table[i].debug_reg_num, *reg64);
+ else
+ *reg64 = native_get_debugreg(reg_table[i].debug_reg_num);
+ } else {
+ /* Handle MSRs */
+ if (set)
+ wrmsrl(reg_table[i].msr_addr, *reg64);
+ else
+ rdmsrl(reg_table[i].msr_addr, *reg64);
+ }
+ return 0;
+ }
+
+hypercall:
+ return 1;
+}
+
+static void mshv_vtl_return(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+
+ /*
+ * Process any signal-event return actions set directly in the run page.
+ */
+ if (mshv_vsm_capabilities.return_action_available) {
+ u32 offset = READ_ONCE(mshv_vtl_this_run()->vtl_ret_action_size);
+
+ WRITE_ONCE(mshv_vtl_this_run()->vtl_ret_action_size, 0);
+
+ /*
+ * The hypervisor takes care of clearing out the actions
+ * set in the assist page.
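+ * User space queues serialized return actions (for example pending
+ * event signals) in the run page's vtl_ret_actions buffer; copying
+ * them into the assist page lets the hypervisor apply them as part
+ * of the VTL transition instead of requiring a hypercall per action.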
+ */
+ memcpy(hvp->vtl_ret_actions,
+ mshv_vtl_this_run()->vtl_ret_actions,
+ min_t(u32, offset, sizeof(hvp->vtl_ret_actions)));
+ }
+
+ mshv_vtl_return_call(vtl0);
+}
+
+static bool mshv_vtl_process_intercept(void)
+{
+ struct hv_per_cpu_context *mshv_cpu;
+ void *synic_message_page;
+ struct hv_message *msg;
+ u32 message_type;
+
+ mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ synic_message_page = mshv_cpu->hyp_synic_message_page;
+ if (unlikely(!synic_message_page))
+ return true;
+
+ msg = (struct hv_message *)synic_message_page + HV_SYNIC_INTERCEPTION_SINT_INDEX;
+ message_type = READ_ONCE(msg->header.message_type);
+ if (message_type == HVMSG_NONE)
+ return true;
+
+ memcpy(mshv_vtl_this_run()->exit_message, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+
+ return false;
+}
+
+static int mshv_vtl_ioctl_return_to_lower_vtl(void)
+{
+ preempt_disable();
+ for (;;) {
+ unsigned long irq_flags;
+ struct hv_vp_assist_page *hvp;
+ int ret;
+
+ if (__xfer_to_guest_mode_work_pending()) {
+ preempt_enable();
+ ret = xfer_to_guest_mode_handle_work();
+ if (ret)
+ return ret;
+ preempt_disable();
+ }
+
+ local_irq_save(irq_flags);
+ if (READ_ONCE(mshv_vtl_this_run()->cancel)) {
+ local_irq_restore(irq_flags);
+ preempt_enable();
+ return -EINTR;
+ }
+
+ mshv_vtl_return(&mshv_vtl_this_run()->cpu_context);
+ local_irq_restore(irq_flags);
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ this_cpu_inc(num_vtl0_transitions);
+ switch (hvp->vtl_entry_reason) {
+ case MSHV_ENTRY_REASON_INTERRUPT:
+ if (!mshv_vsm_capabilities.intercept_page_available &&
+ likely(!mshv_vtl_process_intercept()))
+ goto done;
+ break;
+
+ case MSHV_ENTRY_REASON_INTERCEPT:
+ WARN_ON(!mshv_vsm_capabilities.intercept_page_available);
+ memcpy(mshv_vtl_this_run()->exit_message, hvp->intercept_message,
+ sizeof(hvp->intercept_message));
+ goto done;
+
+ default:
+ panic("unknown entry reason: %d", hvp->vtl_entry_reason);
+ }
+ }
+
+done:
+ preempt_enable();
+
+ return 0;
+}
+
+static long
+mshv_vtl_ioctl_get_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time. */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr,
+ sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, false);
+ if (!ret)
+ goto copy_args; /* No hypercall needed */
+ ret = vtl_get_vp_register(&reg);
+ if (ret)
+ return ret;
+
+copy_args:
+ if (copy_to_user((void __user *)args.regs_ptr, &reg, sizeof(reg)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl_set_regs(void __user *user_args)
+{
+ struct mshv_vp_registers args;
+ struct hv_register_assoc reg;
+ long ret;
+
+ if (copy_from_user(&args, user_args, sizeof(args)))
+ return -EFAULT;
+
+ /* This IOCTL supports processing only one register at a time.
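+ * As in mshv_vtl_ioctl_get_regs() above, registers found in reg_table
+ * are accessed locally (debug registers or MSRs); all others fall back
+ * to the vtl_set_vp_register() hypercall path.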
+ */
+ if (args.count != 1)
+ return -EINVAL;
+
+ if (copy_from_user(&reg, (void __user *)args.regs_ptr, sizeof(reg)))
+ return -EFAULT;
+
+ ret = mshv_vtl_get_set_reg(&reg, true);
+ if (!ret)
+ return ret; /* No hypercall needed */
+ ret = vtl_set_vp_register(&reg);
+
+ return ret;
+}
+
+static long
+mshv_vtl_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+ long ret;
+ struct mshv_vtl *vtl = filp->private_data;
+
+ switch (ioctl) {
+ case MSHV_SET_POLL_FILE:
+ ret = mshv_vtl_ioctl_set_poll_file((struct mshv_vtl_set_poll_file __user *)arg);
+ break;
+ case MSHV_GET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_get_regs((void __user *)arg);
+ break;
+ case MSHV_SET_VP_REGISTERS:
+ ret = mshv_vtl_ioctl_set_regs((void __user *)arg);
+ break;
+ case MSHV_RETURN_TO_LOWER_VTL:
+ ret = mshv_vtl_ioctl_return_to_lower_vtl();
+ break;
+ case MSHV_ADD_VTL0_MEMORY:
+ ret = mshv_vtl_ioctl_add_vtl0_mem(vtl, (void __user *)arg);
+ break;
+ default:
+ dev_err(vtl->module_dev, "invalid vtl ioctl: %#x\n", ioctl);
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
+static vm_fault_t mshv_vtl_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int cpu = vmf->pgoff & MSHV_PG_OFF_CPU_MASK;
+ int real_off = vmf->pgoff >> MSHV_REAL_OFF_SHIFT;
+
+ if (!cpu_online(cpu))
+ return VM_FAULT_SIGBUS;
+ /*
+ * CPU Hotplug is not supported in VTL2 in OpenHCL, where this kernel driver exists.
+ * The CPU is expected to remain online after the above cpu_online() check.
+ */
+
+ if (real_off == MSHV_RUN_PAGE_OFFSET) {
+ page = virt_to_page(mshv_vtl_cpu_run(cpu));
+ } else if (real_off == MSHV_REG_PAGE_OFFSET) {
+ if (!mshv_has_reg_page)
+ return VM_FAULT_SIGBUS;
+ page = mshv_vtl_cpu_reg_page(cpu);
+ } else {
+ return VM_FAULT_NOPAGE;
+ }
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static const struct vm_operations_struct mshv_vtl_vm_ops = {
+ .fault = mshv_vtl_fault,
+};
+
+static int mshv_vtl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_vm_ops;
+
+ return 0;
+}
+
+static int mshv_vtl_release(struct inode *inode, struct file *filp)
+{
+ struct mshv_vtl *vtl = filp->private_data;
+
+ kfree(vtl);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = mshv_vtl_ioctl,
+ .release = mshv_vtl_release,
+ .mmap = mshv_vtl_mmap,
+};
+
+static void mshv_vtl_synic_mask_vmbus_sint(const u8 *mask)
+{
+ union hv_synic_sint sint;
+
+ sint.as_uint64 = 0;
+ sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ sint.masked = (*mask != 0);
+ sint.auto_eoi = hv_recommend_using_aeoi();
+
+ hv_set_msr(HV_MSR_SINT0 + VTL2_VMBUS_SINT_INDEX,
+ sint.as_uint64);
+
+ if (!sint.masked)
+ pr_debug("%s: Unmasking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+ else
+ pr_debug("%s: Masking VTL2 VMBUS SINT on VP %d\n", __func__, smp_processor_id());
+}
+
+static void mshv_vtl_read_remote(void *buffer)
+{
+ struct hv_per_cpu_context *mshv_cpu = this_cpu_ptr(hv_context.cpu_context);
+ struct hv_message *msg = (struct hv_message *)mshv_cpu->hyp_synic_message_page +
+ VTL2_VMBUS_SINT_INDEX;
+ u32 message_type = READ_ONCE(msg->header.message_type);
+
+ WRITE_ONCE(has_message, false);
+ if (message_type == HVMSG_NONE)
+ return;
+
+ memcpy(buffer, msg, sizeof(*msg));
+ vmbus_signal_eom(msg, message_type);
+}
+
+static bool vtl_synic_mask_vmbus_sint_masked = true;
+
+static ssize_t mshv_vtl_sint_read(struct file *filp, char __user *arg, size_t size, loff_t *offset)
+{
+ struct hv_message msg = {};
+ int ret;
+
+ if (size < sizeof(msg))
+ return -EINVAL;
+
+ for 
(;;) { + smp_call_function_single(VMBUS_CONNECT_CPU, mshv_vtl_read_remote, &msg, true); + if (msg.header.message_type != HVMSG_NONE) + break; + + if (READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + return 0; /* EOF */ + + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(fd_wait_queue, + READ_ONCE(has_message) || + READ_ONCE(vtl_synic_mask_vmbus_sint_masked)); + if (ret) + return ret; + } + + if (copy_to_user(arg, &msg, sizeof(msg))) + return -EFAULT; + + return sizeof(msg); +} + +static __poll_t mshv_vtl_sint_poll(struct file *filp, poll_table *wait) +{ + __poll_t mask = 0; + + poll_wait(filp, &fd_wait_queue, wait); + if (READ_ONCE(has_message) || READ_ONCE(vtl_synic_mask_vmbus_sint_masked)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static void mshv_vtl_sint_on_msg_dpc(unsigned long data) +{ + WRITE_ONCE(has_message, true); + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); +} + +static int mshv_vtl_sint_ioctl_post_msg(struct mshv_vtl_sint_post_msg __user *arg) +{ + struct mshv_vtl_sint_post_msg message; + u8 payload[HV_MESSAGE_PAYLOAD_BYTE_COUNT]; + + if (copy_from_user(&message, arg, sizeof(message))) + return -EFAULT; + if (message.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) + return -EINVAL; + if (copy_from_user(payload, (void __user *)message.payload_ptr, + message.payload_size)) + return -EFAULT; + + return hv_post_message((union hv_connection_id)message.connection_id, + message.message_type, (void *)payload, + message.payload_size); +} + +static int mshv_vtl_sint_ioctl_signal_event(struct mshv_vtl_signal_event __user *arg) +{ + u64 input, status; + struct mshv_vtl_signal_event signal_event; + + if (copy_from_user(&signal_event, arg, sizeof(signal_event))) + return -EFAULT; + + input = signal_event.connection_id | ((u64)signal_event.flag << 32); + + status = hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, input); + + return hv_result_to_errno(status); +} + +static int mshv_vtl_sint_ioctl_set_eventfd(struct mshv_vtl_set_eventfd __user *arg) +{ + struct mshv_vtl_set_eventfd set_eventfd; + struct eventfd_ctx *eventfd, *old_eventfd; + + if (copy_from_user(&set_eventfd, arg, sizeof(set_eventfd))) + return -EFAULT; + if (set_eventfd.flag >= HV_EVENT_FLAGS_COUNT) + return -EINVAL; + + eventfd = NULL; + if (set_eventfd.fd >= 0) { + eventfd = eventfd_ctx_fdget(set_eventfd.fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } + + guard(mutex)(&flag_lock); + old_eventfd = READ_ONCE(flag_eventfds[set_eventfd.flag]); + WRITE_ONCE(flag_eventfds[set_eventfd.flag], eventfd); + + if (old_eventfd) { + synchronize_rcu(); + eventfd_ctx_put(old_eventfd); + } + + return 0; +} + +static int mshv_vtl_sint_ioctl_pause_msg_stream(struct mshv_sint_mask __user *arg) +{ + static DEFINE_MUTEX(vtl2_vmbus_sint_mask_mutex); + struct mshv_sint_mask mask; + + if (copy_from_user(&mask, arg, sizeof(mask))) + return -EFAULT; + guard(mutex)(&vtl2_vmbus_sint_mask_mutex); + on_each_cpu((smp_call_func_t)mshv_vtl_synic_mask_vmbus_sint, &mask.mask, 1); + WRITE_ONCE(vtl_synic_mask_vmbus_sint_masked, mask.mask != 0); + if (mask.mask) + wake_up_interruptible_poll(&fd_wait_queue, EPOLLIN); + + return 0; +} + +static long mshv_vtl_sint_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case MSHV_SINT_POST_MESSAGE: + return mshv_vtl_sint_ioctl_post_msg((struct mshv_vtl_sint_post_msg __user *)arg); + case MSHV_SINT_SIGNAL_EVENT: + return mshv_vtl_sint_ioctl_signal_event((struct mshv_vtl_signal_event __user *)arg); + case MSHV_SINT_SET_EVENTFD: + 
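/* Bind an eventfd to a VMBus event flag (see flag_eventfds above). */
+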
return mshv_vtl_sint_ioctl_set_eventfd((struct mshv_vtl_set_eventfd __user *)arg); + case MSHV_SINT_PAUSE_MESSAGE_STREAM: + return mshv_vtl_sint_ioctl_pause_msg_stream((struct mshv_sint_mask __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + +static const struct file_operations mshv_vtl_sint_ops = { + .owner = THIS_MODULE, + .read = mshv_vtl_sint_read, + .poll = mshv_vtl_sint_poll, + .unlocked_ioctl = mshv_vtl_sint_ioctl, +}; + +static struct miscdevice mshv_vtl_sint_dev = { + .name = "mshv_sint", + .fops = &mshv_vtl_sint_ops, + .mode = 0600, + .minor = MISC_DYNAMIC_MINOR, +}; + +static int mshv_vtl_hvcall_dev_open(struct inode *node, struct file *f) +{ + struct miscdevice *dev = f->private_data; + struct mshv_vtl_hvcall_fd *fd; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fd = vzalloc(sizeof(*fd)); + if (!fd) + return -ENOMEM; + fd->dev = dev; + f->private_data = fd; + mutex_init(&fd->init_mutex); + + return 0; +} + +static int mshv_vtl_hvcall_dev_release(struct inode *node, struct file *f) +{ + struct mshv_vtl_hvcall_fd *fd; + + fd = f->private_data; + if (fd) { + vfree(fd); + f->private_data = NULL; + } + + return 0; +} + +static int mshv_vtl_hvcall_do_setup(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall_setup __user *hvcall_setup_user) +{ + struct mshv_vtl_hvcall_setup hvcall_setup; + + guard(mutex)(&fd->init_mutex); + + if (fd->allow_map_initialized) { + dev_err(fd->dev->this_device, + "Hypercall allow map has already been set, pid %d\n", + current->pid); + return -EINVAL; + } + + if (copy_from_user(&hvcall_setup, hvcall_setup_user, + sizeof(struct mshv_vtl_hvcall_setup))) { + return -EFAULT; + } + if (hvcall_setup.bitmap_array_size > ARRAY_SIZE(fd->allow_bitmap)) + return -EINVAL; + + if (copy_from_user(&fd->allow_bitmap, + (void __user *)hvcall_setup.allow_bitmap_ptr, + hvcall_setup.bitmap_array_size)) { + return -EFAULT; + } + + dev_info(fd->dev->this_device, "Hypercall allow map has been set, pid %d\n", + current->pid); + fd->allow_map_initialized = true; + return 0; +} + +static bool mshv_vtl_hvcall_is_allowed(struct mshv_vtl_hvcall_fd *fd, u16 call_code) +{ + return test_bit(call_code, (unsigned long *)fd->allow_bitmap); +} + +static int mshv_vtl_hvcall_call(struct mshv_vtl_hvcall_fd *fd, + struct mshv_vtl_hvcall __user *hvcall_user) +{ + struct mshv_vtl_hvcall hvcall; + void *in, *out; + int ret; + + if (copy_from_user(&hvcall, hvcall_user, sizeof(struct mshv_vtl_hvcall))) + return -EFAULT; + if (hvcall.input_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + if (hvcall.output_size > HV_HYP_PAGE_SIZE) + return -EINVAL; + + /* + * By default, all hypercalls are not allowed. + * The user mode code has to set up the allow bitmap once. + */ + + if (!mshv_vtl_hvcall_is_allowed(fd, hvcall.control & 0xFFFF)) { + dev_err(fd->dev->this_device, + "Hypercall with control data %#llx isn't allowed\n", + hvcall.control); + return -EPERM; + } + + /* + * This may create a problem for Confidential VM (CVM) usecase where we need to use + * Hyper-V driver allocated per-cpu input and output pages (hyperv_pcpu_input_arg and + * hyperv_pcpu_output_arg) for making a hypervisor call. + * + * TODO: Take care of this when CVM support is added. 
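+ *
+ * Until then, plain kernel pages suffice: hv_do_hypercall() hands the
+ * physical addresses of these bounce buffers to the hypervisor, and
+ * capping input_size/output_size at HV_HYP_PAGE_SIZE above guarantees
+ * that one page each is always enough.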
+ */
+ in = (void *)__get_free_page(GFP_KERNEL);
+ out = (void *)__get_free_page(GFP_KERNEL);
+ if (!in || !out) {
+ ret = -ENOMEM;
+ goto free_pages;
+ }
+
+ if (copy_from_user(in, (void __user *)hvcall.input_ptr, hvcall.input_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+
+ hvcall.status = hv_do_hypercall(hvcall.control, in, out);
+
+ if (copy_to_user((void __user *)hvcall.output_ptr, out, hvcall.output_size)) {
+ ret = -EFAULT;
+ goto free_pages;
+ }
+ ret = put_user(hvcall.status, &hvcall_user->status);
+free_pages:
+ free_page((unsigned long)in);
+ free_page((unsigned long)out);
+
+ return ret;
+}
+
+static long mshv_vtl_hvcall_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ struct mshv_vtl_hvcall_fd *fd = f->private_data;
+
+ switch (cmd) {
+ case MSHV_HVCALL_SETUP:
+ return mshv_vtl_hvcall_do_setup(fd, (struct mshv_vtl_hvcall_setup __user *)arg);
+ case MSHV_HVCALL:
+ return mshv_vtl_hvcall_call(fd, (struct mshv_vtl_hvcall __user *)arg);
+ default:
+ break;
+ }
+
+ return -ENOIOCTLCMD;
+}
+
+static const struct file_operations mshv_vtl_hvcall_dev_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_hvcall_dev_open,
+ .release = mshv_vtl_hvcall_dev_release,
+ .unlocked_ioctl = mshv_vtl_hvcall_dev_ioctl,
+};
+
+static struct miscdevice mshv_vtl_hvcall_dev = {
+ .name = "mshv_hvcall",
+ .nodename = "mshv_hvcall",
+ .fops = &mshv_vtl_hvcall_dev_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int mshv_vtl_low_open(struct inode *inodep, struct file *filp)
+{
+ pid_t pid = task_pid_vnr(current);
+ uid_t uid = current_uid().val;
+ int ret = 0;
+
+ pr_debug("%s: Opening VTL low, task group %d, uid %d\n", __func__, pid, uid);
+
+ if (capable(CAP_SYS_ADMIN)) {
+ filp->private_data = inodep;
+ } else {
+ pr_err("%s: VTL low open failed: CAP_SYS_ADMIN required. task group %d, uid %d\n",
+ __func__, pid, uid);
+ ret = -EPERM;
+ }
+
+ return ret;
+}
+
+static bool can_fault(struct vm_fault *vmf, unsigned long size, unsigned long *pfn)
+{
+ unsigned long mask = size - 1;
+ unsigned long start = vmf->address & ~mask;
+ unsigned long end = start + size;
+ bool is_valid;
+
+ is_valid = (vmf->address & mask) == ((vmf->pgoff << PAGE_SHIFT) & mask) &&
+ start >= vmf->vma->vm_start &&
+ end <= vmf->vma->vm_end;
+
+ if (is_valid)
+ *pfn = vmf->pgoff & ~(mask >> PAGE_SHIFT);
+
+ return is_valid;
+}
+
+static vm_fault_t mshv_vtl_low_huge_fault(struct vm_fault *vmf, unsigned int order)
+{
+ unsigned long pfn = vmf->pgoff;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+
+ switch (order) {
+ case 0:
+ return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+
+ case PMD_ORDER:
+ if (can_fault(vmf, PMD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pmd(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ case PUD_ORDER:
+ if (can_fault(vmf, PUD_SIZE, &pfn))
+ ret = vmf_insert_pfn_pud(vmf, pfn, vmf->flags & FAULT_FLAG_WRITE);
+ return ret;
+
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+}
+
+static vm_fault_t mshv_vtl_low_fault(struct vm_fault *vmf)
+{
+ return mshv_vtl_low_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct mshv_vtl_low_vm_ops = {
+ .fault = mshv_vtl_low_fault,
+ .huge_fault = mshv_vtl_low_huge_fault,
+};
+
+static int mshv_vtl_low_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ vma->vm_ops = &mshv_vtl_low_vm_ops;
+ vm_flags_set(vma, VM_HUGEPAGE | VM_MIXEDMAP);
+
+ return 0;
+}
+
+static const struct file_operations mshv_vtl_low_file_ops = {
+ .owner = THIS_MODULE,
+ .open = mshv_vtl_low_open,
+ .mmap = mshv_vtl_low_mmap,
+};
+
+static struct miscdevice mshv_vtl_low = {
+ .name = "mshv_vtl_low",
+ .nodename = "mshv_vtl_low",
+ .fops = &mshv_vtl_low_file_ops,
+ .mode = 0600,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
+static int __init mshv_vtl_init(void)
+{
+ int ret;
+ struct device *dev;
+
+ /*
+ * This creates /dev/mshv which provides functionality to create VTLs and partitions.
+ */
+ ret = misc_register(&mshv_dev);
+ if (ret) {
+ pr_err("mshv device register failed: %d\n", ret);
+ return ret;
+ }
+ dev = mshv_dev.this_device;
+
+ tasklet_init(&msg_dpc, mshv_vtl_sint_on_msg_dpc, 0);
+ init_waitqueue_head(&fd_wait_queue);
+
+ if (mshv_vtl_get_vsm_regs()) {
+ dev_emerg(dev, "Unable to get VSM capabilities!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+ if (mshv_vtl_configure_vsm_partition(dev)) {
+ dev_emerg(dev, "VSM configuration failed!\n");
+ ret = -ENODEV;
+ goto free_dev;
+ }
+
+ mshv_vtl_return_call_init(mshv_vsm_page_offsets.vtl_return_offset);
+ ret = hv_vtl_setup_synic();
+ if (ret)
+ goto free_dev;
+
+ /*
+ * The mshv_sint device adds VMBus relay ioctl support,
+ * providing a channel for VTL0 to communicate with VTL2.
+ */
+ ret = misc_register(&mshv_vtl_sint_dev);
+ if (ret)
+ goto free_synic;
+
+ /*
+ * The mshv_hvcall device exposes an interface for userspace to make
+ * direct hypercalls.
+ */
+ ret = misc_register(&mshv_vtl_hvcall_dev);
+ if (ret)
+ goto free_sint;
+
+ /*
+ * The mshv_vtl_low device implements mmap(), letting a user-mode
+ * process in VTL2 map VTL0's address space.
+ */
+ ret = misc_register(&mshv_vtl_low);
+ if (ret)
+ goto free_hvcall;
+
+ /*
+ * The "mshv vtl mem dev" device is later used to set up VTL0 memory.
+ */
+ mem_dev = kzalloc(sizeof(*mem_dev), GFP_KERNEL);
+ if (!mem_dev) {
+ ret = -ENOMEM;
+ goto free_low;
+ }
+
+ mutex_init(&mshv_vtl_poll_file_lock);
+
+ device_initialize(mem_dev);
+ dev_set_name(mem_dev, "mshv vtl mem dev");
+ ret = device_add(mem_dev);
+ if (ret) {
+ dev_err(dev, "mshv vtl mem dev add: %d\n", ret);
+ goto free_mem;
+ }
+
+ return 0;
+
+free_mem:
+ kfree(mem_dev);
+free_low:
+ misc_deregister(&mshv_vtl_low);
+free_hvcall:
+ misc_deregister(&mshv_vtl_hvcall_dev);
+free_sint:
+ misc_deregister(&mshv_vtl_sint_dev);
+free_synic:
+ hv_vtl_remove_synic();
+free_dev:
+ misc_deregister(&mshv_dev);
+
+ return ret;
+}
+
+static void __exit mshv_vtl_exit(void)
+{
+ device_del(mem_dev);
+ kfree(mem_dev);
+ misc_deregister(&mshv_vtl_low);
+ misc_deregister(&mshv_vtl_hvcall_dev);
+ misc_deregister(&mshv_vtl_sint_dev);
+ hv_vtl_remove_synic();
+ misc_deregister(&mshv_dev);
+}
+
+module_init(mshv_vtl_init);
+module_exit(mshv_vtl_exit);
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 23ce1fb70de1..3c421a7f78c0 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -184,7 +184,8 @@ void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
 /* Initialize the ring buffer. */
 int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
- struct page *pages, u32 page_cnt, u32 max_pkt_size)
+ struct page *pages, u32 page_cnt, u32 max_pkt_size,
+ bool confidential)
 {
 struct page **pages_wraparound;
 int i;
@@ -208,7 +209,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
 ring_info->ring_buffer = (struct hv_ring_buffer *)
 vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP,
- pgprot_decrypted(PAGE_KERNEL));
+ confidential ? 
PAGE_KERNEL : pgprot_decrypted(PAGE_KERNEL)); kfree(pages_wraparound); if (!ring_info->ring_buffer) diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 69591dc7bad2..a53af6fe81a6 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -36,6 +36,7 @@ #include <linux/syscore_ops.h> #include <linux/dma-map-ops.h> #include <linux/pci.h> +#include <linux/export.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -57,6 +58,18 @@ int vmbus_irq; int vmbus_interrupt; /* + * If the Confidential VMBus is used, the data on the "wire" is not + * visible to either the host or the hypervisor. + */ +static bool is_confidential; + +bool vmbus_is_confidential(void) +{ + return is_confidential; +} +EXPORT_SYMBOL_GPL(vmbus_is_confidential); + +/* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. * @@ -1045,12 +1058,9 @@ static void vmbus_onmessage_work(struct work_struct *work) kfree(ctx); } -void vmbus_on_msg_dpc(unsigned long data) +static void __vmbus_on_msg_dpc(void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu = (void *)data; - void *page_addr = hv_cpu->synic_message_page; - struct hv_message msg_copy, *msg = (struct hv_message *)page_addr + - VMBUS_MESSAGE_SINT; + struct hv_message msg_copy, *msg; struct vmbus_channel_message_header *hdr; enum vmbus_channel_message_type msgtype; const struct vmbus_channel_message_table_entry *entry; @@ -1058,6 +1068,10 @@ void vmbus_on_msg_dpc(unsigned long data) __u8 payload_size; u32 message_type; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; + /* * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as * it is being used in 'struct vmbus_channel_message_header' definition @@ -1183,6 +1197,14 @@ msg_handled: vmbus_signal_eom(msg, message_type); } +void vmbus_on_msg_dpc(unsigned long data) +{ + struct hv_per_cpu_context *hv_cpu = (void *)data; + + __vmbus_on_msg_dpc(hv_cpu->hyp_synic_message_page); + __vmbus_on_msg_dpc(hv_cpu->para_synic_message_page); +} + #ifdef CONFIG_PM_SLEEP /* * Fake RESCIND_CHANNEL messages to clean up hv_sock channels by force for @@ -1221,21 +1243,19 @@ static void vmbus_force_channel_rescinded(struct vmbus_channel *channel) #endif /* CONFIG_PM_SLEEP */ /* - * Schedule all channels with events pending + * Schedule all channels with events pending. + * The event page can be directly checked to get the id of + * the channel that has the interrupt pending. */ -static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) +static void vmbus_chan_sched(void *event_page_addr) { unsigned long *recv_int_page; u32 maxbits, relid; + union hv_synic_event_flags *event; - /* - * The event page can be directly checked to get the id of - * the channel that has the interrupt pending. - */ - void *page_addr = hv_cpu->synic_event_page; - union hv_synic_event_flags *event - = (union hv_synic_event_flags *)page_addr + - VMBUS_MESSAGE_SINT; + if (!event_page_addr) + return; + event = (union hv_synic_event_flags *)event_page_addr + VMBUS_MESSAGE_SINT; maxbits = HV_EVENT_FLAGS_COUNT; recv_int_page = event->flags; @@ -1243,6 +1263,11 @@ static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu) if (unlikely(!recv_int_page)) return; + /* + * Suggested-by: Michael Kelley <mhklinux@outlook.com> + * One possible optimization would be to keep track of the largest relID that's in use, + * and only scan up to that relID. 
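+ *
+ * A minimal sketch of that idea, assuming a hypothetical vmbus_max_relid
+ * maintained at channel offer/rescind time:
+ *
+ *	maxbits = min_t(u32, HV_EVENT_FLAGS_COUNT,
+ *			READ_ONCE(vmbus_max_relid) + 1);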
+ */ for_each_set_bit(relid, recv_int_page, maxbits) { void (*callback_fn)(void *context); struct vmbus_channel *channel; @@ -1306,29 +1331,39 @@ sched_unlock_rcu: } } -static void vmbus_isr(void) +static void vmbus_message_sched(struct hv_per_cpu_context *hv_cpu, void *message_page_addr) { - struct hv_per_cpu_context *hv_cpu - = this_cpu_ptr(hv_context.cpu_context); - void *page_addr; struct hv_message *msg; - vmbus_chan_sched(hv_cpu); - - page_addr = hv_cpu->synic_message_page; - msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT; + if (!message_page_addr) + return; + msg = (struct hv_message *)message_page_addr + VMBUS_MESSAGE_SINT; /* Check if there are actual msgs to be processed */ if (msg->header.message_type != HVMSG_NONE) { if (msg->header.message_type == HVMSG_TIMER_EXPIRED) { hv_stimer0_isr(); vmbus_signal_eom(msg, HVMSG_TIMER_EXPIRED); - } else + } else { tasklet_schedule(&hv_cpu->msg_dpc); + } } +} + +void vmbus_isr(void) +{ + struct hv_per_cpu_context *hv_cpu + = this_cpu_ptr(hv_context.cpu_context); + + vmbus_chan_sched(hv_cpu->hyp_synic_event_page); + vmbus_chan_sched(hv_cpu->para_synic_event_page); + + vmbus_message_sched(hv_cpu, hv_cpu->hyp_synic_message_page); + vmbus_message_sched(hv_cpu, hv_cpu->para_synic_message_page); add_interrupt_randomness(vmbus_interrupt); } +EXPORT_SYMBOL_FOR_MODULES(vmbus_isr, "mshv_vtl"); static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) { @@ -1343,6 +1378,59 @@ static void vmbus_percpu_work(struct work_struct *work) hv_synic_init(cpu); } +static int vmbus_alloc_synic_and_connect(void) +{ + int ret, cpu; + struct work_struct __percpu *works; + int hyperv_cpuhp_online; + + ret = hv_synic_alloc(); + if (ret < 0) + goto err_alloc; + + works = alloc_percpu(struct work_struct); + if (!works) { + ret = -ENOMEM; + goto err_alloc; + } + + /* + * Initialize the per-cpu interrupt state and stimer state. + * Then connect to the host. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, vmbus_percpu_work); + schedule_work_on(cpu, work); + } + + for_each_online_cpu(cpu) + flush_work(per_cpu_ptr(works, cpu)); + + /* Register the callbacks for possible CPU online/offline'ing */ + ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", + hv_synic_init, hv_synic_cleanup); + cpus_read_unlock(); + free_percpu(works); + if (ret < 0) + goto err_alloc; + hyperv_cpuhp_online = ret; + + ret = vmbus_connect(); + if (ret) + goto err_connect; + return 0; + +err_connect: + cpuhp_remove_state(hyperv_cpuhp_online); + return -ENODEV; +err_alloc: + hv_synic_free(); + return -ENOMEM; +} + /* * vmbus_bus_init -Main vmbus driver initialization routine. * @@ -1353,8 +1441,7 @@ static void vmbus_percpu_work(struct work_struct *work) */ static int vmbus_bus_init(void) { - int ret, cpu; - struct work_struct __percpu *works; + int ret; ret = hv_init(); if (ret != 0) { @@ -1389,41 +1476,15 @@ static int vmbus_bus_init(void) } } - ret = hv_synic_alloc(); - if (ret) - goto err_alloc; - - works = alloc_percpu(struct work_struct); - if (!works) { - ret = -ENOMEM; - goto err_alloc; - } - /* - * Initialize the per-cpu interrupt state and stimer state. - * Then connect to the host. + * Cache the value as getting it involves a VM exit on x86(_64), and + * doing that on each VP while initializing SynIC's wastes time. 
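* The cached value also backs vmbus_is_confidential(), which other VMBus
+ * code (for example the hv_ringbuffer_init() change above) can consult to
+ * decide whether pages must be mapped host-visible (decrypted) or not.
+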
*/ - cpus_read_lock(); - for_each_online_cpu(cpu) { - struct work_struct *work = per_cpu_ptr(works, cpu); - - INIT_WORK(work, vmbus_percpu_work); - schedule_work_on(cpu, work); - } - - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); - - /* Register the callbacks for possible CPU online/offline'ing */ - ret = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online", - hv_synic_init, hv_synic_cleanup); - cpus_read_unlock(); - free_percpu(works); - if (ret < 0) - goto err_alloc; - hyperv_cpuhp_online = ret; - - ret = vmbus_connect(); + is_confidential = ms_hyperv.confidential_vmbus_available; + if (is_confidential) + pr_info("Establishing connection to the confidential VMBus\n"); + hv_para_set_sint_proxy(!is_confidential); + ret = vmbus_alloc_synic_and_connect(); if (ret) goto err_connect; @@ -1439,9 +1500,6 @@ static int vmbus_bus_init(void) return 0; err_connect: - cpuhp_remove_state(hyperv_cpuhp_online); -err_alloc: - hv_synic_free(); if (vmbus_irq == -1) { hv_remove_vmbus_handler(); } else { @@ -2798,10 +2856,10 @@ static void hv_crash_handler(struct pt_regs *regs) */ cpu = smp_processor_id(); hv_stimer_cleanup(cpu); - hv_synic_disable_regs(cpu); + hv_hyp_synic_disable_regs(cpu); }; -static int hv_synic_suspend(void) +static int hv_synic_suspend(void *data) { /* * When we reach here, all the non-boot CPUs have been offlined. @@ -2823,14 +2881,14 @@ static int hv_synic_suspend(void) * interrupts-disabled context. */ - hv_synic_disable_regs(0); + hv_hyp_synic_disable_regs(0); return 0; } -static void hv_synic_resume(void) +static void hv_synic_resume(void *data) { - hv_synic_enable_regs(0); + hv_hyp_synic_enable_regs(0); /* * Note: we don't need to call hv_stimer_init(0), because the timer @@ -2840,11 +2898,15 @@ static void hv_synic_resume(void) } /* The callbacks run only on CPU0, with irqs_disabled. */ -static struct syscore_ops hv_synic_syscore_ops = { +static const struct syscore_ops hv_synic_syscore_ops = { .suspend = hv_synic_suspend, .resume = hv_synic_resume, }; +static struct syscore hv_synic_syscore = { + .ops = &hv_synic_syscore_ops, +}; + static int __init hv_acpi_init(void) { int ret; @@ -2887,7 +2949,7 @@ static int __init hv_acpi_init(void) hv_setup_kexec_handler(hv_kexec_handler); hv_setup_crash_handler(hv_crash_handler); - register_syscore_ops(&hv_synic_syscore_ops); + register_syscore(&hv_synic_syscore); return 0; @@ -2901,7 +2963,7 @@ static void __exit vmbus_exit(void) { int cpu; - unregister_syscore_ops(&hv_synic_syscore_ops); + unregister_syscore(&hv_synic_syscore); hv_remove_kexec_handler(); hv_remove_crash_handler(); |
