// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie
 */

#define pr_fmt(fmt) "vfio-uml: " fmt

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/irq.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <irq_kern.h>
#include <init.h>
#include <os.h>

#include "virt-pci.h"
#include "vfio_user.h"

#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;
	int irq;
};

struct uml_vfio_device {
	const char *name;
	int group;

	struct um_pci_device pdev;
	struct uml_vfio_user_device udev;
	struct uml_vfio_intr_ctx *intr_ctx;

	int msix_cap;
	int msix_bar;
	int msix_offset;
	int msix_size;
	u32 *msix_data;

	struct list_head list;
};

struct uml_vfio_group {
	int id;
	int fd;
	int users;
	struct list_head list;
};

static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

static LIST_HEAD(uml_vfio_devices);

static int uml_vfio_set_container(int group_fd)
{
	int err;

	guard(mutex)(&uml_vfio_container_mtx);

	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
	if (err)
		return err;

	uml_vfio_container.users++;
	if (uml_vfio_container.users > 1)
		return 0;

	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
	if (err) {
		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
		uml_vfio_container.users--;
	}
	return err;
}

static void uml_vfio_unset_container(int group_fd)
{
	guard(mutex)(&uml_vfio_container_mtx);

	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
	uml_vfio_container.users--;
}

static int uml_vfio_open_group(int group_id)
{
	struct uml_vfio_group *group;
	int err;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->id == group_id) {
			group->users++;
			return group->fd;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	group->fd = uml_vfio_user_open_group(group_id);
	if (group->fd < 0) {
		err = group->fd;
		goto free_group;
	}

	err = uml_vfio_set_container(group->fd);
	if (err)
		goto close_group;

	group->id = group_id;
	group->users = 1;

	list_add(&group->list, &uml_vfio_groups);

	return group->fd;

close_group:
	os_close_file(group->fd);
free_group:
	kfree(group);
	return err;
}

static int uml_vfio_release_group(int group_fd)
{
	struct uml_vfio_group *group;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->fd == group_fd) {
			group->users--;
			if (group->users == 0) {
				uml_vfio_unset_container(group_fd);
				os_close_file(group_fd);
				list_del(&group->list);
				kfree(group);
			}
			return 0;
		}
	}

	return -ENOENT;
}

static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
	uint64_t v;
	int r;

	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}

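/*
 * Interrupt delivery works by watching the per-vector eventfd that VFIO
 * signals on the host side: uml_vfio_activate_irq() registers the irqfd
 * with UML's SIGIO-driven IRQ layer, and uml_vfio_interrupt() drains the
 * eventfd counter and injects the guest IRQ number that the driver wrote
 * into the MSI-X entry's data field (shadowed in dev->msix_data[]).
 */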
static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0, "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}

static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];

	if (ctx->irq >= 0) {
		ignore_sigio_fd(dev->udev.irqfd[index]);
		um_free_irq(ctx->irq, ctx);
		uml_vfio_user_deactivate_irq(&dev->udev, index);
		ctx->irq = -1;
	}
	return 0;
}

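/*
 * MSI-X accesses from the guest driver are trapped rather than passed
 * straight through: writes to the MSI-X capability's control word toggle
 * the host-side irqfds via uml_vfio_user_update_irqs(), and writes to a
 * table entry's data field are shadowed in dev->msix_data[] so the guest
 * IRQ number is known when the corresponding eventfd fires.
 */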
static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
		case PCI_MSIX_FLAGS_ENABLE:
		case 0:
			return uml_vfio_user_update_irqs(&dev->udev);
		}
	}
	return 0;
}

static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;
	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;
	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;
	dev->msix_data[index] = val;

	return val ? uml_vfio_activate_irq(dev, index) :
		     uml_vfio_deactivate_irq(dev, index);
}

static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
					      unsigned int offset, int size)
{
	u8 data[8];

	memset(data, 0xff, sizeof(data));

	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
		return ULONG_MAX;

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	return __uml_vfio_cfgspace_read(dev, offset, size);
}

static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
}

static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}

static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
				   void *buffer, unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	memset(buffer, 0xff, size);
	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
}

static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
				       unsigned int offset, int size)
{
	u8 data[8];

	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
				 unsigned int offset, const void *buffer,
				 int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
}

static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}

static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int i;

	for (i = 0; i < size; i++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
}

static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read = uml_vfio_cfgspace_read,
	.cfgspace_write = uml_vfio_cfgspace_write,
	.bar_read = uml_vfio_bar_read,
	.bar_write = uml_vfio_bar_write,
	.bar_copy_from = uml_vfio_bar_copy_from,
	.bar_copy_to = uml_vfio_bar_copy_to,
	.bar_set = uml_vfio_bar_set,
};

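/*
 * At this point the device is not yet registered with the PCI core, so
 * pci_find_capability() cannot be used; walk the capability list by hand
 * with raw config-space reads instead, including the same 48-iteration
 * loop guard (PCI_FIND_CAP_TTL) the PCI core uses.
 */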
static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	u8 id, pos;
	u16 ent;
	int ttl = 48; /* PCI_FIND_CAP_TTL */

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	while (pos && ttl--) {
		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));

		id = ent & 0xff;
		if (id == 0xff)
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;
	}

	return 0;
}

static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
{
	unsigned int off;
	u16 flags;
	u32 tbl;

	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
	if (!off)
		return -ENOTSUPP;

	dev->msix_cap = off;

	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS,
					 sizeof(flags));

	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) *
			 PCI_MSIX_ENTRY_SIZE;

	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
	if (!dev->msix_data)
		return -ENOMEM;

	return 0;
}

static void uml_vfio_open_device(struct uml_vfio_device *dev)
{
	struct uml_vfio_intr_ctx *ctx;
	int err, group_id, i;

	group_id = uml_vfio_user_get_group_id(dev->name);
	if (group_id < 0) {
		pr_err("Failed to get group id (%s), error %d\n",
		       dev->name, group_id);
		goto free_dev;
	}

	dev->group = uml_vfio_open_group(group_id);
	if (dev->group < 0) {
		pr_err("Failed to open group %d (%s), error %d\n",
		       group_id, dev->name, dev->group);
		goto free_dev;
	}

	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
	if (err) {
		pr_err("Failed to setup device (%s), error %d\n",
		       dev->name, err);
		goto release_group;
	}

	err = uml_vfio_read_msix_table(dev);
	if (err) {
		pr_err("Failed to read MSI-X table (%s), error %d\n",
		       dev->name, err);
		goto teardown_udev;
	}

	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
				      sizeof(struct uml_vfio_intr_ctx),
				      GFP_KERNEL);
	if (!dev->intr_ctx) {
		pr_err("Failed to allocate interrupt context (%s)\n",
		       dev->name);
		goto free_msix;
	}

	for (i = 0; i < dev->udev.irq_count; i++) {
		ctx = &dev->intr_ctx[i];
		ctx->dev = dev;
		ctx->irq = -1;
	}

	dev->pdev.ops = &uml_vfio_um_pci_ops;

	err = um_pci_device_register(&dev->pdev);
	if (err) {
		pr_err("Failed to register UM PCI device (%s), error %d\n",
		       dev->name, err);
		goto free_intr_ctx;
	}

	return;

free_intr_ctx:
	kfree(dev->intr_ctx);
free_msix:
	kfree(dev->msix_data);
teardown_udev:
	uml_vfio_user_teardown_device(&dev->udev);
release_group:
	uml_vfio_release_group(dev->group);
free_dev:
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

static int uml_vfio_cmdline_set(const char *device,
				const struct kernel_param *kp)
{
	struct uml_vfio_device *dev;
	int fd;

	if (uml_vfio_container.fd < 0) {
		fd = uml_vfio_user_open_container();
		if (fd < 0)
			return fd;
		uml_vfio_container.fd = fd;
	}

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->name = kstrdup(device, GFP_KERNEL);
	if (!dev->name) {
		kfree(dev);
		return -ENOMEM;
	}

	list_add_tail(&dev->list, &uml_vfio_devices);
	return 0;
}

static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}

static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
	.set = uml_vfio_cmdline_set,
	.get = uml_vfio_cmdline_get,
};

device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
__uml_help(uml_vfio_cmdline_param_ops,
"vfio_uml.device=<domain:bus:slot.function>\n"
"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
"    capable devices are supported, and it is assumed that drivers will\n"
"    use MSI-X. This parameter can be specified multiple times to pass\n"
"    through multiple PCI devices to UML.\n\n"
);

static int __init uml_vfio_init(void)
{
	struct uml_vfio_device *dev, *n;

	sigio_broken();

	/* If the opening fails, the device will be released. */
	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_open_device(dev);

	return 0;
}
late_initcall(uml_vfio_init);

static void __exit uml_vfio_exit(void)
{
	struct uml_vfio_device *dev, *n;

	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
		uml_vfio_release_device(dev);

	if (uml_vfio_container.fd >= 0)
		os_close_file(uml_vfio_container.fd);
}
module_exit(uml_vfio_exit);

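/*
 * Example usage (the BDF below is illustrative):
 *
 * On the host, bind the device to vfio-pci first, e.g.:
 *   echo vfio-pci > /sys/bus/pci/devices/0000:00:03.0/driver_override
 *   echo 0000:00:03.0 > /sys/bus/pci/drivers_probe
 *
 * Then boot UML with:
 *   vfio_uml.device=0000:00:03.0
 */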