Diffstat (limited to 'arch/um/drivers/vfio_kern.c')
-rw-r--r--  arch/um/drivers/vfio_kern.c | 642
1 file changed, 642 insertions, 0 deletions
diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c
new file mode 100644
index 000000000000..b51fc9888ae1
--- /dev/null
+++ b/arch/um/drivers/vfio_kern.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei.btw@antgroup.com>
+ */
+
+#define pr_fmt(fmt) "vfio-uml: " fmt
+
+#include <linux/module.h>
+#include <linux/logic_iomem.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <irq_kern.h>
+#include <init.h>
+#include <os.h>
+
+#include "virt-pci.h"
+#include "vfio_user.h"
+
+#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)
+
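+/*
+ * Per-vector interrupt context: 'irq' is the UML IRQ allocated via
+ * um_request_irq() to service this vector's VFIO eventfd, or -1 while
+ * the vector is inactive.
+ */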
+struct uml_vfio_intr_ctx {
+ struct uml_vfio_device *dev;
+ int irq;
+};
+
+struct uml_vfio_device {
+ const char *name;
+ int group;
+
+ struct um_pci_device pdev;
+ struct uml_vfio_user_device udev;
+ struct uml_vfio_intr_ctx *intr_ctx;
+
+ int msix_cap;
+ int msix_bar;
+ int msix_offset;
+ int msix_size;
+ u32 *msix_data;
+
+ struct list_head list;
+};
+
+struct uml_vfio_group {
+ int id;
+ int fd;
+ int users;
+ struct list_head list;
+};
+
+static struct {
+ int fd;
+ int users;
+} uml_vfio_container = { .fd = -1 };
+static DEFINE_MUTEX(uml_vfio_container_mtx);
+
+static LIST_HEAD(uml_vfio_groups);
+static DEFINE_MUTEX(uml_vfio_groups_mtx);
+
+static LIST_HEAD(uml_vfio_devices);
+
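+/*
+ * Attach a group fd to the shared VFIO container. The container is
+ * refcounted across groups; the IOMMU is set up when the first group
+ * is attached.
+ */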
+static int uml_vfio_set_container(int group_fd)
+{
+ int err;
+
+ guard(mutex)(&uml_vfio_container_mtx);
+
+ err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
+ if (err)
+ return err;
+
+ uml_vfio_container.users++;
+ if (uml_vfio_container.users > 1)
+ return 0;
+
+ err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
+ if (err) {
+ uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+ uml_vfio_container.users--;
+ }
+ return err;
+}
+
+static void uml_vfio_unset_container(int group_fd)
+{
+ guard(mutex)(&uml_vfio_container_mtx);
+
+ uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+ uml_vfio_container.users--;
+}
+
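+/*
+ * Open the VFIO group with the given id, or reuse an already open one.
+ * Groups are shared (and refcounted) across passed-through devices.
+ */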
+static int uml_vfio_open_group(int group_id)
+{
+ struct uml_vfio_group *group;
+ int err;
+
+ guard(mutex)(&uml_vfio_groups_mtx);
+
+ list_for_each_entry(group, &uml_vfio_groups, list) {
+ if (group->id == group_id) {
+ group->users++;
+ return group->fd;
+ }
+ }
+
+ group = kzalloc(sizeof(*group), GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+
+ group->fd = uml_vfio_user_open_group(group_id);
+ if (group->fd < 0) {
+ err = group->fd;
+ goto free_group;
+ }
+
+ err = uml_vfio_set_container(group->fd);
+ if (err)
+ goto close_group;
+
+ group->id = group_id;
+ group->users = 1;
+
+ list_add(&group->list, &uml_vfio_groups);
+
+ return group->fd;
+
+close_group:
+ os_close_file(group->fd);
+free_group:
+ kfree(group);
+ return err;
+}
+
+static int uml_vfio_release_group(int group_fd)
+{
+ struct uml_vfio_group *group;
+
+ guard(mutex)(&uml_vfio_groups_mtx);
+
+ list_for_each_entry(group, &uml_vfio_groups, list) {
+ if (group->fd == group_fd) {
+ group->users--;
+ if (group->users == 0) {
+ uml_vfio_unset_container(group_fd);
+ os_close_file(group_fd);
+ list_del(&group->list);
+ kfree(group);
+ }
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
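+/*
+ * SIGIO-driven handler: drain the non-blocking eventfd and raise, for
+ * each event, the IRQ number the driver wrote into this vector's MSI-X
+ * Message Data field (shadowed in dev->msix_data).
+ */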
+static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
+{
+ struct uml_vfio_intr_ctx *ctx = opaque;
+ struct uml_vfio_device *dev = ctx->dev;
+ int index = ctx - dev->intr_ctx;
+ int irqfd = dev->udev.irqfd[index];
+ int irq = dev->msix_data[index];
+ uint64_t v;
+ int r;
+
+ do {
+ r = os_read_file(irqfd, &v, sizeof(v));
+ if (r == sizeof(v))
+ generic_handle_irq(irq);
+ } while (r == sizeof(v) || r == -EINTR);
+ WARN(r != -EAGAIN, "read returned %d\n", r);
+
+ return IRQ_HANDLED;
+}
+
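+/*
+ * Wire up one MSI-X vector: get its eventfd from VFIO, allocate a UML
+ * read IRQ for it, and add the fd to the SIGIO poll set.
+ */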
+static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
+{
+ struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
+ int err, irqfd;
+
+ if (ctx->irq >= 0)
+ return 0;
+
+ irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
+ if (irqfd < 0)
+ return irqfd;
+
+ ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
+ uml_vfio_interrupt, 0,
+ "vfio-uml", ctx);
+ if (ctx->irq < 0) {
+ err = ctx->irq;
+ goto deactivate;
+ }
+
+ err = add_sigio_fd(irqfd);
+ if (err)
+ goto free_irq;
+
+ return 0;
+
+free_irq:
+ um_free_irq(ctx->irq, ctx);
+ ctx->irq = -1;
+deactivate:
+ uml_vfio_user_deactivate_irq(&dev->udev, index);
+ return err;
+}
+
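+/* Undo uml_vfio_activate_irq(): quiesce the eventfd and free the IRQ. */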
+static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
+{
+ struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
+
+ if (ctx->irq >= 0) {
+ ignore_sigio_fd(dev->udev.irqfd[index]);
+ um_free_irq(ctx->irq, ctx);
+ uml_vfio_user_deactivate_irq(&dev->udev, index);
+ ctx->irq = -1;
+ }
+ return 0;
+}
+
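+/*
+ * Snoop writes to the MSI-X Message Control word so that enabling or
+ * disabling MSI-X resyncs the eventfds registered with VFIO.
+ */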
+static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
+ unsigned int offset, int size,
+ unsigned long val)
+{
+ /*
+ * Here, we handle only the operations we care about,
+ * ignoring the rest.
+ */
+ if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
+ switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
+ case PCI_MSIX_FLAGS_ENABLE:
+ case 0:
+ return uml_vfio_user_update_irqs(&dev->udev);
+ }
+ }
+ return 0;
+}
+
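+/*
+ * Snoop aligned 32-bit writes to the Message Data field of an MSI-X
+ * table entry: record the IRQ number and (de)activate the vector.
+ */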
+static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
+ unsigned int offset, int size,
+ unsigned long val)
+{
+ int index;
+
+ /*
+ * Here, we handle only the operations we care about,
+ * ignoring the rest.
+ */
+ offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;
+
+ if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
+ return 0;
+
+ index = offset / PCI_MSIX_ENTRY_SIZE;
+ if (index >= dev->udev.irq_count)
+ return -EINVAL;
+
+ dev->msix_data[index] = val;
+
+ return val ? uml_vfio_activate_irq(dev, index) :
+ uml_vfio_deactivate_irq(dev, index);
+}
+
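+/* Read a little-endian value of the given size from config space. */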
+static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
+ unsigned int offset, int size)
+{
+ u8 data[8];
+
+ memset(data, 0xff, sizeof(data));
+
+ if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
+ return ULONG_MAX;
+
+ switch (size) {
+ case 1:
+ return data[0];
+ case 2:
+ return le16_to_cpup((void *)data);
+ case 4:
+ return le32_to_cpup((void *)data);
+#ifdef CONFIG_64BIT
+ case 8:
+ return le64_to_cpup((void *)data);
+#endif
+ default:
+ return ULONG_MAX;
+ }
+}
+
+static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
+ unsigned int offset, int size)
+{
+ struct uml_vfio_device *dev = to_vdev(pdev);
+
+ return __uml_vfio_cfgspace_read(dev, offset, size);
+}
+
+static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
+ unsigned int offset, int size,
+ unsigned long val)
+{
+ u8 data[8];
+
+ switch (size) {
+ case 1:
+ data[0] = (u8)val;
+ break;
+ case 2:
+ put_unaligned_le16(val, (void *)data);
+ break;
+ case 4:
+ put_unaligned_le32(val, (void *)data);
+ break;
+#ifdef CONFIG_64BIT
+ case 8:
+ put_unaligned_le64(val, (void *)data);
+ break;
+#endif
+ }
+
+ WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
+}
+
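+/*
+ * Config space write that additionally snoops accesses overlapping the
+ * MSI-X capability.
+ */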
+static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
+ unsigned int offset, int size,
+ unsigned long val)
+{
+ struct uml_vfio_device *dev = to_vdev(pdev);
+
+ if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
+ offset + size > dev->msix_cap)
+ WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));
+
+ __uml_vfio_cfgspace_write(dev, offset, size, val);
+}
+
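+/* BAR read helper: on failure the all-ones fill pattern is left in place. */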
+static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
+ void *buffer, unsigned int offset, int size)
+{
+ struct uml_vfio_device *dev = to_vdev(pdev);
+
+ memset(buffer, 0xff, size);
+ uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
+}
+
+static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
+ unsigned int offset, int size)
+{
+ u8 data[8];
+
+ uml_vfio_bar_copy_from(pdev, bar, data, offset, size);
+
+ switch (size) {
+ case 1:
+ return data[0];
+ case 2:
+ return le16_to_cpup((void *)data);
+ case 4:
+ return le32_to_cpup((void *)data);
+#ifdef CONFIG_64BIT
+ case 8:
+ return le64_to_cpup((void *)data);
+#endif
+ default:
+ return ULONG_MAX;
+ }
+}
+
+static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
+ unsigned int offset, const void *buffer,
+ int size)
+{
+ struct uml_vfio_device *dev = to_vdev(pdev);
+
+ uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
+}
+
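+/*
+ * BAR write that additionally snoops accesses falling inside the MSI-X
+ * table.
+ */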
+static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
+ unsigned int offset, int size,
+ unsigned long val)
+{
+ struct uml_vfio_device *dev = to_vdev(pdev);
+ u8 data[8];
+
+ if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
+ offset < dev->msix_offset + dev->msix_size)
+ WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));
+
+ switch (size) {
+ case 1:
+ data[0] = (u8)val;
+ break;
+ case 2:
+ put_unaligned_le16(val, (void *)data);
+ break;
+ case 4:
+ put_unaligned_le32(val, (void *)data);
+ break;
+#ifdef CONFIG_64BIT
+ case 8:
+ put_unaligned_le64(val, (void *)data);
+ break;
+#endif
+ }
+
+ uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
+}
+
+static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
+ unsigned int offset, u8 value, int size)
+{
+ struct uml_vfio_device *dev = to_vdev(pdev);
+ int i;
+
+ for (i = 0; i < size; i++)
+ uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
+}
+
+static const struct um_pci_ops uml_vfio_um_pci_ops = {
+ .cfgspace_read = uml_vfio_cfgspace_read,
+ .cfgspace_write = uml_vfio_cfgspace_write,
+ .bar_read = uml_vfio_bar_read,
+ .bar_write = uml_vfio_bar_write,
+ .bar_copy_from = uml_vfio_bar_copy_from,
+ .bar_copy_to = uml_vfio_bar_copy_to,
+ .bar_set = uml_vfio_bar_set,
+};
+
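+/*
+ * Minimal pci_find_capability() equivalent that walks the capability
+ * list via the device's own config space accessor, bounded by the
+ * usual TTL to guard against malformed lists.
+ */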
+static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
+{
+ u8 id, pos;
+ u16 ent;
+ int ttl = 48; /* PCI_FIND_CAP_TTL */
+
+ pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));
+
+ while (pos && ttl--) {
+ ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));
+
+ id = ent & 0xff;
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+
+ pos = ent >> 8;
+ }
+
+ return 0;
+}
+
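+/*
+ * Locate the MSI-X capability and record where the table lives (BAR,
+ * offset, size), then allocate the shadow used to track programmed
+ * Message Data values.
+ */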
+static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
+{
+ unsigned int off;
+ u16 flags;
+ u32 tbl;
+
+ off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
+ if (!off)
+ return -ENOTSUPP;
+
+ dev->msix_cap = off;
+
+ tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
+ flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));
+
+ dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
+ dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
+ dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;
+
+ dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
+ if (!dev->msix_data)
+ return -ENOMEM;
+
+ return 0;
+}
+
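+/*
+ * Bring up one passed-through device: join its VFIO group, set up the
+ * userspace device state, parse the MSI-X table and register it as a
+ * UM PCI device. On failure the device is unlinked and freed.
+ */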
+static void uml_vfio_open_device(struct uml_vfio_device *dev)
+{
+ struct uml_vfio_intr_ctx *ctx;
+ int err, group_id, i;
+
+ group_id = uml_vfio_user_get_group_id(dev->name);
+ if (group_id < 0) {
+ pr_err("Failed to get group id (%s), error %d\n",
+ dev->name, group_id);
+ goto free_dev;
+ }
+
+ dev->group = uml_vfio_open_group(group_id);
+ if (dev->group < 0) {
+ pr_err("Failed to open group %d (%s), error %d\n",
+ group_id, dev->name, dev->group);
+ goto free_dev;
+ }
+
+ err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
+ if (err) {
+ pr_err("Failed to setup device (%s), error %d\n",
+ dev->name, err);
+ goto release_group;
+ }
+
+ err = uml_vfio_read_msix_table(dev);
+ if (err) {
+ pr_err("Failed to read MSI-X table (%s), error %d\n",
+ dev->name, err);
+ goto teardown_udev;
+ }
+
+ dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
+ sizeof(struct uml_vfio_intr_ctx),
+ GFP_KERNEL);
+ if (!dev->intr_ctx) {
+ pr_err("Failed to allocate interrupt context (%s)\n",
+ dev->name);
+ goto free_msix;
+ }
+
+ for (i = 0; i < dev->udev.irq_count; i++) {
+ ctx = &dev->intr_ctx[i];
+ ctx->dev = dev;
+ ctx->irq = -1;
+ }
+
+ dev->pdev.ops = &uml_vfio_um_pci_ops;
+
+ err = um_pci_device_register(&dev->pdev);
+ if (err) {
+ pr_err("Failed to register UM PCI device (%s), error %d\n",
+ dev->name, err);
+ goto free_intr_ctx;
+ }
+
+ return;
+
+free_intr_ctx:
+ kfree(dev->intr_ctx);
+free_msix:
+ kfree(dev->msix_data);
+teardown_udev:
+ uml_vfio_user_teardown_device(&dev->udev);
+release_group:
+ uml_vfio_release_group(dev->group);
+free_dev:
+ list_del(&dev->list);
+ kfree(dev->name);
+ kfree(dev);
+}
+
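+/* Tear down a device, reversing the steps of uml_vfio_open_device(). */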
+static void uml_vfio_release_device(struct uml_vfio_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->udev.irq_count; i++)
+ uml_vfio_deactivate_irq(dev, i);
+ uml_vfio_user_update_irqs(&dev->udev);
+
+ um_pci_device_unregister(&dev->pdev);
+ kfree(dev->intr_ctx);
+ kfree(dev->msix_data);
+ uml_vfio_user_teardown_device(&dev->udev);
+ uml_vfio_release_group(dev->group);
+ list_del(&dev->list);
+ kfree(dev->name);
+ kfree(dev);
+}
+
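+/*
+ * "vfio_uml.device=" handler: lazily open the VFIO container, then
+ * queue the named device for uml_vfio_init() to bring up.
+ */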
+static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
+{
+ struct uml_vfio_device *dev;
+ int fd;
+
+ if (uml_vfio_container.fd < 0) {
+ fd = uml_vfio_user_open_container();
+ if (fd < 0)
+ return fd;
+ uml_vfio_container.fd = fd;
+ }
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ dev->name = kstrdup(device, GFP_KERNEL);
+ if (!dev->name) {
+ kfree(dev);
+ return -ENOMEM;
+ }
+
+ list_add_tail(&dev->list, &uml_vfio_devices);
+ return 0;
+}
+
+static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
+{
+ return 0;
+}
+
+static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
+ .set = uml_vfio_cmdline_set,
+ .get = uml_vfio_cmdline_get,
+};
+
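+/*
+ * Example (the addresses are illustrative; the device must already be
+ * bound to vfio-pci on the host):
+ *   vfio_uml.device=0000:01:00.0 vfio_uml.device=0000:02:00.0
+ */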
+device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
+__uml_help(uml_vfio_cmdline_param_ops,
+"vfio_uml.device=<domain:bus:slot.function>\n"
+" Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
+" capable devices are supported, and it is assumed that drivers will\n"
+" use MSI-X. This parameter can be specified multiple times to pass\n"
+" through multiple PCI devices to UML.\n\n"
+);
+
+static int __init uml_vfio_init(void)
+{
+ struct uml_vfio_device *dev, *n;
+
+ sigio_broken();
+
+	/* If opening a device fails, uml_vfio_open_device() unlinks and frees it. */
+ list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
+ uml_vfio_open_device(dev);
+
+ return 0;
+}
+late_initcall(uml_vfio_init);
+
+static void __exit uml_vfio_exit(void)
+{
+ struct uml_vfio_device *dev, *n;
+
+ list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
+ uml_vfio_release_device(dev);
+
+ if (uml_vfio_container.fd >= 0)
+ os_close_file(uml_vfio_container.fd);
+}
+module_exit(uml_vfio_exit);