diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-04 18:42:48 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-04 18:42:48 -0800 |
| commit | a3ebb59eee2e558e8f8f27fc3f75cd367f17cd8e (patch) | |
| tree | 934bbee766e0cce55cfcaec5dcae856c0361af37 /drivers | |
| parent | ce5cfb0fa20dc6454da039612e34325b7b4a8243 (diff) | |
| parent | d721f52e31553a848e0e9947ca15a49c5674aef3 (diff) | |
Merge tag 'vfio-v6.19-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Move libvfio selftest artifacts in preparation of more tightly
coupled integration with KVM selftests (David Matlack)
- Fix comment typo in mtty driver (Chu Guangqing)
- Support for new hardware revision in the hisi_acc vfio-pci variant
driver where the migration registers can now be accessed via the PF.
When enabled for this support, the full BAR can be exposed to the
user (Longfang Liu)
- Fix vfio cdev support for VF token passing, using the correct size
for the kernel structure, thereby actually allowing userspace to
provide a non-zero UUID token. Also set the match token callback for
the hisi_acc, fixing VF token support for this this vfio-pci variant
driver (Raghavendra Rao Ananta)
- Introduce internal callbacks on vfio devices to simplify and
consolidate duplicate code for generating VFIO_DEVICE_GET_REGION_INFO
data, removing various ioctl intercepts with a more structured
solution (Jason Gunthorpe)
- Introduce dma-buf support for vfio-pci devices, allowing MMIO regions
to be exposed through dma-buf objects with lifecycle managed through
move operations. This enables low-level interactions such as a
vfio-pci based SPDK drivers interacting directly with dma-buf capable
RDMA devices to enable peer-to-peer operations. IOMMUFD is also now
able to build upon this support to fill a long standing feature gap
versus the legacy vfio type1 IOMMU backend with an implementation of
P2P support for VM use cases that better manages the lifecycle of the
P2P mapping (Leon Romanovsky, Jason Gunthorpe, Vivek Kasireddy)
- Convert eventfd triggering for error and request signals to use RCU
mechanisms in order to avoid a 3-way lockdep reported deadlock issue
(Alex Williamson)
- Fix a 32-bit overflow introduced via dma-buf support manifesting with
large DMA buffers (Alex Mastro)
- Convert nvgrace-gpu vfio-pci variant driver to insert mappings on
fault rather than at mmap time. This conversion serves both to make
use of huge PFNMAPs but also to both avoid corrected RAS events
during reset by now being subject to vfio-pci-core's use of
unmap_mapping_range(), and to enable a device readiness test after
reset (Ankit Agrawal)
- Refactoring of vfio selftests to support multi-device tests and split
code to provide better separation between IOMMU and device objects.
This work also enables a new test suite addition to measure parallel
device initialization latency (David Matlack)
* tag 'vfio-v6.19-rc1' of https://github.com/awilliam/linux-vfio: (65 commits)
vfio: selftests: Add vfio_pci_device_init_perf_test
vfio: selftests: Eliminate INVALID_IOVA
vfio: selftests: Split libvfio.h into separate header files
vfio: selftests: Move vfio_selftests_*() helpers into libvfio.c
vfio: selftests: Rename vfio_util.h to libvfio.h
vfio: selftests: Stop passing device for IOMMU operations
vfio: selftests: Move IOVA allocator into iova_allocator.c
vfio: selftests: Move IOMMU library code into iommu.c
vfio: selftests: Rename struct vfio_dma_region to dma_region
vfio: selftests: Upgrade driver logging to dev_err()
vfio: selftests: Prefix logs with device BDF where relevant
vfio: selftests: Eliminate overly chatty logging
vfio: selftests: Support multiple devices in the same container/iommufd
vfio: selftests: Introduce struct iommu
vfio: selftests: Rename struct vfio_iommu_mode to iommu_mode
vfio: selftests: Allow passing multiple BDFs on the command line
vfio: selftests: Split run.sh into separate scripts
vfio: selftests: Move run.sh into scripts directory
vfio/nvgrace-gpu: wait for the GPU mem to be ready
vfio/nvgrace-gpu: Inform devmem unmapped after reset
...
Diffstat (limited to 'drivers')
32 files changed, 1613 insertions, 650 deletions
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index be25ecbdba69..f8bfff5dd0bd 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -3032,11 +3032,36 @@ static void qm_put_pci_res(struct hisi_qm *qm) pci_release_mem_regions(pdev); } +static void hisi_mig_region_clear(struct hisi_qm *qm) +{ + u32 val; + + /* Clear migration region set of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val &= ~QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + +static void hisi_mig_region_enable(struct hisi_qm *qm) +{ + u32 val; + + /* Select migration region of PF */ + if (qm->fun_type == QM_HW_PF && qm->ver > QM_HW_V3) { + val = readl(qm->io_base + QM_MIG_REGION_SEL); + val |= QM_MIG_REGION_EN; + writel(val, qm->io_base + QM_MIG_REGION_SEL); + } +} + static void hisi_qm_pci_uninit(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; pci_free_irq_vectors(pdev); + hisi_mig_region_clear(qm); qm_put_pci_res(qm); pci_disable_device(pdev); } @@ -5752,6 +5777,7 @@ int hisi_qm_init(struct hisi_qm *qm) goto err_free_qm_memory; qm_cmd_init(qm); + hisi_mig_region_enable(qm); return 0; @@ -5890,6 +5916,7 @@ static int qm_rebuild_for_resume(struct hisi_qm *qm) } qm_cmd_init(qm); + hisi_mig_region_enable(qm); hisi_qm_dev_err_init(qm); /* Set the doorbell timeout to QM_DB_TIMEOUT_CFG ns. */ writel(QM_DB_TIMEOUT_SET, qm->io_base + QM_DB_TIMEOUT_CFG); diff --git a/drivers/dma-buf/Makefile b/drivers/dma-buf/Makefile index 70ec901edf2c..2008fb7481b3 100644 --- a/drivers/dma-buf/Makefile +++ b/drivers/dma-buf/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := dma-buf.o dma-fence.o dma-fence-array.o dma-fence-chain.o \ - dma-fence-unwrap.o dma-resv.o + dma-fence-unwrap.o dma-resv.o dma-buf-mapping.o obj-$(CONFIG_DMABUF_HEAPS) += dma-heap.o obj-$(CONFIG_DMABUF_HEAPS) += heaps/ obj-$(CONFIG_SYNC_FILE) += sync_file.o diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c new file mode 100644 index 000000000000..b7352e609fbd --- /dev/null +++ b/drivers/dma-buf/dma-buf-mapping.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA BUF Mapping Helpers + * + */ +#include <linux/dma-buf-mapping.h> +#include <linux/dma-resv.h> + +static struct scatterlist *fill_sg_entry(struct scatterlist *sgl, size_t length, + dma_addr_t addr) +{ + unsigned int len, nents; + int i; + + nents = DIV_ROUND_UP(length, UINT_MAX); + for (i = 0; i < nents; i++) { + len = min_t(size_t, length, UINT_MAX); + length -= len; + /* + * DMABUF abuses scatterlist to create a scatterlist + * that does not have any CPU list, only the DMA list. + * Always set the page related values to NULL to ensure + * importers can't use it. The phys_addr based DMA API + * does not require the CPU list for mapping or unmapping. + */ + sg_set_page(sgl, NULL, 0, 0); + sg_dma_address(sgl) = addr + (dma_addr_t)i * UINT_MAX; + sg_dma_len(sgl) = len; + sgl = sg_next(sgl); + } + + return sgl; +} + +static unsigned int calc_sg_nents(struct dma_iova_state *state, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size) +{ + unsigned int nents = 0; + size_t i; + + if (!state || !dma_use_iova(state)) { + for (i = 0; i < nr_ranges; i++) + nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX); + } else { + /* + * In IOVA case, there is only one SG entry which spans + * for whole IOVA address space, but we need to make sure + * that it fits sg->length, maybe we need more. + */ + nents = DIV_ROUND_UP(size, UINT_MAX); + } + + return nents; +} + +/** + * struct dma_buf_dma - holds DMA mapping information + * @sgt: Scatter-gather table + * @state: DMA IOVA state relevant in IOMMU-based DMA + * @size: Total size of DMA transfer + */ +struct dma_buf_dma { + struct sg_table sgt; + struct dma_iova_state *state; + size_t size; +}; + +/** + * dma_buf_phys_vec_to_sgt - Returns the scatterlist table of the attachment + * from arrays of physical vectors. This funciton is intended for MMIO memory + * only. + * @attach: [in] attachment whose scatterlist is to be returned + * @provider: [in] p2pdma provider + * @phys_vec: [in] array of physical vectors + * @nr_ranges: [in] number of entries in phys_vec array + * @size: [in] total size of phys_vec + * @dir: [in] direction of DMA transfer + * + * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR + * on error. May return -EINTR if it is interrupted by a signal. + * + * On success, the DMA addresses and lengths in the returned scatterlist are + * PAGE_SIZE aligned. + * + * A mapping must be unmapped by using dma_buf_free_sgt(). + * + * NOTE: This function is intended for exporters. If direct traffic routing is + * mandatory exporter should call routing pci_p2pdma_map_type() before calling + * this function. + */ +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir) +{ + unsigned int nents, mapped_len = 0; + struct dma_buf_dma *dma; + struct scatterlist *sgl; + dma_addr_t addr; + size_t i; + int ret; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (WARN_ON(!attach || !attach->dmabuf || !provider)) + /* This function is supposed to work on MMIO memory only */ + return ERR_PTR(-EINVAL); + + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) + return ERR_PTR(-ENOMEM); + + switch (pci_p2pdma_map_type(provider, attach->dev)) { + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need in IOVA at all for this flow. + */ + break; + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + dma->state = kzalloc(sizeof(*dma->state), GFP_KERNEL); + if (!dma->state) { + ret = -ENOMEM; + goto err_free_dma; + } + + dma_iova_try_alloc(attach->dev, dma->state, 0, size); + break; + default: + ret = -EINVAL; + goto err_free_dma; + } + + nents = calc_sg_nents(dma->state, phys_vec, nr_ranges, size); + ret = sg_alloc_table(&dma->sgt, nents, GFP_KERNEL | __GFP_ZERO); + if (ret) + goto err_free_state; + + sgl = dma->sgt.sgl; + + for (i = 0; i < nr_ranges; i++) { + if (!dma->state) { + addr = pci_p2pdma_bus_addr_map(provider, + phys_vec[i].paddr); + } else if (dma_use_iova(dma->state)) { + ret = dma_iova_link(attach->dev, dma->state, + phys_vec[i].paddr, 0, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + if (ret) + goto err_unmap_dma; + + mapped_len += phys_vec[i].len; + } else { + addr = dma_map_phys(attach->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, + DMA_ATTR_MMIO); + ret = dma_mapping_error(attach->dev, addr); + if (ret) + goto err_unmap_dma; + } + + if (!dma->state || !dma_use_iova(dma->state)) + sgl = fill_sg_entry(sgl, phys_vec[i].len, addr); + } + + if (dma->state && dma_use_iova(dma->state)) { + WARN_ON_ONCE(mapped_len != size); + ret = dma_iova_sync(attach->dev, dma->state, 0, mapped_len); + if (ret) + goto err_unmap_dma; + + sgl = fill_sg_entry(sgl, mapped_len, dma->state->addr); + } + + dma->size = size; + + /* + * No CPU list included — set orig_nents = 0 so others can detect + * this via SG table (use nents only). + */ + dma->sgt.orig_nents = 0; + + + /* + * SGL must be NULL to indicate that SGL is the last one + * and we allocated correct number of entries in sg_alloc_table() + */ + WARN_ON_ONCE(sgl); + return &dma->sgt; + +err_unmap_dma: + if (!i || !dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, mapped_len, dir, + DMA_ATTR_MMIO); + } else { + for_each_sgtable_dma_sg(&dma->sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + sg_free_table(&dma->sgt); +err_free_state: + kfree(dma->state); +err_free_dma: + kfree(dma); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_NS_GPL(dma_buf_phys_vec_to_sgt, "DMA_BUF"); + +/** + * dma_buf_free_sgt- unmaps the buffer + * @attach: [in] attachment to unmap buffer from + * @sgt: [in] scatterlist info of the buffer to unmap + * @dir: [in] direction of DMA transfer + * + * This unmaps a DMA mapping for @attached obtained + * by dma_buf_phys_vec_to_sgt(). + */ +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir) +{ + struct dma_buf_dma *dma = container_of(sgt, struct dma_buf_dma, sgt); + int i; + + dma_resv_assert_held(attach->dmabuf->resv); + + if (!dma->state) { + ; /* Do nothing */ + } else if (dma_use_iova(dma->state)) { + dma_iova_destroy(attach->dev, dma->state, dma->size, dir, + DMA_ATTR_MMIO); + } else { + struct scatterlist *sgl; + + for_each_sgtable_dma_sg(sgt, sgl, i) + dma_unmap_phys(attach->dev, sg_dma_address(sgl), + sg_dma_len(sgl), dir, DMA_ATTR_MMIO); + } + + sg_free_table(sgt); + kfree(dma->state); + kfree(dma); + +} +EXPORT_SYMBOL_NS_GPL(dma_buf_free_sgt, "DMA_BUF"); diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index bbeba0d3fca8..3abc9206f1a8 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1141,6 +1141,122 @@ static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags, return func(vgpu, index, start, count, flags, data); } +static int intel_vgpu_ioctl_get_region_info(struct vfio_device *vfio_dev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_region_info_cap_sparse_mmap *sparse = NULL; + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + int nr_areas = 1; + int cap_type_id; + unsigned int i; + int ret; + + switch (info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->gvt->device_info.cfg_space_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->cfg_space.bar[info->index].size; + if (!info->size) { + info->flags = 0; + break; + } + + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR1_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + break; + case VFIO_PCI_BAR2_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = VFIO_REGION_INFO_FLAG_CAPS | + VFIO_REGION_INFO_FLAG_MMAP | + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + info->size = gvt_aperture_sz(vgpu->gvt); + + sparse = kzalloc(struct_size(sparse, areas, nr_areas), + GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + sparse->nr_areas = nr_areas; + cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->areas[0].offset = + PAGE_ALIGN(vgpu_aperture_offset(vgpu)); + sparse->areas[0].size = vgpu_aperture_sz(vgpu); + break; + + case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + + gvt_dbg_core("get region info bar:%d\n", info->index); + break; + + case VFIO_PCI_ROM_REGION_INDEX: + case VFIO_PCI_VGA_REGION_INDEX: + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0; + info->flags = 0; + + gvt_dbg_core("get region info index:%d\n", info->index); + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; + + if (info->index >= VFIO_PCI_NUM_REGIONS + vgpu->num_regions) + return -EINVAL; + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vgpu->num_regions); + + i = info->index - VFIO_PCI_NUM_REGIONS; + + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vgpu->region[i].size; + info->flags = vgpu->region[i].flags; + + cap_type.type = vgpu->region[i].type; + cap_type.subtype = vgpu->region[i].subtype; + + ret = vfio_info_add_capability(caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + } + } + + if ((info->flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { + ret = -EINVAL; + if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) { + ret = vfio_info_add_capability( + caps, &sparse->header, + struct_size(sparse, areas, sparse->nr_areas)); + } + if (ret) { + kfree(sparse); + return ret; + } + } + + kfree(sparse); + return 0; +} + static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, unsigned long arg) { @@ -1169,152 +1285,6 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned int i; - int ret; - struct vfio_region_info_cap_sparse_mmap *sparse = NULL; - int nr_areas = 1; - int cap_type_id; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->gvt->device_info.cfg_space_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->cfg_space.bar[info.index].size; - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR1_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - break; - case VFIO_PCI_BAR2_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = VFIO_REGION_INFO_FLAG_CAPS | - VFIO_REGION_INFO_FLAG_MMAP | - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - info.size = gvt_aperture_sz(vgpu->gvt); - - sparse = kzalloc(struct_size(sparse, areas, nr_areas), - GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = nr_areas; - cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->areas[0].offset = - PAGE_ALIGN(vgpu_aperture_offset(vgpu)); - sparse->areas[0].size = vgpu_aperture_sz(vgpu); - break; - - case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info bar:%d\n", info.index); - break; - - case VFIO_PCI_ROM_REGION_INDEX: - case VFIO_PCI_VGA_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0; - info.flags = 0; - - gvt_dbg_core("get region info index:%d\n", info.index); - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= VFIO_PCI_NUM_REGIONS + - vgpu->num_regions) - return -EINVAL; - info.index = - array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vgpu->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = - VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vgpu->region[i].size; - info.flags = vgpu->region[i].flags; - - cap_type.type = vgpu->region[i].type; - cap_type.subtype = vgpu->region[i].subtype; - - ret = vfio_info_add_capability(&caps, - &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - } - } - - if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) { - ret = -EINVAL; - if (cap_type_id == VFIO_REGION_INFO_CAP_SPARSE_MMAP) - ret = vfio_info_add_capability(&caps, - &sparse->header, - struct_size(sparse, areas, - sparse->nr_areas)); - if (ret) { - kfree(sparse); - return ret; - } - } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - kfree(sparse); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - - kfree(caps.buf); - } - - kfree(sparse); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -1477,6 +1447,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .write = intel_vgpu_write, .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, + .get_region_info_caps = intel_vgpu_ioctl_get_region_info, .dma_unmap = intel_vgpu_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f1fb27681b0b..c92088855450 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1439,8 +1439,8 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, * as a bus address, __finalise_sg() will copy the dma * address into the output segment. */ - s->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, - sg_phys(s)); + s->dma_address = pci_p2pdma_bus_addr_map( + p2pdma_state.mem, sg_phys(s)); sg_dma_len(s) = sg->length; sg_dma_mark_bus_address(s); continue; diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 78e108e47254..981a76b6b7c0 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -25,12 +25,12 @@ struct pci_p2pdma { struct gen_pool *pool; bool p2pmem_published; struct xarray map_types; + struct p2pdma_provider mem[PCI_STD_NUM_BARS]; }; struct pci_p2pdma_pagemap { - struct pci_dev *provider; - u64 bus_offset; struct dev_pagemap pgmap; + struct p2pdma_provider *mem; }; static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap) @@ -204,8 +204,8 @@ static void p2pdma_page_free(struct page *page) { struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page)); /* safe to dereference while a reference is held to the percpu ref */ - struct pci_p2pdma *p2pdma = - rcu_dereference_protected(pgmap->provider->p2pdma, 1); + struct pci_p2pdma *p2pdma = rcu_dereference_protected( + to_pci_dev(pgmap->mem->owner)->p2pdma, 1); struct percpu_ref *ref; gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page), @@ -228,56 +228,136 @@ static void pci_p2pdma_release(void *data) /* Flush and disable pci_alloc_p2p_mem() */ pdev->p2pdma = NULL; - synchronize_rcu(); + if (p2pdma->pool) + synchronize_rcu(); + xa_destroy(&p2pdma->map_types); + + if (!p2pdma->pool) + return; gen_pool_destroy(p2pdma->pool); sysfs_remove_group(&pdev->dev.kobj, &p2pmem_group); - xa_destroy(&p2pdma->map_types); } -static int pci_p2pdma_setup(struct pci_dev *pdev) +/** + * pcim_p2pdma_init - Initialise peer-to-peer DMA providers + * @pdev: The PCI device to enable P2PDMA for + * + * This function initializes the peer-to-peer DMA infrastructure + * for a PCI device. It allocates and sets up the necessary data + * structures to support P2PDMA operations, including mapping type + * tracking. + */ +int pcim_p2pdma_init(struct pci_dev *pdev) { - int error = -ENOMEM; struct pci_p2pdma *p2p; + int i, ret; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2p) + return 0; p2p = devm_kzalloc(&pdev->dev, sizeof(*p2p), GFP_KERNEL); if (!p2p) return -ENOMEM; xa_init(&p2p->map_types); + /* + * Iterate over all standard PCI BARs and record only those that + * correspond to MMIO regions. Skip non-memory resources (e.g. I/O + * port BARs) since they cannot be used for peer-to-peer (P2P) + * transactions. + */ + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM)) + continue; - p2p->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); - if (!p2p->pool) - goto out; + p2p->mem[i].owner = &pdev->dev; + p2p->mem[i].bus_offset = + pci_bus_address(pdev, i) - pci_resource_start(pdev, i); + } - error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); - if (error) - goto out_pool_destroy; + ret = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_release, pdev); + if (ret) + goto out_p2p; - error = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); - if (error) + rcu_assign_pointer(pdev->p2pdma, p2p); + return 0; + +out_p2p: + devm_kfree(&pdev->dev, p2p); + return ret; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_init); + +/** + * pcim_p2pdma_provider - Get peer-to-peer DMA provider + * @pdev: The PCI device to enable P2PDMA for + * @bar: BAR index to get provider + * + * This function gets peer-to-peer DMA provider for a PCI device. The lifetime + * of the provider (and of course the MMIO) is bound to the lifetime of the + * driver. A driver calling this function must ensure that all references to the + * provider, and any DMA mappings created for any MMIO, are all cleaned up + * before the driver remove() completes. + * + * Since P2P is almost always shared with a second driver this means some system + * to notify, invalidate and revoke the MMIO's DMA must be in place to use this + * function. For example a revoke can be built using DMABUF. + */ +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar) +{ + struct pci_p2pdma *p2p; + + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) + return NULL; + + p2p = rcu_dereference_protected(pdev->p2pdma, 1); + if (WARN_ON(!p2p)) + /* Someone forgot to call to pcim_p2pdma_init() before */ + return NULL; + + return &p2p->mem[bar]; +} +EXPORT_SYMBOL_GPL(pcim_p2pdma_provider); + +static int pci_p2pdma_setup_pool(struct pci_dev *pdev) +{ + struct pci_p2pdma *p2pdma; + int ret; + + p2pdma = rcu_dereference_protected(pdev->p2pdma, 1); + if (p2pdma->pool) + /* We already setup pools, do nothing, */ + return 0; + + p2pdma->pool = gen_pool_create(PAGE_SHIFT, dev_to_node(&pdev->dev)); + if (!p2pdma->pool) + return -ENOMEM; + + ret = sysfs_create_group(&pdev->dev.kobj, &p2pmem_group); + if (ret) goto out_pool_destroy; - rcu_assign_pointer(pdev->p2pdma, p2p); return 0; out_pool_destroy: - gen_pool_destroy(p2p->pool); -out: - devm_kfree(&pdev->dev, p2p); - return error; + gen_pool_destroy(p2pdma->pool); + p2pdma->pool = NULL; + return ret; } static void pci_p2pdma_unmap_mappings(void *data) { - struct pci_dev *pdev = data; + struct pci_p2pdma_pagemap *p2p_pgmap = data; /* * Removing the alloc attribute from sysfs will call * unmap_mapping_range() on the inode, teardown any existing userspace * mappings and prevent new ones from being created. */ - sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr, + sysfs_remove_file_from_group(&p2p_pgmap->mem->owner->kobj, + &p2pmem_alloc_attr.attr, p2pmem_group.name); } @@ -295,6 +375,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { struct pci_p2pdma_pagemap *p2p_pgmap; + struct p2pdma_provider *mem; struct dev_pagemap *pgmap; struct pci_p2pdma *p2pdma; void *addr; @@ -312,11 +393,21 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, if (size + offset > pci_resource_len(pdev, bar)) return -EINVAL; - if (!pdev->p2pdma) { - error = pci_p2pdma_setup(pdev); - if (error) - return error; - } + error = pcim_p2pdma_init(pdev); + if (error) + return error; + + error = pci_p2pdma_setup_pool(pdev); + if (error) + return error; + + mem = pcim_p2pdma_provider(pdev, bar); + /* + * We checked validity of BAR prior to call + * to pcim_p2pdma_provider. It should never return NULL. + */ + if (WARN_ON(!mem)) + return -EINVAL; p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL); if (!p2p_pgmap) @@ -328,10 +419,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->nr_range = 1; pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->ops = &p2pdma_pgmap_ops; - - p2p_pgmap->provider = pdev; - p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) - - pci_resource_start(pdev, bar); + p2p_pgmap->mem = mem; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { @@ -340,7 +428,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, } error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings, - pdev); + p2p_pgmap); if (error) goto pages_free; @@ -972,16 +1060,26 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish) } EXPORT_SYMBOL_GPL(pci_p2pmem_publish); -static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, - struct device *dev) +/** + * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers + * @provider: P2PDMA provider structure + * @dev: Target device for the transfer + * + * Determines how peer-to-peer DMA transfers should be mapped between + * the provider and the target device. The mapping type indicates whether + * the transfer can be done directly through PCI switches or must go + * through the host bridge. + */ +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev) { enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED; - struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider; + struct pci_dev *pdev = to_pci_dev(provider->owner); struct pci_dev *client; struct pci_p2pdma *p2pdma; int dist; - if (!provider->p2pdma) + if (!pdev->p2pdma) return PCI_P2PDMA_MAP_NOT_SUPPORTED; if (!dev_is_pci(dev)) @@ -990,7 +1088,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, client = to_pci_dev(dev); rcu_read_lock(); - p2pdma = rcu_dereference(provider->p2pdma); + p2pdma = rcu_dereference(pdev->p2pdma); if (p2pdma) type = xa_to_value(xa_load(&p2pdma->map_types, @@ -998,7 +1096,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, rcu_read_unlock(); if (type == PCI_P2PDMA_MAP_UNKNOWN) - return calc_map_type_and_dist(provider, client, &dist, true); + return calc_map_type_and_dist(pdev, client, &dist, true); return type; } @@ -1006,9 +1104,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap, void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { - state->pgmap = page_pgmap(page); - state->map = pci_p2pdma_map_type(state->pgmap, dev); - state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; + struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page)); + + if (state->mem == p2p_pgmap->mem) + return; + + state->mem = p2p_pgmap->mem; + state->map = pci_p2pdma_map_type(p2p_pgmap->mem, dev); } /** diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index ea532a8a4a0c..a596f6013019 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -313,10 +313,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private, return 0; } -static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, - struct vfio_region_info *info, - unsigned long arg) +static int vfio_ccw_mdev_ioctl_get_region_info(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); int i; switch (info->index) { @@ -328,7 +330,6 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, return 0; default: /* all other regions are handled via capability chain */ { - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_type cap_type = { .header.id = VFIO_REGION_INFO_CAP_TYPE, .header.version = 1 }; @@ -351,27 +352,10 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, cap_type.type = private->region[i].type; cap_type.subtype = private->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; - - info->flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info->argsz < sizeof(*info) + caps.size) { - info->argsz = sizeof(*info) + caps.size; - info->cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(*info)); - if (copy_to_user((void __user *)arg + sizeof(*info), - caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info->cap_offset = sizeof(*info); - } - - kfree(caps.buf); - } } return 0; @@ -532,24 +516,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - ret = vfio_ccw_mdev_get_region_info(private, &info, arg); - if (ret) - return ret; - - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -627,6 +593,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .read = vfio_ccw_mdev_read, .write = vfio_ccw_mdev_write, .ioctl = vfio_ccw_mdev_ioctl, + .get_region_info_caps = vfio_ccw_mdev_ioctl_get_region_info, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, .bind_iommufd = vfio_iommufd_emulated_bind, diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 5dd5f5ad7686..253031b86b60 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -129,28 +129,22 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, - struct vfio_region_info __user *arg) +static int vfio_cdx_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); - struct vfio_region_info info; - - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= cdx_dev->res_count) + if (info->index >= cdx_dev->res_count) return -EINVAL; /* map offset to the physical address */ - info.offset = vfio_cdx_index_to_offset(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + info->offset = vfio_cdx_index_to_offset(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; } static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, @@ -219,8 +213,6 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, switch (cmd) { case VFIO_DEVICE_GET_INFO: return vfio_cdx_ioctl_get_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_cdx_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_GET_IRQ_INFO: return vfio_cdx_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_SET_IRQS: @@ -284,6 +276,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .get_region_info_caps = vfio_cdx_ioctl_get_region_info, .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 480cac3a0c27..8ceca24ac136 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -99,7 +99,7 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, return ret; if (user_size < minsz) return -EINVAL; - ret = copy_struct_from_user(&bind, minsz, arg, user_size); + ret = copy_struct_from_user(&bind, sizeof(bind), arg, user_size); if (ret) return ret; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 76ccbab0e3d6..ba47100f28c1 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -117,6 +117,24 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) fsl_mc_cleanup_irq_pool(mc_cont); } +static int vfio_fsl_mc_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = vdev->mc_dev; + + if (info->index >= mc_dev->obj_desc.region_count) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} + static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -149,30 +167,6 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } - case VFIO_DEVICE_GET_REGION_INFO: - { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= mc_dev->obj_desc.region_count) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - if (copy_to_user((void __user *)arg, &info, minsz)) - return -EFAULT; - return 0; - } case VFIO_DEVICE_GET_IRQ_INFO: { struct vfio_irq_info info; @@ -589,6 +583,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .open_device = vfio_fsl_mc_open_device, .close_device = vfio_fsl_mc_close_device, .ioctl = vfio_fsl_mc_ioctl, + .get_region_info_caps = vfio_fsl_mc_ioctl_get_region_info, .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f54665..2b9fca00e9e8 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,9 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF + def_bool y if VFIO_PCI_CORE && PCI_P2PDMA && DMA_SHARED_BUFFER + source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c..53f59226ae01 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,6 +2,7 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o vfio-pci-y := vfio_pci.o diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fde33f54e99e..cf45f6370c36 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -125,9 +125,25 @@ static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) return 0; } +static void qm_xqc_reg_offsets(struct hisi_qm *qm, + u32 *eqc_addr, u32 *aeqc_addr) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + container_of(qm, struct hisi_acc_vf_core_device, vf_qm); + + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) { + *eqc_addr = QM_EQC_VF_DW0; + *aeqc_addr = QM_AEQC_VF_DW0; + } else { + *eqc_addr = QM_EQC_PF_DW0; + *aeqc_addr = QM_AEQC_PF_DW0; + } +} + static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; ret = qm_read_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -167,15 +183,16 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_read_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_read_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_read_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to read QM_AEQC_DW\n"); return ret; @@ -187,6 +204,7 @@ static int qm_get_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) { struct device *dev = &qm->pdev->dev; + u32 eqc_addr, aeqc_addr; int ret; /* Check VF state */ @@ -239,15 +257,16 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) return ret; } + qm_xqc_reg_offsets(qm, &eqc_addr, &aeqc_addr); /* QM_EQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_EQC_DW0, vf_data->qm_eqc_dw, 7); + ret = qm_write_regs(qm, eqc_addr, vf_data->qm_eqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_EQC_DW\n"); return ret; } /* QM_AEQC_DW has 7 regs */ - ret = qm_write_regs(qm, QM_AEQC_DW0, vf_data->qm_aeqc_dw, 7); + ret = qm_write_regs(qm, aeqc_addr, vf_data->qm_aeqc_dw, 7); if (ret) { dev_err(dev, "failed to write QM_AEQC_DW\n"); return ret; @@ -1186,34 +1205,52 @@ static int hisi_acc_vf_qm_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct vfio_pci_core_device *vdev = &hisi_acc_vdev->core_device; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + struct hisi_qm *pf_qm = hisi_acc_vdev->pf_qm; struct pci_dev *vf_dev = vdev->pdev; + u32 val; - /* - * ACC VF dev BAR2 region consists of both functional register space - * and migration control register space. For migration to work, we - * need access to both. Hence, we map the entire BAR2 region here. - * But unnecessarily exposing the migration BAR region to the Guest - * has the potential to prevent/corrupt the Guest migration. Hence, - * we restrict access to the migration control space from - * Guest(Please see mmap/ioctl/read/write override functions). - * - * Please note that it is OK to expose the entire VF BAR if migration - * is not supported or required as this cannot affect the ACC PF - * configurations. - * - * Also the HiSilicon ACC VF devices supported by this driver on - * HiSilicon hardware platforms are integrated end point devices - * and the platform lacks the capability to perform any PCIe P2P - * between these devices. - */ + val = readl(pf_qm->io_base + QM_MIG_REGION_SEL); + if (pf_qm->ver > QM_HW_V3 && (val & QM_MIG_REGION_EN)) + hisi_acc_vdev->drv_mode = HW_ACC_MIG_PF_CTRL; + else + hisi_acc_vdev->drv_mode = HW_ACC_MIG_VF_CTRL; - vf_qm->io_base = - ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), - pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); - if (!vf_qm->io_base) - return -EIO; + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_PF_CTRL) { + /* + * On hardware platforms greater than QM_HW_V3, the migration function + * register is placed in the BAR2 configuration region of the PF, + * and each VF device occupies 8KB of configuration space. + */ + vf_qm->io_base = pf_qm->io_base + QM_MIG_REGION_OFFSET + + hisi_acc_vdev->vf_id * QM_MIG_REGION_SIZE; + } else { + /* + * ACC VF dev BAR2 region consists of both functional register space + * and migration control register space. For migration to work, we + * need access to both. Hence, we map the entire BAR2 region here. + * But unnecessarily exposing the migration BAR region to the Guest + * has the potential to prevent/corrupt the Guest migration. Hence, + * we restrict access to the migration control space from + * Guest(Please see mmap/ioctl/read/write override functions). + * + * Please note that it is OK to expose the entire VF BAR if migration + * is not supported or required as this cannot affect the ACC PF + * configurations. + * + * Also the HiSilicon ACC VF devices supported by this driver on + * HiSilicon hardware platforms are integrated end point devices + * and the platform lacks the capability to perform any PCIe P2P + * between these devices. + */ + vf_qm->io_base = + ioremap(pci_resource_start(vf_dev, VFIO_PCI_BAR2_REGION_INDEX), + pci_resource_len(vf_dev, VFIO_PCI_BAR2_REGION_INDEX)); + if (!vf_qm->io_base) + return -EIO; + } vf_qm->fun_type = QM_HW_VF; + vf_qm->ver = pf_qm->ver; vf_qm->pdev = vf_dev; mutex_init(&vf_qm->mailbox_lock); @@ -1250,6 +1287,28 @@ static struct hisi_qm *hisi_acc_get_pf_qm(struct pci_dev *pdev) return !IS_ERR(pf_qm) ? pf_qm : NULL; } +static size_t hisi_acc_get_resource_len(struct vfio_pci_core_device *vdev, + unsigned int index) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = + hisi_acc_drvdata(vdev->pdev); + + /* + * On the old HW_ACC_MIG_VF_CTRL mode device, the ACC VF device + * BAR2 region encompasses both functional register space + * and migration control register space. + * only the functional region should be report to Guest. + */ + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + return (pci_resource_len(vdev->pdev, index) >> 1); + /* + * On the new HW device, the migration control register + * has been moved to the PF device BAR2 region. + * The VF device BAR2 is entirely functional register space. + */ + return pci_resource_len(vdev->pdev, index); +} + static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, size_t count, loff_t *ppos, size_t *new_count) @@ -1260,8 +1319,9 @@ static int hisi_acc_pci_rw_access_check(struct vfio_device *core_vdev, if (index == VFIO_PCI_BAR2_REGION_INDEX) { loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); /* Check if access is for migration control region */ if (pos >= end) return -EINVAL; @@ -1282,8 +1342,9 @@ static int hisi_acc_vfio_pci_mmap(struct vfio_device *core_vdev, index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); if (index == VFIO_PCI_BAR2_REGION_INDEX) { u64 req_len, pgoff, req_start; - resource_size_t end = pci_resource_len(vdev->pdev, index) / 2; + resource_size_t end; + end = hisi_acc_get_resource_len(vdev, index); req_len = vma->vm_end - vma->vm_start; pgoff = vma->vm_pgoff & ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); @@ -1324,43 +1385,23 @@ static ssize_t hisi_acc_vfio_pci_read(struct vfio_device *core_vdev, return vfio_pci_core_read(core_vdev, buf, new_count, ppos); } -static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int hisi_acc_vfio_ioctl_get_region(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - unsigned long minsz; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); - if (info.index == VFIO_PCI_BAR2_REGION_INDEX) { - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + if (info->index != VFIO_PCI_BAR2_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - /* - * ACC VF dev BAR2 region consists of both functional - * register space and migration control register space. - * Report only the functional region to Guest. - */ - info.size = pci_resource_len(pdev, info.index) / 2; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP; + info->size = hisi_acc_get_resource_len(vdev, info->index); - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } - } - return vfio_pci_core_ioctl(core_vdev, cmd, arg); + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + return 0; } static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) @@ -1521,7 +1562,8 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; - iounmap(vf_qm->io_base); + if (hisi_acc_vdev->drv_mode == HW_ACC_MIG_VF_CTRL) + iounmap(vf_qm->io_base); mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } @@ -1557,13 +1599,15 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, - .ioctl = hisi_acc_vfio_pci_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = hisi_acc_vfio_ioctl_get_region, .device_feature = vfio_pci_core_ioctl_feature, .read = hisi_acc_vfio_pci_read, .write = hisi_acc_vfio_pci_write, .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -1577,6 +1621,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 91002ceeebc1..cd55eba64dfb 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -50,8 +50,10 @@ #define QM_QUE_ISO_CFG_V 0x0030 #define QM_PAGE_SIZE 0x0034 -#define QM_EQC_DW0 0X8000 -#define QM_AEQC_DW0 0X8020 +#define QM_EQC_VF_DW0 0X8000 +#define QM_AEQC_VF_DW0 0X8020 +#define QM_EQC_PF_DW0 0x1c00 +#define QM_AEQC_PF_DW0 0x1c20 #define ACC_DRV_MAJOR_VER 1 #define ACC_DRV_MINOR_VER 0 @@ -59,6 +61,22 @@ #define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC #define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE +#define QM_MIG_REGION_OFFSET 0x180000 +#define QM_MIG_REGION_SIZE 0x2000 + +/** + * On HW_ACC_MIG_VF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the latter 32KB of the VF's BAR2. + * The Guest is only provided with the first 32KB of the VF's BAR2. + * On HW_ACC_MIG_PF_CTRL mode, the configuration domain supporting live + * migration functionality is located in the PF's BAR2, and the entire 64KB + * of the VF's BAR2 is allocated to the Guest. + */ +enum hw_drv_mode { + HW_ACC_MIG_VF_CTRL = 0, + HW_ACC_MIG_PF_CTRL, +}; + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ @@ -125,6 +143,7 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + enum hw_drv_mode drv_mode; /* * vf_qm_state represents the QM_VF_STATE register value. * It is set by Guest driver for the ACC VF dev indicating diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ec47e736a8e..9c5970411d07 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1366,6 +1366,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e346392b72f6..84d142a47ec6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,8 @@ #include <linux/vfio_pci_core.h> #include <linux/delay.h> #include <linux/jiffies.h> +#include <linux/pci-p2pdma.h> +#include <linux/pm_runtime.h> /* * The device memory usable to the workloads running in the VM is cached @@ -58,6 +60,8 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -102,6 +106,19 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. + */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + vfio_pci_core_finish_enable(vdev); return 0; @@ -130,6 +147,106 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. + */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -137,10 +254,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -157,17 +272,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. @@ -184,56 +300,31 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. - * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_pfn; + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + vma->vm_private_data = nvdev; return 0; } -static long -nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned long arg) +static int nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; struct vfio_region_info_cap_sparse_mmap *sparse; - struct vfio_region_info info; struct mem_region *memregion; u32 size; int ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - /* * Request to determine the BAR region information. Send the * GPU memory information. */ - memregion = nvgrace_gpu_memregion(info.index, nvdev); + memregion = nvgrace_gpu_memregion(info->index, nvdev); if (!memregion) - return vfio_pci_core_ioctl(core_vdev, - VFIO_DEVICE_GET_REGION_INFO, arg); + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); size = struct_size(sparse, areas, 1); @@ -252,49 +343,28 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; sparse->header.version = 1; - ret = vfio_info_add_capability(&caps, &sparse->header, size); + ret = vfio_info_add_capability(caps, &sparse->header, size); kfree(sparse); if (ret) return ret; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); /* * The region memory size may not be power-of-2 aligned. * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ - info.size = memregion->bar_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | + info->size = memregion->bar_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE | VFIO_REGION_INFO_FLAG_MMAP; - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); - } - kfree(caps.buf); - } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + return 0; } static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); case VFIO_DEVICE_IOEVENTFD: return -ENOTTY; case VFIO_DEVICE_RESET: @@ -510,6 +580,7 @@ static ssize_t nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct mem_region *memregion; @@ -536,9 +607,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } /* * Only the device memory present on the hardware is mapped, which may @@ -563,9 +640,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); @@ -627,6 +711,7 @@ static ssize_t nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, size_t count, loff_t *ppos, const char __user *buf) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; struct mem_region *memregion; @@ -656,9 +741,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } exitfn: *ppos += count; @@ -672,10 +763,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); @@ -683,6 +781,50 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, return vfio_pci_core_write(core_vdev, buf, count, ppos); } +static int nvgrace_get_dmabuf_phys(struct vfio_pci_core_device *core_vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct nvgrace_gpu_pci_core_device *nvdev = container_of( + core_vdev, struct nvgrace_gpu_pci_core_device, core_device); + struct pci_dev *pdev = core_vdev->pdev; + struct mem_region *mem_region; + + /* + * if (nvdev->resmem.memlength && region_index == RESMEM_REGION_INDEX) { + * The P2P properties of the non-BAR memory is the same as the + * BAR memory, so just use the provider for index 0. Someday + * when CXL gets P2P support we could create CXLish providers + * for the non-BAR memory. + * } else if (region_index == USEMEM_REGION_INDEX) { + * This is actually cachable memory and isn't treated as P2P in + * the chip. For now we have no way to push cachable memory + * through everything and the Grace HW doesn't care what caching + * attribute is programmed into the SMMU. So use BAR 0. + * } + */ + mem_region = nvgrace_gpu_memregion(region_index, nvdev); + if (mem_region) { + *provider = pcim_p2pdma_provider(pdev, 0); + if (!*provider) + return -EINVAL; + return vfio_pci_core_fill_phys_vec(phys_vec, dma_ranges, + nr_ranges, + mem_region->memphys, + mem_region->memlength); + } + + return vfio_pci_core_get_dmabuf_phys(core_vdev, provider, region_index, + phys_vec, dma_ranges, nr_ranges); +} + +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_ops = { + .get_dmabuf_phys = nvgrace_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .name = "nvgrace-gpu-vfio-pci", .init = vfio_pci_core_init_dev, @@ -690,6 +832,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .open_device = nvgrace_gpu_open_device, .close_device = nvgrace_gpu_close_device, .ioctl = nvgrace_gpu_ioctl, + .get_region_info_caps = nvgrace_gpu_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = nvgrace_gpu_read, .write = nvgrace_gpu_write, @@ -703,6 +846,10 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static const struct vfio_pci_device_ops nvgrace_gpu_pci_dev_core_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .name = "nvgrace-gpu-vfio-pci-core", .init = vfio_pci_core_init_dev, @@ -710,6 +857,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .open_device = nvgrace_gpu_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -893,11 +1041,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * Ensure that the BAR0 region is enabled before accessing the * registers. */ -static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) { - unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); void __iomem *io; - int ret = -ETIME; + int ret; ret = pci_enable_device(pdev); if (ret) @@ -913,16 +1060,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) goto iomap_exit; } - do { - if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && - (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { - ret = 0; - goto reg_check_exit; - } - msleep(POLL_QUANTUM_MS); - } while (!time_after(jiffies, timeout)); + ret = nvgrace_gpu_wait_device_ready(io); -reg_check_exit: pci_iounmap(pdev, io); iomap_exit: pci_release_selected_regions(pdev, 1 << 0); @@ -939,7 +1078,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; - ret = nvgrace_gpu_wait_device_ready(pdev); + ret = nvgrace_gpu_probe_check_device_ready(pdev); if (ret) return ret; @@ -965,6 +1104,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, memphys, memlength); if (ret) goto out_put_vdev; + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_ops; + } else { + nvdev->core_device.pci_ops = &nvgrace_gpu_pci_dev_core_ops; } ret = vfio_pci_core_register_device(&nvdev->core_device); @@ -1002,12 +1144,38 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); +/* + * The GPU reset is required to be serialized against the *first* mapping + * faults and read/writes accesses to prevent potential RAS events logging. + * + * First fault or access after a reset needs to poll device readiness, + * flag that a reset has occurred. The readiness test is done by holding + * the memory_lock read lock and we expect all vfio-pci initiated resets to + * hold the memory_lock write lock to avoid races. However, .reset_done + * extends beyond the scope of vfio-pci initiated resets therefore we + * cannot assert this behavior and use lockdep_assert_held_write. + */ +static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + + nvdev->reset_done = true; +} + +static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { + .reset_done = nvgrace_gpu_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { .name = KBUILD_MODNAME, .id_table = nvgrace_gpu_vfio_pci_table, .probe = nvgrace_gpu_probe, .remove = nvgrace_gpu_remove, - .err_handler = &vfio_pci_core_err_handlers, + .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, .driver_managed_dma = true, }; diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f3ccb0008f67..be103c74e969 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -195,6 +195,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .open_device = pds_vfio_open_device, .close_device = pds_vfio_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index a19b68043eb2..8fbdf7c6d666 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -609,6 +609,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .open_device = qat_vf_pci_open_device, .close_device = qat_vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .read = vfio_pci_core_read, .write = vfio_pci_core_write, .mmap = vfio_pci_core_mmap, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac10f14417f2..0c771064c0b8 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -132,6 +132,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -147,6 +148,10 @@ static const struct vfio_device_ops vfio_pci_ops = { .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; +static const struct vfio_pci_device_ops vfio_pci_dev_ops = { + .get_dmabuf_phys = vfio_pci_core_get_dmabuf_phys, +}; + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_core_device *vdev; @@ -161,6 +166,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); + vdev->pci_ops = &vfio_pci_dev_ops; ret = vfio_pci_core_register_device(vdev); if (ret) goto out_put_vdev; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4..dc4e510e6e1b 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) return pdev->current_state < PCI_D3hot && (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } +EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled); /* * Restore the *real* BARs after we detect a FLR or backdoor reset. @@ -589,10 +590,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); - if (!new_mem) + if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } /* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +630,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -707,12 +712,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) { - if (state >= PCI_D3hot) + if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev); - else + vfio_pci_dma_buf_move(vdev, true); + } else { down_write(&vdev->memory_lock); + } vfio_pci_set_power_state(vdev, state); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } @@ -900,7 +909,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } @@ -982,7 +994,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc..3a11e6f450f7 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,6 +28,7 @@ #include <linux/nospec.h> #include <linux/sched/mm.h> #include <linux/iommufd.h> +#include <linux/pci-p2pdma.h> #if IS_ENABLED(CONFIG_EEH) #include <asm/eeh.h> #endif @@ -41,6 +42,40 @@ static bool nointxmask; static bool disable_vga; static bool disable_idle_d3; +static void vfio_pci_eventfd_rcu_free(struct rcu_head *rcu) +{ + struct vfio_pci_eventfd *eventfd = + container_of(rcu, struct vfio_pci_eventfd, rcu); + + eventfd_ctx_put(eventfd->ctx); + kfree(eventfd); +} + +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx) +{ + struct vfio_pci_eventfd *new = NULL; + struct vfio_pci_eventfd *old; + + lockdep_assert_held(&vdev->igate); + + if (ctx) { + new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; + + new->ctx = ctx; + } + + old = rcu_replace_pointer(*peventfd, new, + lockdep_is_held(&vdev->igate)); + if (old) + call_rcu(&old->rcu, vfio_pci_eventfd_rcu_free); + + return 0; +} + /* List of PF's that vfio_pci_core_sriov_configure() has been called on */ static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); @@ -286,6 +321,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_dma_buf_move(vdev, true); + if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL; @@ -299,11 +336,9 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_entry(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -320,12 +355,10 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, } static int vfio_pci_core_pm_entry_with_wakeup( - struct vfio_device *device, u32 flags, + struct vfio_pci_core_device *vdev, u32 flags, struct vfio_device_low_power_entry_with_wakeup __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); struct vfio_device_low_power_entry_with_wakeup entry; struct eventfd_ctx *efdctx; int ret; @@ -373,14 +406,14 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); } -static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, +static int vfio_pci_core_pm_exit(struct vfio_pci_core_device *vdev, u32 flags, void __user *arg, size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); int ret; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); @@ -695,15 +728,11 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev); + vfio_pci_dma_buf_cleanup(vdev); + mutex_lock(&vdev->igate); - if (vdev->err_trigger) { - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; - } - if (vdev->req_trigger) { - eventfd_ctx_put(vdev->req_trigger); - vdev->req_trigger = NULL; - } + vfio_pci_eventfd_replace_locked(vdev, &vdev->err_trigger, NULL); + vfio_pci_eventfd_replace_locked(vdev, &vdev->req_trigger, NULL); mutex_unlock(&vdev->igate); } EXPORT_SYMBOL_GPL(vfio_pci_core_close_device); @@ -996,42 +1025,36 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } -static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - struct vfio_region_info __user *arg) +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) { - unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { + switch (info->index) { case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; break; } - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info->index]) { + info->flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info->index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, caps); if (ret) return ret; } @@ -1043,9 +1066,9 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, size_t size; u16 cmd; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.flags = 0; - info.size = 0; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->flags = 0; + info->size = 0; if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { /* @@ -1055,16 +1078,17 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report the BAR size, not the ROM size. */ - info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + info->size = pci_resource_len(pdev, + PCI_ROM_RESOURCE); pci_unmap_rom(pdev, io); } vfio_pci_memory_unlock_and_restore(vdev, cmd); } else if (pdev->rom && pdev->romlen) { - info.flags = VFIO_REGION_INFO_FLAG_READ; + info->flags = VFIO_REGION_INFO_FLAG_READ; /* Report BAR size as power of two. */ - info.size = roundup_pow_of_two(pdev->romlen); + info->size = roundup_pow_of_two(pdev->romlen); } break; @@ -1073,10 +1097,10 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, if (!vdev->has_vga) return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = 0xc0000; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; break; default: { @@ -1085,53 +1109,36 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, .header.version = 1 }; - if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + if (info->index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; - info.index = array_index_nospec( - info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + info->index = array_index_nospec( + info->index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); - i = info.index - VFIO_PCI_NUM_REGIONS; + i = info->index - VFIO_PCI_NUM_REGIONS; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = vdev->region[i].size; + info->flags = vdev->region[i].flags; cap_type.type = vdev->region[i].type; cap_type.subtype = vdev->region[i].subtype; - ret = vfio_info_add_capability(&caps, &cap_type.header, + ret = vfio_info_add_capability(caps, &cap_type.header, sizeof(cap_type)); if (ret) return ret; if (vdev->region[i].ops->add_capability) { ret = vdev->region[i].ops->add_capability( - vdev, &vdev->region[i], &caps); + vdev, &vdev->region[i], caps); if (ret) return ret; } } } - - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user(arg + 1, caps.buf, caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(*arg); - } - - kfree(caps.buf); - } - - return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; + return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_ioctl_get_region_info); static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, struct vfio_irq_info __user *arg) @@ -1227,7 +1234,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0); + vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev); + if (__vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); return ret; @@ -1457,8 +1467,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioctl_get_irq_info(vdev, uarg); case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); - case VFIO_DEVICE_GET_REGION_INFO: - return vfio_pci_ioctl_get_region_info(vdev, uarg); case VFIO_DEVICE_IOEVENTFD: return vfio_pci_ioctl_ioeventfd(vdev, uarg); case VFIO_DEVICE_PCI_HOT_RESET: @@ -1473,11 +1481,10 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); -static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - uuid_t __user *arg, size_t argsz) +static int vfio_pci_core_feature_token(struct vfio_pci_core_device *vdev, + u32 flags, uuid_t __user *arg, + size_t argsz) { - struct vfio_pci_core_device *vdev = - container_of(device, struct vfio_pci_core_device, vdev); uuid_t uuid; int ret; @@ -1504,16 +1511,21 @@ static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: - return vfio_pci_core_pm_entry(device, flags, arg, argsz); + return vfio_pci_core_pm_entry(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: - return vfio_pci_core_pm_entry_with_wakeup(device, flags, + return vfio_pci_core_pm_entry_with_wakeup(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: - return vfio_pci_core_pm_exit(device, flags, arg, argsz); + return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: - return vfio_pci_core_feature_token(device, flags, arg, argsz); + return vfio_pci_core_feature_token(vdev, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_DMA_BUF: + return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz); default: return -ENOTTY; } @@ -1640,49 +1652,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, - unsigned int order) +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, + unsigned long pfn, + unsigned int order) { - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_core_device *vdev = vma->vm_private_data; - unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); - unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - unsigned long pfn = vma_to_pfn(vma) + pgoff; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (order && (addr < vma->vm_start || - addr + (PAGE_SIZE << order) > vma->vm_end || - pfn & ((1 << order) - 1))) { - ret = VM_FAULT_FALLBACK; - goto out; - } - - down_read(&vdev->memory_lock); + lockdep_assert_held_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) - goto out_unlock; + return VM_FAULT_SIGBUS; switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn); - break; + return vmf_insert_pfn(vmf->vma, vmf->address, pfn); #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, pfn, false); - break; + return vmf_insert_pfn_pmd(vmf, pfn, false); #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, pfn, false); + return vmf_insert_pfn_pud(vmf, pfn, false); break; #endif default: - ret = VM_FAULT_FALLBACK; + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn); + +static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); } -out_unlock: - up_read(&vdev->memory_lock); -out: dev_dbg_ratelimited(&vdev->pdev->dev, "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", __func__, order, @@ -1749,18 +1761,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } + ret = vfio_pci_core_setup_barmap(vdev, index); + if (ret) + return ret; vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -1800,21 +1803,21 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->req_trigger) { + rcu_read_lock(); + eventfd = rcu_dereference(vdev->req_trigger); + if (eventfd) { if (!(count % 10)) pci_notice_ratelimited(pdev, "Relaying device request to user (#%u)\n", count); - eventfd_signal(vdev->req_trigger); + eventfd_signal(eventfd->ctx); } else if (count == 0) { pci_warn(pdev, "No device request channel registered, blocked until released by user\n"); } - - mutex_unlock(&vdev->igate); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(vfio_pci_core_request); @@ -2085,6 +2088,7 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + int ret; vdev->pdev = to_pci_dev(core_vdev->dev); vdev->irq_type = VFIO_PCI_NUM_IRQS; @@ -2094,6 +2098,10 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); + ret = pcim_p2pdma_init(vdev->pdev); + if (ret && ret != -EOPNOTSUPP) + return ret; + INIT_LIST_HEAD(&vdev->dmabufs); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2227,13 +2235,13 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state) { struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); + struct vfio_pci_eventfd *eventfd; - mutex_lock(&vdev->igate); - - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger); - - mutex_unlock(&vdev->igate); + rcu_read_lock(); + eventfd = rcu_dereference(vdev->err_trigger); + if (eventfd) + eventfd_signal(eventfd->ctx); + rcu_read_unlock(); return PCI_ERS_RESULT_CAN_RECOVER; } @@ -2458,6 +2466,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; } + vfio_pci_dma_buf_move(vdev, true); vfio_pci_zap_bars(vdev); } @@ -2486,8 +2495,11 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, err_undo: list_for_each_entry_from_reverse(vdev, &dev_set->device_list, - vdev.dev_set_list) + vdev.dev_set_list) { + if (vdev->vdev.open_count && __vfio_pci_memory_enabled(vdev)) + vfio_pci_dma_buf_move(vdev, false); up_write(&vdev->memory_lock); + } list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) pm_runtime_put(&vdev->pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..6698f540bdac --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. + */ +#include <linux/dma-buf-mapping.h> +#include <linux/pci-p2pdma.h> +#include <linux/dma-resv.h> + +#include "vfio_pci_priv.h" + +MODULE_IMPORT_NS("DMA_BUF"); + +struct vfio_pci_dma_buf { + struct dma_buf *dmabuf; + struct vfio_pci_core_device *vdev; + struct list_head dmabufs_elm; + size_t size; + struct dma_buf_phys_vec *phys_vec; + struct p2pdma_provider *provider; + u32 nr_ranges; + u8 revoked : 1; +}; + +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + return 0; +} + +static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, + enum dma_data_direction dir) +{ + struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv; + + dma_resv_assert_held(priv->dmabuf->resv); + + if (priv->revoked) + return ERR_PTR(-ENODEV); + + return dma_buf_phys_vec_to_sgt(attachment, priv->provider, + priv->phys_vec, priv->nr_ranges, + priv->size, dir); +} + +static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, + struct sg_table *sgt, + enum dma_data_direction dir) +{ + dma_buf_free_sgt(attachment, sgt, dir); +} + +static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + /* + * Either this or vfio_pci_dma_buf_cleanup() will remove from the list. + * The refcount prevents both. + */ + if (priv->vdev) { + down_write(&priv->vdev->memory_lock); + list_del_init(&priv->dmabufs_elm); + up_write(&priv->vdev->memory_lock); + vfio_device_put_registration(&priv->vdev->vdev); + } + kfree(priv->phys_vec); + kfree(priv); +} + +static const struct dma_buf_ops vfio_pci_dmabuf_ops = { + .attach = vfio_pci_dma_buf_attach, + .map_dma_buf = vfio_pci_dma_buf_map, + .unmap_dma_buf = vfio_pci_dma_buf_unmap, + .release = vfio_pci_dma_buf_release, +}; + +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + phys_addr_t max_addr; + unsigned int i; + + max_addr = start + len; + for (i = 0; i < nr_ranges; i++) { + phys_addr_t end; + + if (!dma_ranges[i].length) + return -EINVAL; + + if (check_add_overflow(start, dma_ranges[i].offset, + &phys_vec[i].paddr) || + check_add_overflow(phys_vec[i].paddr, + dma_ranges[i].length, &end)) + return -EOVERFLOW; + if (end > max_addr) + return -EINVAL; + + phys_vec[i].len = dma_ranges[i].length; + } + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec); + +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges) +{ + struct pci_dev *pdev = vdev->pdev; + + *provider = pcim_p2pdma_provider(pdev, region_index); + if (!*provider) + return -EINVAL; + + return vfio_pci_core_fill_phys_vec( + phys_vec, dma_ranges, nr_ranges, + pci_resource_start(pdev, region_index), + pci_resource_len(pdev, region_index)); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys); + +static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf, + struct vfio_region_dma_range *dma_ranges, + size_t *lengthp) +{ + size_t length = 0; + u32 i; + + for (i = 0; i < dma_buf->nr_ranges; i++) { + u64 offset = dma_ranges[i].offset; + u64 len = dma_ranges[i].length; + + if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (check_add_overflow(length, len, &length)) + return -EINVAL; + } + + /* + * dma_iova_try_alloc() will WARN on if userspace proposes a size that + * is too big, eg with lots of ranges. + */ + if ((u64)(length) & DMA_IOVA_USE_SWIOTLB) + return -EINVAL; + + *lengthp = length; + return 0; +} + +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + struct vfio_device_feature_dma_buf get_dma_buf = {}; + struct vfio_region_dma_range *dma_ranges; + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct vfio_pci_dma_buf *priv; + size_t length; + int ret; + + if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys) + return -EOPNOTSUPP; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET, + sizeof(get_dma_buf)); + if (ret != 1) + return ret; + + if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf))) + return -EFAULT; + + if (!get_dma_buf.nr_ranges || get_dma_buf.flags) + return -EINVAL; + + /* + * For PCI the region_index is the BAR number like everything else. + */ + if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX) + return -ENODEV; + + dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, + sizeof(*dma_ranges)); + if (IS_ERR(dma_ranges)) + return PTR_ERR(dma_ranges); + + ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length); + if (ret) + goto err_free_ranges; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + goto err_free_ranges; + } + priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec), + GFP_KERNEL); + if (!priv->phys_vec) { + ret = -ENOMEM; + goto err_free_priv; + } + + priv->vdev = vdev; + priv->nr_ranges = get_dma_buf.nr_ranges; + priv->size = length; + ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider, + get_dma_buf.region_index, + priv->phys_vec, dma_ranges, + priv->nr_ranges); + if (ret) + goto err_free_phys; + + kfree(dma_ranges); + dma_ranges = NULL; + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + ret = -ENODEV; + goto err_free_phys; + } + + exp_info.ops = &vfio_pci_dmabuf_ops; + exp_info.size = priv->size; + exp_info.flags = get_dma_buf.open_flags; + exp_info.priv = priv; + + priv->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(priv->dmabuf)) { + ret = PTR_ERR(priv->dmabuf); + goto err_dev_put; + } + + /* dma_buf_put() now frees priv */ + INIT_LIST_HEAD(&priv->dmabufs_elm); + down_write(&vdev->memory_lock); + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = !__vfio_pci_memory_enabled(vdev); + list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs); + dma_resv_unlock(priv->dmabuf->resv); + up_write(&vdev->memory_lock); + + /* + * dma_buf_fd() consumes the reference, when the file closes the dmabuf + * will be released. + */ + ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags); + if (ret < 0) + goto err_dma_buf; + return ret; + +err_dma_buf: + dma_buf_put(priv->dmabuf); +err_dev_put: + vfio_device_put_registration(&vdev->vdev); +err_free_phys: + kfree(priv->phys_vec); +err_free_priv: + kfree(priv); +err_free_ranges: + kfree(dma_ranges); + return ret; +} + +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + lockdep_assert_held_write(&vdev->memory_lock); + + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + if (priv->revoked != revoked) { + dma_resv_lock(priv->dmabuf->resv, NULL); + priv->revoked = revoked; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + } + fput(priv->dmabuf->file); + } +} + +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ + struct vfio_pci_dma_buf *priv; + struct vfio_pci_dma_buf *tmp; + + down_write(&vdev->memory_lock); + list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) { + if (!get_file_active(&priv->dmabuf->file)) + continue; + + dma_resv_lock(priv->dmabuf->resv, NULL); + list_del_init(&priv->dmabufs_elm); + priv->vdev = NULL; + priv->revoked = true; + dma_buf_move_notify(priv->dmabuf); + dma_resv_unlock(priv->dmabuf->resv); + vfio_device_put_registration(&vdev->vdev); + fput(priv->dmabuf->file); + } + up_write(&vdev->memory_lock); +} diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 30d3e921cb0d..c76e753b3cec 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -731,21 +731,27 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, return 0; } -static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, +static int vfio_pci_set_ctx_trigger_single(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, unsigned int count, uint32_t flags, void *data) { /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (*ctx) { - if (count) { - eventfd_signal(*ctx); - } else { - eventfd_ctx_put(*ctx); - *ctx = NULL; - } + struct vfio_pci_eventfd *eventfd; + + eventfd = rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (!eventfd) + return -EINVAL; + + if (count) { + eventfd_signal(eventfd->ctx); return 0; } + + return vfio_pci_eventfd_replace_locked(vdev, peventfd, NULL); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger; @@ -753,8 +759,15 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, return -EINVAL; trigger = *(uint8_t *)data; - if (trigger && *ctx) - eventfd_signal(*ctx); + + if (trigger) { + struct vfio_pci_eventfd *eventfd = + rcu_dereference_protected(*peventfd, + lockdep_is_held(&vdev->igate)); + + if (eventfd) + eventfd_signal(eventfd->ctx); + } return 0; } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { @@ -765,22 +778,23 @@ static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, fd = *(int32_t *)data; if (fd == -1) { - if (*ctx) - eventfd_ctx_put(*ctx); - *ctx = NULL; + return vfio_pci_eventfd_replace_locked(vdev, + peventfd, NULL); } else if (fd >= 0) { struct eventfd_ctx *efdctx; + int ret; efdctx = eventfd_ctx_fdget(fd); if (IS_ERR(efdctx)) return PTR_ERR(efdctx); - if (*ctx) - eventfd_ctx_put(*ctx); + ret = vfio_pci_eventfd_replace_locked(vdev, + peventfd, efdctx); + if (ret) + eventfd_ctx_put(efdctx); - *ctx = efdctx; + return ret; } - return 0; } return -EINVAL; @@ -793,7 +807,7 @@ static int vfio_pci_set_err_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_ERR_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->err_trigger, count, flags, data); } @@ -804,7 +818,7 @@ static int vfio_pci_set_req_trigger(struct vfio_pci_core_device *vdev, if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count > 1) return -EINVAL; - return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, + return vfio_pci_set_ctx_trigger_single(vdev, &vdev->req_trigger, count, flags, data); } diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb293..27ac280f00b9 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -26,6 +26,10 @@ struct vfio_pci_ioeventfd { bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); +int vfio_pci_eventfd_replace_locked(struct vfio_pci_core_device *vdev, + struct vfio_pci_eventfd __rcu **peventfd, + struct eventfd_ctx *ctx); + int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data); @@ -60,7 +64,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev); int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, @@ -107,4 +110,27 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; } +#ifdef CONFIG_VFIO_PCI_DMABUF +int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz); +void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev); +void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked); +#else +static inline int +vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, + struct vfio_device_feature_dma_buf __user *arg, + size_t argsz) +{ + return -ENOTTY; +} +static inline void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) +{ +} +static inline void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, + bool revoked) +{ +} +#endif + #endif diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h index c7d7e27af386..cb3d5e57d3a3 100644 --- a/drivers/vfio/pci/virtio/common.h +++ b/drivers/vfio/pci/virtio/common.h @@ -109,10 +109,9 @@ void virtiovf_migration_reset_done(struct pci_dev *pdev); #ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 832af5ba267c..1ed349a55629 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -281,41 +281,19 @@ ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user } int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) + struct vfio_region_info *info, + struct vfio_info_cap *caps) { struct virtiovf_pci_core_device *virtvdev = container_of( core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; - struct vfio_region_info info = {}; - if (copy_from_user(&info, uarg, minsz)) - return -EFAULT; + if (info->index != VFIO_PCI_BAR0_REGION_INDEX) + return vfio_pci_ioctl_get_region_info(core_vdev, info, caps); - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } + info->offset = VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = virtvdev->bar0_virtual_buf_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; + return 0; } static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 8084f3e36a9f..d2e5cbca13c8 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -88,6 +88,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, @@ -108,7 +109,8 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, .close_device = virtiovf_pci_close_device, - .ioctl = virtiovf_vfio_pci_core_ioctl, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = virtiovf_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, .write = virtiovf_pci_core_write, @@ -130,6 +132,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .open_device = virtiovf_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = vfio_pci_ioctl_get_region_info, .device_feature = vfio_pci_core_ioctl_feature, .read = vfio_pci_core_read, .write = vfio_pci_core_write, diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 9f5c527baa8a..fa754f203b2d 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -115,6 +115,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 512533501eb7..a4d3ace3e02d 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -101,6 +101,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .open_device = vfio_platform_open_device, .close_device = vfio_platform_close_device, .ioctl = vfio_platform_ioctl, + .get_region_info_caps = vfio_platform_ioctl_get_region_info, .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 3bf1043cd795..c2990b7e900f 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -272,6 +272,24 @@ err_irq: } EXPORT_SYMBOL_GPL(vfio_platform_open_device); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + if (info->index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info->offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info->index); + info->size = vdev->regions[info->index].size; + info->flags = vdev->regions[info->index].flags; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_ioctl_get_region_info); + long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -300,28 +318,6 @@ long vfio_platform_ioctl(struct vfio_device *core_vdev, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { - struct vfio_region_info info; - - minsz = offsetofend(struct vfio_region_info, offset); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - if (info.index >= vdev->num_regions) - return -EINVAL; - - /* map offset to the physical address */ - info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); - info.size = vdev->regions[info.index].size; - info.flags = vdev->regions[info.index].flags; - - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 8d8fab516849..05084212a76e 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -85,6 +85,9 @@ int vfio_platform_open_device(struct vfio_device *core_vdev); void vfio_platform_close_device(struct vfio_device *core_vdev); long vfio_platform_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); +int vfio_platform_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_platform_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 38c8e9350a60..f7df90c423b4 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -172,11 +172,13 @@ void vfio_device_put_registration(struct vfio_device *device) if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } +EXPORT_SYMBOL_GPL(vfio_device_put_registration); bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } +EXPORT_SYMBOL_GPL(vfio_device_try_get_registration); /* * VFIO driver API @@ -1259,6 +1261,51 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, } } +static long vfio_get_region_info(struct vfio_device *device, + struct vfio_region_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_region_info info = {}; + struct vfio_info_cap caps = {}; + int ret; + + if (unlikely(!device->ops->get_region_info_caps)) + return -EINVAL; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + ret = device->ops->get_region_info_caps(device, &info, &caps); + if (ret) + goto out_free; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user(arg + 1, caps.buf, caps.size)) { + ret = -EFAULT; + goto out_free; + } + info.cap_offset = sizeof(info); + } + } + + if (copy_to_user(arg, &info, minsz)){ + ret = -EFAULT; + goto out_free; + } + +out_free: + kfree(caps.buf); + return ret; +} + static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -1296,6 +1343,10 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = vfio_ioctl_device_feature(device, uptr); break; + case VFIO_DEVICE_GET_REGION_INFO: + ret = vfio_get_region_info(device, uptr); + break; + default: if (unlikely(!device->ops->ioctl)) ret = -EINVAL; |
