summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/debugfs-vfio6
-rw-r--r--drivers/vfio/debugfs.c19
-rw-r--r--drivers/vfio/vfio_iommu_type1.c112
-rw-r--r--include/linux/mm.h7
-rw-r--r--include/linux/mm_inline.h36
5 files changed, 158 insertions, 22 deletions
diff --git a/Documentation/ABI/testing/debugfs-vfio b/Documentation/ABI/testing/debugfs-vfio
index 90f7c262f591..70ec2d454686 100644
--- a/Documentation/ABI/testing/debugfs-vfio
+++ b/Documentation/ABI/testing/debugfs-vfio
@@ -23,3 +23,9 @@ Contact: Longfang Liu <liulongfang@huawei.com>
Description: Read the live migration status of the vfio device.
The contents of the state file reflects the migration state
relative to those defined in the vfio_device_mig_state enum
+
+What: /sys/kernel/debug/vfio/<device>/migration/features
+Date: Oct 2025
+KernelVersion: 6.18
+Contact: Cédric Le Goater <clg@redhat.com>
+Description: Read the migration features of the vfio device.
diff --git a/drivers/vfio/debugfs.c b/drivers/vfio/debugfs.c
index 298bd866f157..8b0ca7a09064 100644
--- a/drivers/vfio/debugfs.c
+++ b/drivers/vfio/debugfs.c
@@ -58,6 +58,23 @@ static int vfio_device_state_read(struct seq_file *seq, void *data)
return 0;
}
+static int vfio_device_features_read(struct seq_file *seq, void *data)
+{
+ struct device *vf_dev = seq->private;
+ struct vfio_device *vdev = container_of(vf_dev, struct vfio_device, device);
+
+ if (vdev->migration_flags & VFIO_MIGRATION_STOP_COPY)
+ seq_puts(seq, "stop-copy\n");
+ if (vdev->migration_flags & VFIO_MIGRATION_P2P)
+ seq_puts(seq, "p2p\n");
+ if (vdev->migration_flags & VFIO_MIGRATION_PRE_COPY)
+ seq_puts(seq, "pre-copy\n");
+ if (vdev->log_ops)
+ seq_puts(seq, "dirty-tracking\n");
+
+ return 0;
+}
+
void vfio_device_debugfs_init(struct vfio_device *vdev)
{
struct device *dev = &vdev->device;
@@ -72,6 +89,8 @@ void vfio_device_debugfs_init(struct vfio_device *vdev)
vdev->debug_root);
debugfs_create_devm_seqfile(dev, "state", vfio_dev_migration,
vfio_device_state_read);
+ debugfs_create_devm_seqfile(dev, "features", vfio_dev_migration,
+ vfio_device_features_read);
}
}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index f8d68fe77b41..916cad80941c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@
#include <linux/vfio.h>
#include <linux/workqueue.h>
#include <linux/notifier.h>
+#include <linux/mm_inline.h>
#include "vfio.h"
#define DRIVER_VERSION "0.2"
@@ -92,6 +93,7 @@ struct vfio_dma {
bool iommu_mapped;
bool lock_cap; /* capable(CAP_IPC_LOCK) */
bool vaddr_invalid;
+ bool has_rsvd; /* has 1 or more rsvd pfns */
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
unsigned long *bitmap;
@@ -318,7 +320,13 @@ static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
/*
* Helper Functions for host iova-pfn list
*/
-static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+
+/*
+ * Find the highest vfio_pfn that overlapping the range
+ * [iova_start, iova_end) in rb tree.
+ */
+static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma,
+ dma_addr_t iova_start, dma_addr_t iova_end)
{
struct vfio_pfn *vpfn;
struct rb_node *node = dma->pfn_list.rb_node;
@@ -326,9 +334,9 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
while (node) {
vpfn = rb_entry(node, struct vfio_pfn, node);
- if (iova < vpfn->iova)
+ if (iova_end <= vpfn->iova)
node = node->rb_left;
- else if (iova > vpfn->iova)
+ else if (iova_start > vpfn->iova)
node = node->rb_right;
else
return vpfn;
@@ -336,6 +344,11 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
return NULL;
}
+static inline struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+{
+ return vfio_find_vpfn_range(dma, iova, iova + 1);
+}
+
static void vfio_link_pfn(struct vfio_dma *dma,
struct vfio_pfn *new)
{
@@ -614,6 +627,39 @@ done:
return ret;
}
+
+static long vpfn_pages(struct vfio_dma *dma,
+ dma_addr_t iova_start, long nr_pages)
+{
+ dma_addr_t iova_end = iova_start + (nr_pages << PAGE_SHIFT);
+ struct vfio_pfn *top = vfio_find_vpfn_range(dma, iova_start, iova_end);
+ long ret = 1;
+ struct vfio_pfn *vpfn;
+ struct rb_node *prev;
+ struct rb_node *next;
+
+ if (likely(!top))
+ return 0;
+
+ prev = next = &top->node;
+
+ while ((prev = rb_prev(prev))) {
+ vpfn = rb_entry(prev, struct vfio_pfn, node);
+ if (vpfn->iova < iova_start)
+ break;
+ ret++;
+ }
+
+ while ((next = rb_next(next))) {
+ vpfn = rb_entry(next, struct vfio_pfn, node);
+ if (vpfn->iova >= iova_end)
+ break;
+ ret++;
+ }
+
+ return ret;
+}
+
/*
* Attempt to pin pages. We really don't want to track all the pfns and
* the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -687,32 +733,47 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
* and rsvd here, and therefore continues to use the batch.
*/
while (true) {
+ long nr_pages, acct_pages = 0;
+
if (pfn != *pfn_base + pinned ||
rsvd != is_invalid_reserved_pfn(pfn))
goto out;
/*
+ * Using GUP with the FOLL_LONGTERM in
+ * vaddr_get_pfns() will not return invalid
+ * or reserved pages.
+ */
+ nr_pages = num_pages_contiguous(
+ &batch->pages[batch->offset],
+ batch->size);
+ if (!rsvd) {
+ acct_pages = nr_pages;
+ acct_pages -= vpfn_pages(dma, iova, nr_pages);
+ }
+
+ /*
* Reserved pages aren't counted against the user,
* externally pinned pages are already counted against
* the user.
*/
- if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+ if (acct_pages) {
if (!dma->lock_cap &&
- mm->locked_vm + lock_acct + 1 > limit) {
+ mm->locked_vm + lock_acct + acct_pages > limit) {
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
ret = -ENOMEM;
goto unpin_out;
}
- lock_acct++;
+ lock_acct += acct_pages;
}
- pinned++;
- npage--;
- vaddr += PAGE_SIZE;
- iova += PAGE_SIZE;
- batch->offset++;
- batch->size--;
+ pinned += nr_pages;
+ npage -= nr_pages;
+ vaddr += PAGE_SIZE * nr_pages;
+ iova += PAGE_SIZE * nr_pages;
+ batch->offset += nr_pages;
+ batch->size -= nr_pages;
if (!batch->size)
break;
@@ -722,6 +783,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
}
out:
+ dma->has_rsvd |= rsvd;
ret = vfio_lock_acct(dma, lock_acct, false);
unpin_out:
@@ -738,21 +800,29 @@ unpin_out:
return pinned;
}
+static inline void put_valid_unreserved_pfns(unsigned long start_pfn,
+ unsigned long npage, int prot)
+{
+ unpin_user_page_range_dirty_lock(pfn_to_page(start_pfn), npage,
+ prot & IOMMU_WRITE);
+}
+
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
unsigned long pfn, unsigned long npage,
bool do_accounting)
{
- long unlocked = 0, locked = 0;
- long i;
+ long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
- for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
- if (put_pfn(pfn++, dma->prot)) {
- unlocked++;
- if (vfio_find_vpfn(dma, iova))
- locked++;
- }
- }
+ if (dma->has_rsvd) {
+ unsigned long i;
+ for (i = 0; i < npage; i++)
+ if (put_pfn(pfn++, dma->prot))
+ unlocked++;
+ } else {
+ put_valid_unreserved_pfns(pfn, npage, dma->prot);
+ unlocked = npage;
+ }
if (do_accounting)
vfio_lock_acct(dma, locked - unlocked, true);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 06978b4dbeb8..f092ce3530bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1833,7 +1833,12 @@ static inline unsigned long memdesc_section(memdesc_flags_t mdf)
{
return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
-#endif
+#else /* !SECTION_IN_PAGE_FLAGS */
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
+{
+ return 0;
+}
+#endif /* SECTION_IN_PAGE_FLAGS */
/**
* folio_pfn - Return the Page Frame Number of a folio.
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index d6c1011b38f2..f6a2b2d20016 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -617,4 +617,40 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma)
return true;
}
+/**
+ * num_pages_contiguous() - determine the number of contiguous pages
+ * that represent contiguous PFNs
+ * @pages: an array of page pointers
+ * @nr_pages: length of the array, at least 1
+ *
+ * Determine the number of contiguous pages that represent contiguous PFNs
+ * in @pages, starting from the first page.
+ *
+ * In some kernel configs contiguous PFNs will not have contiguous struct
+ * pages. In these configurations num_pages_contiguous() will return a num
+ * smaller than ideal number. The caller should continue to check for pfn
+ * contiguity after each call to num_pages_contiguous().
+ *
+ * Returns the number of contiguous pages.
+ */
+static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages)
+{
+ struct page *cur_page = pages[0];
+ unsigned long section = memdesc_section(cur_page->flags);
+ size_t i;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (++cur_page != pages[i])
+ break;
+ /*
+ * In unproblematic kernel configs, page_to_section() == 0 and
+ * the whole check will get optimized out.
+ */
+ if (memdesc_section(cur_page->flags) != section)
+ break;
+ }
+
+ return i;
+}
+
#endif