-rw-r--r--   Documentation/ABI/testing/debugfs-vfio   |   6
-rw-r--r--   drivers/vfio/debugfs.c                   |  19
-rw-r--r--   drivers/vfio/vfio_iommu_type1.c          | 112
-rw-r--r--   include/linux/mm.h                       |   7
-rw-r--r--   include/linux/mm_inline.h                |  36
5 files changed, 158 insertions, 22 deletions
diff --git a/Documentation/ABI/testing/debugfs-vfio b/Documentation/ABI/testing/debugfs-vfio
index 90f7c262f591..70ec2d454686 100644
--- a/Documentation/ABI/testing/debugfs-vfio
+++ b/Documentation/ABI/testing/debugfs-vfio
@@ -23,3 +23,9 @@ Contact:        Longfang Liu <liulongfang@huawei.com>
 Description:    Read the live migration status of the vfio device.
                 The contents of the state file reflects the migration state
                 relative to those defined in the vfio_device_mig_state enum
+
+What:           /sys/kernel/debug/vfio/<device>/migration/features
+Date:           Oct 2025
+KernelVersion:  6.18
+Contact:        Cédric Le Goater <clg@redhat.com>
+Description:    Read the migration features of the vfio device.
diff --git a/drivers/vfio/debugfs.c b/drivers/vfio/debugfs.c
index 298bd866f157..8b0ca7a09064 100644
--- a/drivers/vfio/debugfs.c
+++ b/drivers/vfio/debugfs.c
@@ -58,6 +58,23 @@ static int vfio_device_state_read(struct seq_file *seq, void *data)
         return 0;
 }
 
+static int vfio_device_features_read(struct seq_file *seq, void *data)
+{
+        struct device *vf_dev = seq->private;
+        struct vfio_device *vdev = container_of(vf_dev, struct vfio_device, device);
+
+        if (vdev->migration_flags & VFIO_MIGRATION_STOP_COPY)
+                seq_puts(seq, "stop-copy\n");
+        if (vdev->migration_flags & VFIO_MIGRATION_P2P)
+                seq_puts(seq, "p2p\n");
+        if (vdev->migration_flags & VFIO_MIGRATION_PRE_COPY)
+                seq_puts(seq, "pre-copy\n");
+        if (vdev->log_ops)
+                seq_puts(seq, "dirty-tracking\n");
+
+        return 0;
+}
+
 void vfio_device_debugfs_init(struct vfio_device *vdev)
 {
         struct device *dev = &vdev->device;
@@ -72,6 +89,8 @@ void vfio_device_debugfs_init(struct vfio_device *vdev)
                                     vdev->debug_root);
                 debugfs_create_devm_seqfile(dev, "state", vfio_dev_migration,
                                             vfio_device_state_read);
+                debugfs_create_devm_seqfile(dev, "features", vfio_dev_migration,
+                                            vfio_device_features_read);
         }
 }
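For illustration only, a minimal userspace sketch of reading the new features
file; the device address below is a hypothetical placeholder, and the exact
set of feature strings depends on what the driver advertises:

/*
 * Illustrative only: dump the migration features a device reports via
 * the new debugfs file. The device address is a made-up example.
 */
#include <stdio.h>

int main(void)
{
        const char *path =
                "/sys/kernel/debug/vfio/0000:3b:00.1/migration/features";
        char line[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                printf("feature: %s", line);    /* e.g. "stop-copy", "p2p" */
        fclose(f);
        return 0;
}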
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index f8d68fe77b41..916cad80941c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
 #include <linux/notifier.h>
+#include <linux/mm_inline.h>
 #include "vfio.h"
 
 #define DRIVER_VERSION  "0.2"
@@ -92,6 +93,7 @@ struct vfio_dma {
         bool                    iommu_mapped;
         bool                    lock_cap;       /* capable(CAP_IPC_LOCK) */
         bool                    vaddr_invalid;
+        bool                    has_rsvd;       /* has 1 or more rsvd pfns */
         struct task_struct      *task;
         struct rb_root          pfn_list;       /* Ex-user pinned pfn list */
         unsigned long           *bitmap;
@@ -318,7 +320,13 @@ static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
 /*
  * Helper Functions for host iova-pfn list
  */
-static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+
+/*
+ * Find the highest vfio_pfn overlapping the range
+ * [iova_start, iova_end) in the rb tree.
+ */
+static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma,
+                dma_addr_t iova_start, dma_addr_t iova_end)
 {
         struct vfio_pfn *vpfn;
         struct rb_node *node = dma->pfn_list.rb_node;
@@ -326,9 +334,9 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
         while (node) {
                 vpfn = rb_entry(node, struct vfio_pfn, node);
 
-                if (iova < vpfn->iova)
+                if (iova_end <= vpfn->iova)
                         node = node->rb_left;
-                else if (iova > vpfn->iova)
+                else if (iova_start > vpfn->iova)
                         node = node->rb_right;
                 else
                         return vpfn;
@@ -336,6 +344,11 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
         return NULL;
 }
 
+static inline struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
+{
+        return vfio_find_vpfn_range(dma, iova, iova + 1);
+}
+
 static void vfio_link_pfn(struct vfio_dma *dma,
                           struct vfio_pfn *new)
 {
@@ -614,6 +627,39 @@ done:
         return ret;
 }
 
+static long vpfn_pages(struct vfio_dma *dma,
+                dma_addr_t iova_start, long nr_pages)
+{
+        dma_addr_t iova_end = iova_start + (nr_pages << PAGE_SHIFT);
+        struct vfio_pfn *top = vfio_find_vpfn_range(dma, iova_start, iova_end);
+        long ret = 1;
+        struct vfio_pfn *vpfn;
+        struct rb_node *prev;
+        struct rb_node *next;
+
+        if (likely(!top))
+                return 0;
+
+        prev = next = &top->node;
+
+        while ((prev = rb_prev(prev))) {
+                vpfn = rb_entry(prev, struct vfio_pfn, node);
+                if (vpfn->iova < iova_start)
+                        break;
+                ret++;
+        }
+
+        while ((next = rb_next(next))) {
+                vpfn = rb_entry(next, struct vfio_pfn, node);
+                if (vpfn->iova >= iova_end)
+                        break;
+                ret++;
+        }
+
+        return ret;
+}
+
 /*
  * Attempt to pin pages. We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
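As a rough model of what vpfn_pages() computes, the sketch below replaces the
kernel's rb-tree with a plain sorted array and counts externally pinned iovas
that fall inside the queried range; model_vpfn_pages() and its types are
stand-ins, not kernel code:

/*
 * Model of vpfn_pages() semantics using a sorted iova array instead of
 * the rb-tree; kernel types are stubbed out for illustration.
 */
#include <stddef.h>
#include <stdio.h>

#define PAGE_SHIFT 12

typedef unsigned long long dma_addr_t;

static long model_vpfn_pages(const dma_addr_t *vpfn_iovas, size_t n,
                             dma_addr_t iova_start, long nr_pages)
{
        dma_addr_t iova_end = iova_start + ((dma_addr_t)nr_pages << PAGE_SHIFT);
        long count = 0;
        size_t i;

        for (i = 0; i < n; i++)
                if (vpfn_iovas[i] >= iova_start && vpfn_iovas[i] < iova_end)
                        count++;
        return count;
}

int main(void)
{
        /* Two externally pinned pages inside a 4-page window at 0x1000. */
        dma_addr_t pinned[] = { 0x2000, 0x4000, 0x9000 };

        printf("%ld\n", model_vpfn_pages(pinned, 3, 0x1000, 4)); /* prints 2 */
        return 0;
}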
@@ -687,32 +733,47 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
                  * and rsvd here, and therefore continues to use the batch.
                  */
                 while (true) {
+                        long nr_pages, acct_pages = 0;
+
                         if (pfn != *pfn_base + pinned ||
                             rsvd != is_invalid_reserved_pfn(pfn))
                                 goto out;
 
                         /*
+                         * Using GUP with the FOLL_LONGTERM in
+                         * vaddr_get_pfns() will not return invalid
+                         * or reserved pages.
+                         */
+                        nr_pages = num_pages_contiguous(
+                                        &batch->pages[batch->offset],
+                                        batch->size);
+                        if (!rsvd) {
+                                acct_pages = nr_pages;
+                                acct_pages -= vpfn_pages(dma, iova, nr_pages);
+                        }
+
+                        /*
                          * Reserved pages aren't counted against the user,
                          * externally pinned pages are already counted against
                          * the user.
                          */
-                        if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+                        if (acct_pages) {
                                 if (!dma->lock_cap &&
-                                    mm->locked_vm + lock_acct + 1 > limit) {
+                                    mm->locked_vm + lock_acct + acct_pages > limit) {
                                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
                                                 __func__, limit << PAGE_SHIFT);
                                         ret = -ENOMEM;
                                         goto unpin_out;
                                 }
-                                lock_acct++;
+                                lock_acct += acct_pages;
                         }
 
-                        pinned++;
-                        npage--;
-                        vaddr += PAGE_SIZE;
-                        iova += PAGE_SIZE;
-                        batch->offset++;
-                        batch->size--;
+                        pinned += nr_pages;
+                        npage -= nr_pages;
+                        vaddr += PAGE_SIZE * nr_pages;
+                        iova += PAGE_SIZE * nr_pages;
+                        batch->offset += nr_pages;
+                        batch->size -= nr_pages;
 
                         if (!batch->size)
                                 break;
@@ -722,6 +783,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
         }
 
 out:
+        dma->has_rsvd |= rsvd;
         ret = vfio_lock_acct(dma, lock_acct, false);
 
 unpin_out:
@@ -738,21 +800,29 @@ unpin_out:
         return pinned;
 }
 
+static inline void put_valid_unreserved_pfns(unsigned long start_pfn,
+                unsigned long npage, int prot)
+{
+        unpin_user_page_range_dirty_lock(pfn_to_page(start_pfn), npage,
+                                         prot & IOMMU_WRITE);
+}
+
 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
                                     unsigned long pfn, unsigned long npage,
                                     bool do_accounting)
 {
-        long unlocked = 0, locked = 0;
-        long i;
+        long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
 
-        for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
-                if (put_pfn(pfn++, dma->prot)) {
-                        unlocked++;
-                        if (vfio_find_vpfn(dma, iova))
-                                locked++;
-                }
-        }
+        if (dma->has_rsvd) {
+                unsigned long i;
+
+                for (i = 0; i < npage; i++)
+                        if (put_pfn(pfn++, dma->prot))
+                                unlocked++;
+        } else {
+                put_valid_unreserved_pfns(pfn, npage, dma->prot);
+                unlocked = npage;
+        }
 
         if (do_accounting)
                 vfio_lock_acct(dma, locked - unlocked, true);
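A small worked example of the batched accounting above (all numbers are
invented): a contiguous run of 8 pages, 2 of which are already externally
pinned, triggers a single RLIMIT_MEMLOCK check for 6 pages instead of eight
per-page checks:

/*
 * Worked example of the batched RLIMIT_MEMLOCK check; all values are
 * hypothetical and the variables are stand-ins for the kernel's.
 */
#include <stdio.h>

int main(void)
{
        long limit = 16384;             /* RLIMIT_MEMLOCK in pages */
        long locked_vm = 16380;         /* pages already accounted */
        long lock_acct = 0;
        long nr_pages = 8;              /* contiguous run from the batch */
        long already_pinned = 2;        /* what vpfn_pages() would return */
        long acct_pages = nr_pages - already_pinned;

        if (locked_vm + lock_acct + acct_pages > limit)
                printf("RLIMIT_MEMLOCK exceeded\n"); /* 16380 + 6 > 16384 */
        else
                printf("accounted %ld pages\n", acct_pages);
        return 0;
}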
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 06978b4dbeb8..f092ce3530bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1833,7 +1833,12 @@ static inline unsigned long memdesc_section(memdesc_flags_t mdf)
 {
         return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
-#endif
+#else /* !SECTION_IN_PAGE_FLAGS */
+static inline unsigned long memdesc_section(memdesc_flags_t mdf)
+{
+        return 0;
+}
+#endif /* SECTION_IN_PAGE_FLAGS */
 
 /**
  * folio_pfn - Return the Page Frame Number of a folio.
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index d6c1011b38f2..f6a2b2d20016 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -617,4 +617,40 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma)
         return true;
 }
 
+/**
+ * num_pages_contiguous() - determine the number of contiguous pages
+ *                          that represent contiguous PFNs
+ * @pages: an array of page pointers
+ * @nr_pages: length of the array, at least 1
+ *
+ * Determine the number of contiguous pages that represent contiguous PFNs
+ * in @pages, starting from the first page.
+ *
+ * In some kernel configs contiguous PFNs will not have contiguous struct
+ * pages. In these configurations num_pages_contiguous() will return a
+ * number smaller than the ideal one. The caller should continue to check
+ * for pfn contiguity after each call to num_pages_contiguous().
+ *
+ * Returns the number of contiguous pages.
+ */
+static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages)
+{
+        struct page *cur_page = pages[0];
+        unsigned long section = memdesc_section(cur_page->flags);
+        size_t i;
+
+        for (i = 1; i < nr_pages; i++) {
+                if (++cur_page != pages[i])
+                        break;
+                /*
+                 * In unproblematic kernel configs, page_to_section() == 0 and
+                 * the whole check will get optimized out.
+                 */
+                if (memdesc_section(cur_page->flags) != section)
+                        break;
+        }
+
+        return i;
+}
+
 #endif
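A hypothetical caller sketch showing the pattern the kernel-doc above
recommends: consume the array in runs of contiguous PFNs, calling
num_pages_contiguous() again after each run. This is a kernel-context
fragment, not a complete file, and process_run() is a made-up stand-in for
whatever per-run work a real caller would do:

/* Hypothetical caller: process a pinned page array run by run. */
#include <linux/mm_inline.h>

static void process_run(struct page *first, size_t nr); /* made-up helper */

static void process_pages(struct page **pages, size_t nr_pages)
{
        size_t done = 0;

        while (done < nr_pages) {
                size_t run = num_pages_contiguous(&pages[done],
                                                  nr_pages - done);

                /* pages[done .. done + run - 1] map contiguous PFNs. */
                process_run(pages[done], run);
                done += run;
        }
}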
