From 929bf010e0599ddef6b640cd314f1de65dd1ca3e Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:10 +0800 Subject: [PATCH 1/6] mm: introduce num_pages_contiguous() Let's add a simple helper for determining the number of contiguous pages that represent contiguous PFNs. In an ideal world, this helper would be simpler or not even required. Unfortunately, on some configs we still have to maintain (SPARSEMEM without VMEMMAP), the memmap is allocated per memory section, and we might run into weird corner cases of false positives when blindly testing for contiguous pages only. One example of such false positives would be a memory section-sized hole that does not have a memmap. The surrounding memory sections might get "struct pages" that are contiguous, but the PFNs are actually not. This helper will, for example, be useful for determining contiguous PFNs in a GUP result, to batch further operations across returned "struct page"s. VFIO will utilize this interface to accelerate the VFIO DMA map process. Implementation based on Linus' suggestions to avoid new usage of nth_page() where avoidable. 
Suggested-by: Linus Torvalds Suggested-by: Jason Gunthorpe Signed-off-by: Li Zhe Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20250814064714.56485-2-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- include/linux/mm.h | 7 ++++++- include/linux/mm_inline.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06978b4dbeb8..f092ce3530bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1833,7 +1833,12 @@ static inline unsigned long memdesc_section(memdesc_flags_t mdf) { return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } -#endif +#else /* !SECTION_IN_PAGE_FLAGS */ +static inline unsigned long memdesc_section(memdesc_flags_t mdf) +{ + return 0; +} +#endif /* SECTION_IN_PAGE_FLAGS */ /** * folio_pfn - Return the Page Frame Number of a folio. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d6c1011b38f2..f6a2b2d20016 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -617,4 +617,40 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +/** + * num_pages_contiguous() - determine the number of contiguous pages + * that represent contiguous PFNs + * @pages: an array of page pointers + * @nr_pages: length of the array, at least 1 + * + * Determine the number of contiguous pages that represent contiguous PFNs + * in @pages, starting from the first page. + * + * In some kernel configs contiguous PFNs will not have contiguous struct + * pages. In these configurations num_pages_contiguous() will return a number + * smaller than the ideal number. The caller should continue to check for pfn + * contiguity after each call to num_pages_contiguous(). + * + * Returns the number of contiguous pages. 
+ */ +static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages) +{ + struct page *cur_page = pages[0]; + unsigned long section = memdesc_section(cur_page->flags); + size_t i; + + for (i = 1; i < nr_pages; i++) { + if (++cur_page != pages[i]) + break; + /* + * In unproblematic kernel configs, page_to_section() == 0 and + * the whole check will get optimized out. + */ + if (memdesc_section(cur_page->flags) != section) + break; + } + + return i; +} + #endif From f6c84a52cc41e2aaed0d956d0a1c1802513a239c Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:11 +0800 Subject: [PATCH 2/6] vfio/type1: optimize vfio_pin_pages_remote() When vfio_pin_pages_remote() is called with a range of addresses that includes large folios, the function currently performs individual statistics counting operations for each page. This can lead to significant performance overheads, especially when dealing with large ranges of pages. Batch processing of statistical counting operations can effectively enhance performance. In addition, the pages obtained through longterm GUP are neither invalid nor reserved. Therefore, we can reduce the overhead associated with some calls to function is_invalid_reserved_pfn(). The performance test results for completing the 16G VFIO IOMMU DMA mapping are as follows. Base(v6.16): ------- AVERAGE (MADV_HUGEPAGE) -------- VFIO MAP DMA in 0.049 s (328.5 GB/s) ------- AVERAGE (MAP_POPULATE) -------- VFIO MAP DMA in 0.268 s (59.6 GB/s) ------- AVERAGE (HUGETLBFS) -------- VFIO MAP DMA in 0.051 s (310.9 GB/s) With this patch: ------- AVERAGE (MADV_HUGEPAGE) -------- VFIO MAP DMA in 0.025 s (629.8 GB/s) ------- AVERAGE (MAP_POPULATE) -------- VFIO MAP DMA in 0.253 s (63.1 GB/s) ------- AVERAGE (HUGETLBFS) -------- VFIO MAP DMA in 0.030 s (530.5 GB/s) For large folio, we achieve an over 40% performance improvement. For small folios, the performance test results indicate a slight improvement. 
Signed-off-by: Li Zhe Co-developed-by: Alex Williamson Acked-by: David Hildenbrand Tested-by: Eric Farman Link: https://lore.kernel.org/r/20250814064714.56485-3-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 84 ++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f8d68fe77b41..7829b5e268c2 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "vfio.h" #define DRIVER_VERSION "0.2" @@ -318,7 +319,13 @@ static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu) /* * Helper Functions for host iova-pfn list */ -static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) + +/* + * Find the highest vfio_pfn that overlaps the range + * [iova_start, iova_end) in rb tree. + */ +static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma, + dma_addr_t iova_start, dma_addr_t iova_end) { struct vfio_pfn *vpfn; struct rb_node *node = dma->pfn_list.rb_node; @@ -326,9 +333,9 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) while (node) { vpfn = rb_entry(node, struct vfio_pfn, node); - if (iova < vpfn->iova) + if (iova_end <= vpfn->iova) node = node->rb_left; - else if (iova > vpfn->iova) + else if (iova_start > vpfn->iova) node = node->rb_right; else return vpfn; @@ -336,6 +343,11 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) return NULL; } +static inline struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) +{ + return vfio_find_vpfn_range(dma, iova, iova + 1); +} + static void vfio_link_pfn(struct vfio_dma *dma, struct vfio_pfn *new) { @@ -614,6 +626,39 @@ static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, return ret; } + +static long vpfn_pages(struct vfio_dma *dma, + dma_addr_t iova_start, long 
nr_pages) +{ + dma_addr_t iova_end = iova_start + (nr_pages << PAGE_SHIFT); + struct vfio_pfn *top = vfio_find_vpfn_range(dma, iova_start, iova_end); + long ret = 1; + struct vfio_pfn *vpfn; + struct rb_node *prev; + struct rb_node *next; + + if (likely(!top)) + return 0; + + prev = next = &top->node; + + while ((prev = rb_prev(prev))) { + vpfn = rb_entry(prev, struct vfio_pfn, node); + if (vpfn->iova < iova_start) + break; + ret++; + } + + while ((next = rb_next(next))) { + vpfn = rb_entry(next, struct vfio_pfn, node); + if (vpfn->iova >= iova_end) + break; + ret++; + } + + return ret; +} + /* * Attempt to pin pages. We really don't want to track all the pfns and * the iommu can only map chunks of consecutive pfns anyway, so get the @@ -687,32 +732,47 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, * and rsvd here, and therefore continues to use the batch. */ while (true) { + long nr_pages, acct_pages = 0; + if (pfn != *pfn_base + pinned || rsvd != is_invalid_reserved_pfn(pfn)) goto out; + /* + * Using GUP with the FOLL_LONGTERM in + * vaddr_get_pfns() will not return invalid + * or reserved pages. + */ + nr_pages = num_pages_contiguous( + &batch->pages[batch->offset], + batch->size); + if (!rsvd) { + acct_pages = nr_pages; + acct_pages -= vpfn_pages(dma, iova, nr_pages); + } + /* * Reserved pages aren't counted against the user, * externally pinned pages are already counted against * the user. 
*/ - if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (acct_pages) { if (!dma->lock_cap && - mm->locked_vm + lock_acct + 1 > limit) { + mm->locked_vm + lock_acct + acct_pages > limit) { pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); ret = -ENOMEM; goto unpin_out; } - lock_acct++; + lock_acct += acct_pages; } - pinned++; - npage--; - vaddr += PAGE_SIZE; - iova += PAGE_SIZE; - batch->offset++; - batch->size--; + pinned += nr_pages; + npage -= nr_pages; + vaddr += PAGE_SIZE * nr_pages; + iova += PAGE_SIZE * nr_pages; + batch->offset += nr_pages; + batch->size -= nr_pages; if (!batch->size) break; From d10872050ffeda8c3bdc08f3376bb49b34b4e643 Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:12 +0800 Subject: [PATCH 3/6] vfio/type1: batch vfio_find_vpfn() in function vfio_unpin_pages_remote() The function vpfn_pages() can help us determine the number of vpfn nodes on the vpfn rb tree within a specified range. This allows us to avoid searching for each vpfn individually in the function vfio_unpin_pages_remote(). This patch batches the vfio_find_vpfn() calls in function vfio_unpin_pages_remote(). 
Signed-off-by: Li Zhe Link: https://lore.kernel.org/r/20250814064714.56485-4-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 7829b5e268c2..dbacd852efae 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -802,16 +802,12 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, unsigned long pfn, unsigned long npage, bool do_accounting) { - long unlocked = 0, locked = 0; + long unlocked = 0, locked = vpfn_pages(dma, iova, npage); long i; - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { - if (put_pfn(pfn++, dma->prot)) { + for (i = 0; i < npage; i++) + if (put_pfn(pfn++, dma->prot)) unlocked++; - if (vfio_find_vpfn(dma, iova)) - locked++; - } - } if (do_accounting) vfio_lock_acct(dma, locked - unlocked, true); From 089722e8939e580c9ccc64678ba22f563fdf3bb5 Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:13 +0800 Subject: [PATCH 4/6] vfio/type1: introduce a new member has_rsvd for struct vfio_dma Introduce a new member has_rsvd for struct vfio_dma. This member is used to indicate whether there are any reserved or invalid pfns in the region represented by this vfio_dma. If it is true, it indicates that there is at least one pfn in this region that is either reserved or invalid. 
Signed-off-by: Li Zhe Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20250814064714.56485-5-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index dbacd852efae..30e1b54f6c25 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -93,6 +93,7 @@ struct vfio_dma { bool iommu_mapped; bool lock_cap; /* capable(CAP_IPC_LOCK) */ bool vaddr_invalid; + bool has_rsvd; /* has 1 or more rsvd pfns */ struct task_struct *task; struct rb_root pfn_list; /* Ex-user pinned pfn list */ unsigned long *bitmap; @@ -782,6 +783,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, } out: + dma->has_rsvd |= rsvd; ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: From d14de5b92578c769e12d84c9bdeee5627c042270 Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:14 +0800 Subject: [PATCH 5/6] vfio/type1: optimize vfio_unpin_pages_remote() When vfio_unpin_pages_remote() is called with a range of addresses that includes large folios, the function currently performs individual put_pfn() operations for each page. This can lead to significant performance overheads, especially when dealing with large ranges of pages. It would be very rare for reserved PFNs and non-reserved PFNs to be mixed within the same range. So this patch utilizes the has_rsvd variable introduced in the previous patch to determine whether batch put_pfn() operations can be performed. Moreover, compared to put_pfn(), unpin_user_page_range_dirty_lock() is capable of handling large folio scenarios more efficiently. The performance test results for completing the 16G VFIO IOMMU DMA unmapping are as follows. 
Base(v6.16): ------- AVERAGE (MADV_HUGEPAGE) -------- VFIO UNMAP DMA in 0.141 s (113.7 GB/s) ------- AVERAGE (MAP_POPULATE) -------- VFIO UNMAP DMA in 0.307 s (52.2 GB/s) ------- AVERAGE (HUGETLBFS) -------- VFIO UNMAP DMA in 0.135 s (118.6 GB/s) With this patchset: ------- AVERAGE (MADV_HUGEPAGE) -------- VFIO UNMAP DMA in 0.044 s (363.2 GB/s) ------- AVERAGE (MAP_POPULATE) -------- VFIO UNMAP DMA in 0.289 s (55.3 GB/s) ------- AVERAGE (HUGETLBFS) -------- VFIO UNMAP DMA in 0.044 s (361.3 GB/s) For large folio, we achieve an over 67% performance improvement in the VFIO UNMAP DMA item. For small folios, the performance test results appear to show a slight improvement. Suggested-by: Jason Gunthorpe Signed-off-by: Li Zhe Reviewed-by: David Hildenbrand Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20250814064714.56485-6-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 30e1b54f6c25..916cad80941c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -800,17 +800,29 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, return pinned; } +static inline void put_valid_unreserved_pfns(unsigned long start_pfn, + unsigned long npage, int prot) +{ + unpin_user_page_range_dirty_lock(pfn_to_page(start_pfn), npage, + prot & IOMMU_WRITE); +} + static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, unsigned long pfn, unsigned long npage, bool do_accounting) { long unlocked = 0, locked = vpfn_pages(dma, iova, npage); - long i; - for (i = 0; i < npage; i++) - if (put_pfn(pfn++, dma->prot)) - unlocked++; + if (dma->has_rsvd) { + unsigned long i; + for (i = 0; i < npage; i++) + if (put_pfn(pfn++, dma->prot)) + unlocked++; + } else { + put_valid_unreserved_pfns(pfn, npage, dma->prot); + unlocked = 
npage; + } if (do_accounting) vfio_lock_acct(dma, locked - unlocked, true); From 451bb96328981808463405d436bd58de16dd967d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Thu, 18 Sep 2025 14:19:28 +0200 Subject: [PATCH 6/6] vfio: Dump migration features under debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A debugfs directory was recently added for VFIO devices. Add a new "features" file under the migration sub-directory to expose which features the device supports. Signed-off-by: Cédric Le Goater Link: https://lore.kernel.org/r/20250918121928.1921871-1-clg@redhat.com Signed-off-by: Alex Williamson --- Documentation/ABI/testing/debugfs-vfio | 6 ++++++ drivers/vfio/debugfs.c | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/Documentation/ABI/testing/debugfs-vfio b/Documentation/ABI/testing/debugfs-vfio index 90f7c262f591..70ec2d454686 100644 --- a/Documentation/ABI/testing/debugfs-vfio +++ b/Documentation/ABI/testing/debugfs-vfio @@ -23,3 +23,9 @@ Contact: Longfang Liu Description: Read the live migration status of the vfio device. The contents of the state file reflects the migration state relative to those defined in the vfio_device_mig_state enum + +What: /sys/kernel/debug/vfio//migration/features +Date: Oct 2025 +KernelVersion: 6.18 +Contact: Cédric Le Goater +Description: Read the migration features of the vfio device. 
diff --git a/drivers/vfio/debugfs.c b/drivers/vfio/debugfs.c index 298bd866f157..8b0ca7a09064 100644 --- a/drivers/vfio/debugfs.c +++ b/drivers/vfio/debugfs.c @@ -58,6 +58,23 @@ static int vfio_device_state_read(struct seq_file *seq, void *data) return 0; } +static int vfio_device_features_read(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_device *vdev = container_of(vf_dev, struct vfio_device, device); + + if (vdev->migration_flags & VFIO_MIGRATION_STOP_COPY) + seq_puts(seq, "stop-copy\n"); + if (vdev->migration_flags & VFIO_MIGRATION_P2P) + seq_puts(seq, "p2p\n"); + if (vdev->migration_flags & VFIO_MIGRATION_PRE_COPY) + seq_puts(seq, "pre-copy\n"); + if (vdev->log_ops) + seq_puts(seq, "dirty-tracking\n"); + + return 0; +} + void vfio_device_debugfs_init(struct vfio_device *vdev) { struct device *dev = &vdev->device; @@ -72,6 +89,8 @@ void vfio_device_debugfs_init(struct vfio_device *vdev) vdev->debug_root); debugfs_create_devm_seqfile(dev, "state", vfio_dev_migration, vfio_device_state_read); + debugfs_create_devm_seqfile(dev, "features", vfio_dev_migration, + vfio_device_features_read); } }