From 63451de16e0a08be40f9ab5e7c5c8f5c79676fb1 Mon Sep 17 00:00:00 2001 From: Sunny Patel Date: Sat, 25 Apr 2026 19:05:27 +0530 Subject: [PATCH 01/14] mm/migrate_device: fix spinlock leak in migrate_vma_insert_huge_pmd_page When check_stable_address_space() fails after the PMD spinlock has been acquired via pmd_lock(), the code jumps directly to the abort label, bypassing the spin_unlock() call in unlock_abort. This causes the PMD spinlock to be permanently held, leading to a deadlock. Change the goto target from abort to unlock_abort to ensure the spinlock is always released on this error path. Link: https://lore.kernel.org/20260425133537.17463-1-nueralspacetech@gmail.com Fixes: a30b48bf1b24 ("mm/migrate_device: implement THP migration of zone device pages") Signed-off-by: Sunny Patel Reviewed-by: Andrew Morton Acked-by: Zi Yan Acked-by: Balbir Singh Acked-by: David Hildenbrand (Arm) Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Joshua Hahn Cc: Matthew Brost Cc: Rakie Kim Cc: Signed-off-by: Andrew Morton --- mm/migrate_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index fbfe5715f635..ab49d4dcdb60 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -850,7 +850,7 @@ static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate, ptl = pmd_lock(vma->vm_mm, pmdp); csa_ret = check_stable_address_space(vma->vm_mm); if (csa_ret) - goto abort; + goto unlock_abort; /* * Check for userfaultfd but do not deliver the fault. Instead, From d4e7b5c4cc353f154d5ab8bb2e1ce7714d77a6e9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Apr 2026 10:36:12 -0700 Subject: [PATCH 02/14] mm/damon/sysfs-schemes: call missing mem_cgroup_iter_break() damon_sysfs_memcg_path_to_id() breaks mem_cgroup_iter() loop without calling mem_cgroup_iter_break(). This leaks the cgroup reference. Fix the issue by calling mem_cgroup_iter_break() before the break. The issue was discovered [1] by Sashiko. Link: https://lore.kernel.org/20260426173625.86521-1-sj@kernel.org Link: https://lore.kernel.org/20260423004148.74722-1-sj@kernel.org [1] Fixes: 29cbb9a13f05 ("mm/damon/sysfs-schemes: implement scheme filters") Signed-off-by: SeongJae Park Cc: # 6.3.x Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 245d63808411..04746cbb3327 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2594,6 +2594,7 @@ static int damon_sysfs_memcg_path_to_id(char *memcg_path, u64 *id) if (damon_sysfs_memcg_path_eq(memcg, path, memcg_path)) { *id = mem_cgroup_id(memcg); found = true; + mem_cgroup_iter_break(NULL, memcg); break; } } From 620072fd783290ad92c2d445a47b0a61b161f352 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 26 Apr 2026 12:31:17 -0700 Subject: [PATCH 03/14] mm/damon: fix damos_stat tracepoint format for sz_applied The print format is wrongly marking sz_applied as sz_tried. Fix it. Link: https://lore.kernel.org/20260426193119.88095-1-sj@kernel.org Fixes: 804c26b961da ("mm/damon/core: add trace point for damos stat per apply interval") Signed-off-by: SeongJae Park Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: # 7.0.x Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 24fc402ab3c8..7e25f4469b81 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -41,7 +41,7 @@ TRACE_EVENT(damos_stat_after_apply_interval, ), TP_printk("ctx_idx=%u scheme_idx=%u nr_tried=%lu sz_tried=%lu " - "nr_applied=%lu sz_tried=%lu sz_ops_filter_passed=%lu " + "nr_applied=%lu sz_applied=%lu sz_ops_filter_passed=%lu " "qt_exceeds=%lu nr_snapshots=%lu", __entry->context_idx, __entry->scheme_idx, __entry->nr_tried, __entry->sz_tried, From c416aee7e7d04fec2d2d30786b3c8393108b85d2 Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Mon, 27 Apr 2026 16:24:47 +0200 Subject: [PATCH 04/14] scripts/gdb: mm: cast untyped symbols in x86_page_ops The symbols phys_base, _text, and _end, used in x86_page_ops are either defined in assembly or implicitly by the linker. Thus, they lack type information and cause a conversion error after gdb.parse_and_eval. Explicitly cast these expressions to unsigned long. Link: https://lore.kernel.org/20260427142448.666117-2-illia@yshyn.com Fixes: 55f8b4518d14 ("scripts/gdb: implement x86_page_ops in mm.py") Signed-off-by: Illia Ostapyshyn Cc: Florian Fainelli Cc: Jan Kiszka Cc: Kieran Bingham Cc: Vlastimil Babka Cc: Hao Li Cc: Harry Yoo Cc: Seongjun Hong Cc: Signed-off-by: Andrew Morton --- scripts/gdb/linux/mm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/gdb/linux/mm.py b/scripts/gdb/linux/mm.py index d78908f6664d..dffadccbb01d 100644 --- a/scripts/gdb/linux/mm.py +++ b/scripts/gdb/linux/mm.py @@ -40,11 +40,11 @@ class x86_page_ops(): self.PAGE_OFFSET = int(gdb.parse_and_eval("page_offset_base")) self.VMEMMAP_START = int(gdb.parse_and_eval("vmemmap_base")) - self.PHYS_BASE = int(gdb.parse_and_eval("phys_base")) + self.PHYS_BASE = int(gdb.parse_and_eval("(unsigned long) phys_base")) self.START_KERNEL_map = 0xffffffff80000000 - self.KERNEL_START = gdb.parse_and_eval("_text") - self.KERNEL_END = gdb.parse_and_eval("_end") + self.KERNEL_START = gdb.parse_and_eval("(unsigned long) &_text") + self.KERNEL_END = gdb.parse_and_eval("(unsigned long) &_end") self.VMALLOC_START = int(gdb.parse_and_eval("vmalloc_base")) if self.VMALLOC_START == 0xffffc90000000000: From 228e25e33325865ebe589da5366449a8ecf7d0da Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Mon, 27 Apr 2026 16:24:48 +0200 Subject: [PATCH 05/14] scripts/gdb: slab: update field names of struct kmem_cache The commit 5ba6bc27b1f9 ("slab: decouple pointer to barn from kmem_cache_node") reorganized the struct kmem_cache to factor out the per-node fields to the new struct kmem_cache_per_node_ptrs. This causes the gdb scripts for lx-slabinfo and lx-slabtrace fail as they still reference the old structure. Adjust the gdb scripts to match the current state of struct kmem_cache. Link: https://lore.kernel.org/20260427142448.666117-3-illia@yshyn.com Fixes: 5ba6bc27b1f9 ("slab: decouple pointer to barn from kmem_cache_node") Signed-off-by: Illia Ostapyshyn Acked-by: Harry Yoo (Oracle) Acked-by: Vlastimil Babka (SUSE) Cc: Florian Fainelli Cc: Hao Li Cc: Jan Kiszka Cc: Kieran Bingham Cc: Seongjun Hong Signed-off-by: Andrew Morton --- scripts/gdb/linux/slab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gdb/linux/slab.py b/scripts/gdb/linux/slab.py index 0e2d93867fe2..ddde25aeca8d 100644 --- a/scripts/gdb/linux/slab.py +++ b/scripts/gdb/linux/slab.py @@ -196,7 +196,7 @@ def slabtrace(alloc, cache_name): if target_cache['flags'] & SLAB_STORE_USER: for i in range(0, nr_node_ids): - cache_node = target_cache['node'][i] + cache_node = target_cache['per_node']['node'][i] if cache_node['nr_slabs']['counter'] == 0: continue process_slab(loc_track, cache_node['partial'], alloc, target_cache) @@ -300,7 +300,7 @@ def slabinfo(): nr_free = 0 nr_slabs = 0 for i in range(0, nr_node_ids): - cache_node = cache['node'][i] + cache_node = cache['per_node']['node'][i] try: nr_slabs += cache_node['nr_slabs']['counter'] nr_objs = int(cache_node['total_objects']['counter']) From 3432cbb291aabf85f8af4b9d1ec37179168ff999 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Mon, 27 Apr 2026 12:03:51 -0400 Subject: [PATCH 06/14] selftests/mm: run_vmtests.sh: fix destructive tests invocation Destructive tests should be invoked with -d command-line option, but this won't work today since 'd' is missing in getopts command-line. This commit fixes it. Link: https://lore.kernel.org/214fd9e4-5398-4c26-859e-c982c2e277c3@redhat.com Fixes: f16ff3b692ad ("selftests/mm: run_vmtests.sh: add missing tests") Signed-off-by: Luiz Capitulino Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index d8468451b3a3..c17b133a81d2 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -103,7 +103,7 @@ RUN_ALL=false RUN_DESTRUCTIVE=false TAP_PREFIX="# " -while getopts "aht:n" OPT; do +while getopts "aht:nd" OPT; do case ${OPT} in "a") RUN_ALL=true ;; "h") usage ;; From 77dcdff56d0b52947f110e9e43a1fc846ee8d94a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 28 Apr 2026 15:48:32 +0300 Subject: [PATCH 07/14] MAINTAINERS: add tree for KDUMP and KEXEC Patch series "MAINTAINERS: update KEXEC, KDUMP and LIVE UPDATE". KHO and LiveUpdate team is going to pick kdump and kexec patches to their tree at https://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git Update MAINTAINERS to reflect this change and add kexec@ list to LIVE UPDATE entry. This patch (of 2): KHO and LiveUpdate team is going to pick kdump and kexec patches to their tree at https://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git Update MAINTAINERS to reflect it. Link: https://lore.kernel.org/20260428124833.1903302-1-rppt@kernel.org Link: https://lore.kernel.org/20260428124833.1903302-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Pasha Tatashin Acked-by: Baoquan He Acked-by: Pratyush Yadav Cc: Mike Rapoport Cc: Dave Young Cc: Eric W. Biederman Signed-off-by: Andrew Morton --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b2040011a386..44833dc85827 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13863,6 +13863,7 @@ M: Pratyush Yadav R: Dave Young L: kexec@lists.infradead.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git F: Documentation/admin-guide/kdump/ F: fs/proc/vmcore.c F: include/linux/crash_core.h @@ -14179,6 +14180,7 @@ M: Pasha Tatashin M: Pratyush Yadav L: kexec@lists.infradead.org W: http://kernel.org/pub/linux/utils/kernel/kexec/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git F: include/linux/kexec.h F: include/uapi/linux/kexec.h F: kernel/kexec* From ec9f2ee9a4046b2d5e5a6b6fa6a2ed1542250e73 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 28 Apr 2026 15:48:33 +0300 Subject: [PATCH 08/14] MAINTAINERS: add kexec@ list to LIVE UPDATE ENTRY Link: https://lore.kernel.org/20260428124833.1903302-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Pasha Tatashin Acked-by: Baoquan He Cc: Dave Young Cc: Eric W. Biederman Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 44833dc85827..3a1cc86e2c0a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14897,6 +14897,7 @@ LIVE UPDATE M: Pasha Tatashin M: Mike Rapoport M: Pratyush Yadav +L: kexec@lists.infradead.org L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/liveupdate/linux.git From 6a288a4ddb4a994490505ab5f41c445f8e6b6467 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Tue, 21 Apr 2026 17:39:07 +0200 Subject: [PATCH 09/14] mm/page_alloc: fix initialization of tags of the huge zero folio with init_on_free __GFP_ZEROTAGS semantics are currently a bit weird, but effectively this flag is only ever set alongside __GFP_ZERO and __GFP_SKIP_KASAN. If we run with init_on_free, we will zero out pages during __free_pages_prepare(), to skip zeroing on the allocation path. However, when allocating with __GFP_ZEROTAG set, post_alloc_hook() will consequently not only skip clearing page content, but also skip clearing tag memory. Not clearing tags through __GFP_ZEROTAGS is irrelevant for most pages that will get mapped to user space through set_pte_at() later: set_pte_at() and friends will detect that the tags have not been initialized yet (PG_mte_tagged not set), and initialize them. However, for the huge zero folio, which will be mapped through a PMD marked as special, this initialization will not be performed, ending up exposing whatever tags were still set for the pages. The docs (Documentation/arch/arm64/memory-tagging-extension.rst) state that allocation tags are set to 0 when a page is first mapped to user space. That no longer holds with the huge zero folio when init_on_free is enabled. Fix it by decoupling __GFP_ZEROTAGS from __GFP_ZERO, passing to tag_clear_highpages() whether we want to also clear page content. Invert the meaning of the tag_clear_highpages() return value to have clearer semantics. Reproduced with the huge zero folio by modifying the check_buffer_fill arm64/mte selftest to use a 2 MiB area, after making sure that pages have a non-0 tag set when freeing (note that, during boot, we will not actually initialize tags, but only set KASAN_TAG_KERNEL in the page flags). $ ./check_buffer_fill 1..20 ... not ok 17 Check initial tags with private mapping, sync error mode and mmap memory not ok 18 Check initial tags with private mapping, sync error mode and mmap/mprotect memory ... This code needs more cleanups; we'll tackle that next, like decoupling __GFP_ZEROTAGS from __GFP_SKIP_KASAN. [akpm@linux-foundation.org: s/__GPF_ZERO/__GFP_ZERO/, per David] Link: https://lore.kernel.org/20260421-zerotags-v2-1-05cb1035482e@kernel.org Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Signed-off-by: David Hildenbrand (Arm) Reviewed-by: Catalin Marinas Tested-by: Lance Yang Cc: Brendan Jackman Cc: Dev Jain Cc: Johannes Weiner Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Mark Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Will Deacon Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- arch/arm64/include/asm/page.h | 2 +- arch/arm64/mm/fault.c | 11 +++++++---- include/linux/gfp_types.h | 10 +++++----- include/linux/highmem.h | 7 ++++--- mm/page_alloc.c | 8 ++++---- 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/page.h b/arch/arm64/include/asm/page.h index e25d0d18f6d7..58200de8a221 100644 --- a/arch/arm64/include/asm/page.h +++ b/arch/arm64/include/asm/page.h @@ -33,7 +33,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr); #define vma_alloc_zeroed_movable_folio vma_alloc_zeroed_movable_folio -bool tag_clear_highpages(struct page *to, int numpages); +bool tag_clear_highpages(struct page *to, int numpages, bool clear_pages); #define __HAVE_ARCH_TAG_CLEAR_HIGHPAGES #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f3c5c7ca054..739800835920 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -1018,7 +1018,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, return vma_alloc_folio(flags, 0, vma, vaddr); } -bool tag_clear_highpages(struct page *page, int numpages) +bool tag_clear_highpages(struct page *page, int numpages, bool clear_pages) { /* * Check if MTE is supported and fall back to clear_highpage(). @@ -1026,13 +1026,16 @@ bool tag_clear_highpages(struct page *page, int numpages) * post_alloc_hook() will invoke tag_clear_highpages(). */ if (!system_supports_mte()) - return false; + return clear_pages; /* Newly allocated pages, shouldn't have been tagged yet */ for (int i = 0; i < numpages; i++, page++) { WARN_ON_ONCE(!try_page_mte_tagging(page)); - mte_zero_clear_page_tags(page_address(page)); + if (clear_pages) + mte_zero_clear_page_tags(page_address(page)); + else + mte_clear_page_tags(page_address(page)); set_page_mte_tagged(page); } - return true; + return false; } diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6c75df30a281..cd4972a7c97c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -273,11 +273,11 @@ enum { * * %__GFP_ZERO returns a zeroed page on success. * - * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself - * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that - * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting - * memory tags at the same time as zeroing memory has minimal additional - * performance impact. + * %__GFP_ZEROTAGS zeroes memory tags at allocation time. Setting memory tags at + * the same time as zeroing memory (e.g., with __GFP_ZERO) has minimal + * additional performance impact. However, __GFP_ZEROTAGS also zeroes the tags + * even if memory is not getting zeroed at allocation time (e.g., + * with init_on_free). * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by diff --git a/include/linux/highmem.h b/include/linux/highmem.h index af03db851a1d..d7aac9de1c8a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -347,10 +347,11 @@ static inline void clear_highpage_kasan_tagged(struct page *page) #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -/* Return false to let people know we did not initialize the pages */ -static inline bool tag_clear_highpages(struct page *page, int numpages) +/* Returns true if the caller has to initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages, + bool clear_pages) { - return false; + return clear_pages; } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 227d58dc3de6..23c7298d3be2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1808,9 +1808,9 @@ static inline bool should_skip_init(gfp_t flags) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { + const bool zero_tags = gfp_flags & __GFP_ZEROTAGS; bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && !should_skip_init(gfp_flags); - bool zero_tags = init && (gfp_flags & __GFP_ZEROTAGS); int i; set_page_private(page, 0); @@ -1832,11 +1832,11 @@ inline void post_alloc_hook(struct page *page, unsigned int order, */ /* - * If memory tags should be zeroed - * (which happens only when memory should be initialized as well). + * Clearing tags can efficiently clear the memory for us as well, if + * required. */ if (zero_tags) - init = !tag_clear_highpages(page, 1 << order); + init = tag_clear_highpages(page, 1 << order, /* clear_pages= */init); if (!should_skip_kasan_unpoison(gfp_flags) && kasan_unpoison_pages(page, order, init)) { From efdadbc180e53fe257a6e85f6bc706cb58088653 Mon Sep 17 00:00:00 2001 From: "Christian A. Ehrhardt" Date: Tue, 21 Apr 2026 09:07:07 +0200 Subject: [PATCH 10/14] lib: kunit_iov_iter: fix test fail on powerpc Increase buffer size to accommodate machines with 64K PAGE_SIZE. Link: https://lore.kernel.org/20260421070707.992873-1-lk@c--e.de Fixes: 0913b7554726 ("lib: kunit_iov_iter: add tests for extract_iter_to_sg") Signed-off-by: Christian A. Ehrhardt Reported-by: David Gow Closes: https://lore.kernel.org/34a81ec2-af84-465d-9b5e-7bb5bf01680f@davidgow.net Tested-by: David Gow Tested-by: Josh Law Reviewed-by: Josh Law Signed-off-by: Andrew Morton --- lib/tests/kunit_iov_iter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/tests/kunit_iov_iter.c b/lib/tests/kunit_iov_iter.c index 37bd6eb25896..f02f7b7aa796 100644 --- a/lib/tests/kunit_iov_iter.c +++ b/lib/tests/kunit_iov_iter.c @@ -1128,7 +1128,7 @@ static void __init iov_kunit_iter_to_sg_kvec(struct kunit *test) struct kvec kvec; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); kvec.iov_base = data.buffer; @@ -1146,7 +1146,7 @@ static void __init iov_kunit_iter_to_sg_bvec(struct kunit *test) struct bio_vec *bvec; struct iov_iter iter; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); bvec = kunit_kmalloc_array(test, data.npages, sizeof(*bvec), @@ -1173,7 +1173,7 @@ static void __init iov_kunit_iter_to_sg_folioq(struct kunit *test) struct iov_iter iter; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); folioq = iov_kunit_create_folioq(test); @@ -1190,7 +1190,7 @@ static void __init iov_kunit_iter_to_sg_xarray(struct kunit *test) struct iov_iter iter; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, false, &data); xarray = iov_kunit_create_xarray(test); @@ -1206,7 +1206,7 @@ static void __init iov_kunit_iter_to_sg_ubuf(struct kunit *test) struct iov_iter iter; size_t bufsize; - bufsize = 0x100000; + bufsize = 0x200000; iov_kunit_iter_to_sg_init(test, bufsize, true, &data); iov_iter_ubuf(&iter, READ, data.ubuf, bufsize); From 93866f55f7e292fe3d47d36c9efe5ee10213a06b Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:52:17 +0800 Subject: [PATCH 11/14] mm/memory_hotplug: fix memory block reference leak on remove Patch series "mm: Fix memory block leaks and locking", v2. This series fixes two memory block device reference leaks and one locking issue around the per-memory_block hwpoison counter. This patch (of 2): remove_memory_blocks_and_altmaps() looks up each memory block with find_memory_block(), which acquires a reference to the memory block device. That reference is never dropped on this path, resulting in a leaked device reference when removing memory blocks and their altmaps. Drop the reference after retrieving mem->altmap and clearing mem->altmap, before removing the memory block device. Link: https://lore.kernel.org/20260428085219.1316047-1-songmuchun@bytedance.com Link: https://lore.kernel.org/20260428085219.1316047-2-songmuchun@bytedance.com Fixes: 6b8f0798b85a ("mm/memory_hotplug: split memmap_on_memory requests across memblocks") Signed-off-by: Muchun Song Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: "Rafael J. Wysocki" Cc: Vishal Verma Cc: Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2a943ec57c85..40c7915dabe0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1422,6 +1422,8 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size) altmap = mem->altmap; mem->altmap = NULL; + /* drop the ref. we got via find_memory_block() */ + put_device(&mem->dev); remove_memory_block_devices(cur_start, memblock_size); From 03a2cc1756a0570f887d624cd6c535ea0cbd4951 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Apr 2026 16:52:18 +0800 Subject: [PATCH 12/14] drivers/base/memory: fix memory block reference leak in poison accounting memblk_nr_poison_inc() and memblk_nr_poison_sub() look up a memory block via find_memory_block_by_id(), which acquires a reference to the memory block device. Both helpers use the returned memory block without dropping that reference, leaking the device reference on each successful lookup. Drop the reference after updating nr_hwpoison. Link: https://lore.kernel.org/20260428085219.1316047-3-songmuchun@bytedance.com Fixes: 5033091de814 ("mm/hwpoison: introduce per-memory_block hwpoison counter") Signed-off-by: Muchun Song Reviewed-by: Miaohe Lin Acked-by: Oscar Salvador Acked-by: David Hildenbrand (Arm) Cc: Danilo Krummrich Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Naoya Horiguchi Cc: "Rafael J. Wysocki" Cc: Vishal Verma Cc: Signed-off-by: Andrew Morton --- drivers/base/memory.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f806a683b767..6981b55d582a 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -1230,8 +1230,10 @@ void memblk_nr_poison_inc(unsigned long pfn) const unsigned long block_id = pfn_to_block_id(pfn); struct memory_block *mem = find_memory_block_by_id(block_id); - if (mem) + if (mem) { atomic_long_inc(&mem->nr_hwpoison); + put_device(&mem->dev); + } } void memblk_nr_poison_sub(unsigned long pfn, long i) @@ -1239,8 +1241,10 @@ void memblk_nr_poison_sub(unsigned long pfn, long i) const unsigned long block_id = pfn_to_block_id(pfn); struct memory_block *mem = find_memory_block_by_id(block_id); - if (mem) + if (mem) { atomic_long_sub(i, &mem->nr_hwpoison); + put_device(&mem->dev); + } } static unsigned long memblk_nr_poison(struct memory_block *mem) From c0c6ccd9828c3a1950623b546fa57292a77b5c73 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Thu, 30 Apr 2026 13:31:22 +0200 Subject: [PATCH 13/14] mm: fix __vm_normal_page() to handle missing support for pmd_special()/pud_special() On x86 32-bit with THP enabled, zap_huge_pmd() is seen to generate a "WARNING: mm/memory.c:735 at __vm_normal_page+0x6a/0x7d", from the VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); followed by "BUG: Bad rss-counter state"s, then later "BUG: Bad page state"s when reclaim gets to call shrink_huge_zero_folio_scan(). It's as if the _PAGE_SPECIAL bit never got set in the huge_zero pmd: and indeed, whereas pte_special() and pte_mkspecial() are subject to a dedicated CONFIG_ARCH_HAS_PTE_SPECIAL, pmd_special() and pmd_mkspecial() are subject to CONFIG_ARCH_SUPPORTS_PMD_PFNMAP, which is never enabled on any 32-bit architecture. While the problem was exposed through commit d80a9cb1a64a ("mm/huge_memory: add and use normal_or_softleaf_folio_pmd()"), it was an oversight in commit af38538801c6 ("mm/memory: factor out common code from vm_normal_page_*()") and would result in other problems: * huge zero folio accounted in smaps, pagemap (PAGE_IS_FILE) and numamaps as file-backed THP * folio_walk_start() returning the folio even without FW_ZEROPAGE set. Callers seem to tolerate that, though. ... and triggering the VM_WARN_ON_ONE(), although never reported so far. To fix it, teach vm_normal_page_pmd()/vm_normal_page_pud() to consider whether pmd_special/pud_special is actually implemented. Link: https://lore.kernel.org/20260430-pmd_special-v1-1-dbcbcfd72c20@kernel.org Fixes: af38538801c6 ("mm/memory: factor out common code from vm_normal_page_*()") Signed-off-by: David Hildenbrand (Arm) Reported-by: Hugh Dickins Closes: https://lore.kernel.org/r/74a75b59-2e13-3985-ee99-d5521f39df2a@google.com Reported-by: Bibo Mao Closes: https://lore.kernel.org/r/20260430041121.2839350-1-maobibo@loongson.cn Debugged-by: Hugh Dickins Reviewed-by: Lance Yang Tested-by: Bibo Mao Reviewed-by: Baolin Wang Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ea6568571131..c51ad671b95f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -612,6 +612,21 @@ static void print_bad_page_map(struct vm_area_struct *vma, dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } + +static inline bool pgtable_level_has_pxx_special(enum pgtable_level level) +{ + switch (level) { + case PGTABLE_LEVEL_PTE: + return IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL); + case PGTABLE_LEVEL_PMD: + return IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP); + case PGTABLE_LEVEL_PUD: + return IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP); + default: + return false; + } +} + #define print_bad_pte(vma, addr, pte, page) \ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) @@ -684,7 +699,7 @@ static inline struct page *__vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, bool special, unsigned long long entry, enum pgtable_level level) { - if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { + if (pgtable_level_has_pxx_special(level)) { if (unlikely(special)) { #ifdef CONFIG_FIND_NORMAL_PAGE if (vma->vm_ops && vma->vm_ops->find_normal_page) @@ -699,8 +714,9 @@ static inline struct page *__vm_normal_page(struct vm_area_struct *vma, return NULL; } /* - * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table - * mappings (incl. shared zero folios) are marked accordingly. + * With working pte_special()/pmd_special()..., any special page + * table mappings (incl. shared zero folios) are marked + * accordingly. */ } else { if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { From be3f38d05cc5a7c3f13e51994c5dd043ab604d28 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 1 May 2026 16:51:16 +1000 Subject: [PATCH 14/14] mm/memory: fix spurious warning when unmapping device-private/exclusive pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device private and exclusive entries are only supported for anonymous folios. This condition is tested in __migrate_device_pages() and make_device_exclusive() using folio_test_anon(). However the unmap path tests this assumption using vma_is_anonymous(). This is wrong because whilst anonymous VMAs can only contain folios where folio_test_anon() is true the opposite relation does not hold. A folio for which folio_test_anon() is true does not imply vma_is_anonymous() is true. Such a condition can occur if for example a folio is part of a private filebacked mapping. In this case vma_is_anonymous() is false as the mapping is filebacked, but folio_test_anon() may be true, thus permitting devices to migrate the folio to device private memory. This can lead to the following spurious warnings during process teardown: [ 772.737706] ------------[ cut here ]------------ [ 772.739201] WARNING: mm/memory.c:1754 at unmap_page_range.cold+0x26/0x18a, CPU#17: hmm-tests/2041 [ 772.742050] Modules linked in: test_hmm nvidia_uvm(O) nvidia(O) [ 772.743959] CPU: 17 UID: 0 PID: 2041 Comm: hmm-tests Tainted: G W O 7.0.0+ #387 PREEMPT(full) [ 772.747104] Tainted: [W]=WARN, [O]=OOT_MODULE [ 772.748509] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.17.0-0-gb52ca86e094d-prebuilt.qemu.org 04/01/2014 [ 772.752117] RIP: 0010:unmap_page_range.cold+0x26/0x18a [ 772.753780] Code: 7e fe ff ff 48 89 4c 24 78 4c 89 44 24 38 e8 f2 ff b1 00 48 8b 4c 24 78 4c 8b 44 24 38 48 8b 44 24 18 48 83 78 48 00 74 04 90 <0f> 0b 90 48 89 ca b8 ff ff 37 00 48 c1 ea 03 48 c1 e0 2a 80 3c 02 [ 772.759602] RSP: 0018:ffff888112607550 EFLAGS: 00010286 [ 772.761310] RAX: ffff88811bbf4dc0 RBX: dffffc0000000000 RCX: ffffea03e9bfffd8 [ 772.763583] RDX: 1ffff1102377e9c1 RSI: 0000000000000008 RDI: ffff88811bbf4e08 [ 772.765914] RBP: 0000000000000006 R08: ffff8881059f7448 R09: ffffed10224c0e68 [ 772.768184] R10: ffff888112607347 R11: 0000000000000001 R12: 0000000000000001 [ 772.770461] R13: ffffea03e9bfffc0 R14: ffff888112607908 R15: ffffea03e9bfffc0 [ 772.772782] FS: 00007f327caa2780(0000) GS:ffff888427b7d000(0000) knlGS:0000000000000000 [ 772.775328] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 772.777187] CR2: 00007f327ca89000 CR3: 00000001994d5000 CR4: 00000000000006f0 [ 772.779135] Call Trace: [ 772.779792] [ 772.780317] ? dmirror_interval_invalidate+0x1a3/0x290 [test_hmm] [ 772.781873] ? vm_normal_page_pud+0x2b0/0x2b0 [ 772.782992] ? __rwlock_init+0x150/0x150 [ 772.784006] ? lock_release+0x216/0x2b0 [ 772.785008] ? __mmu_notifier_invalidate_range_start+0x505/0x6e0 [ 772.786522] ? lock_release+0x216/0x2b0 [ 772.787498] ? unmap_single_vma+0xb6/0x210 [ 772.788573] unmap_vmas+0x27d/0x520 [ 772.789506] ? unmap_single_vma+0x210/0x210 [ 772.790607] ? mas_update_gap.part.0+0x620/0x620 [ 772.791834] unmap_region+0x19e/0x350 [ 772.792769] ? remove_vma+0x130/0x130 [ 772.793684] ? mas_alloc_nodes+0x1f2/0x300 [ 772.794730] vms_complete_munmap_vmas+0x8c1/0xe20 [ 772.795926] ? unmap_region+0x350/0x350 [ 772.796917] do_vmi_align_munmap+0x36a/0x4e0 [ 772.798018] ? lock_release+0x216/0x2b0 [ 772.799024] ? vma_shrink+0x620/0x620 [ 772.799983] do_vmi_munmap+0x150/0x2c0 [ 772.800939] __vm_munmap+0x161/0x2c0 [ 772.801872] ? expand_downwards+0xd60/0xd60 [ 772.802948] ? clockevents_program_event+0x1ef/0x540 [ 772.804217] ? lock_release+0x216/0x2b0 [ 772.805158] __x64_sys_munmap+0x59/0x80 [ 772.805776] do_syscall_64+0xfc/0x670 [ 772.806336] ? irqentry_exit+0xda/0x580 [ 772.806976] entry_SYSCALL_64_after_hwframe+0x4b/0x53 [ 772.807772] RIP: 0033:0x7f327cbb2717 [ 772.808323] Code: 73 01 c3 48 8b 0d f9 76 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 b8 0b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c9 76 0d 00 f7 d8 64 89 01 48 [ 772.811337] RSP: 002b:00007ffde7f57d38 EFLAGS: 00000202 ORIG_RAX: 000000000000000b [ 772.812564] RAX: ffffffffffffffda RBX: 00007f327cc9c000 RCX: 00007f327cbb2717 [ 772.813733] RDX: 0000000000000000 RSI: 0000000000400000 RDI: 00007f327c289000 [ 772.814867] RBP: 0000000000421360 R08: 000000000000001a R09: 0000000000000000 [ 772.815991] R10: 0000000000000003 R11: 0000000000000202 R12: 00007ffde7f57d74 [ 772.817121] R13: 00007f327c689010 R14: 0000000000100000 R15: 00007f327c289000 [ 772.818272] [ 772.818614] irq event stamp: 0 [ 772.819159] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [ 772.820174] hardirqs last disabled at (0): [] copy_process+0x19f3/0x6440 [ 772.821511] softirqs last enabled at (0): [] copy_process+0x1a40/0x6440 [ 772.822869] softirqs last disabled at (0): [<0000000000000000>] 0x0 [ 772.823871] ---[ end trace 0000000000000000 ]--- Fix this by using the same check for folio_test_anon() in zap_nonpresent_ptes(). Also add a hmm-test case for this. Link: https://lore.kernel.org/20260501065116.2057242-1-apopple@nvidia.com Fixes: 999dad824c39 ("mm/shmem: persist uffd-wp bit across zapping for file-backed") Signed-off-by: Alistair Popple Reported-by: Arsen Arsenović Reviewed-by: Balbir Singh Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: John Hubbard Cc: Leon Romanovsky Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Peter Xu Cc: Matthew Brost Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- tools/testing/selftests/mm/hmm-tests.c | 50 ++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index c51ad671b95f..86a973119bd4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1755,7 +1755,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, * consider uffd-wp bit when zap. For more information, * see zap_install_uffd_wp_if_needed(). */ - WARN_ON_ONCE(!vma_is_anonymous(vma)); + WARN_ON_ONCE(!folio_test_anon(folio)); rss[mm_counter(folio)]--; folio_remove_rmap_pte(folio, page, vma); folio_put(folio); diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 788689497e92..77fb4c5d871b 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -985,6 +985,56 @@ TEST_F(hmm, migrate) hmm_buffer_free(buffer); } +/* + * Migrate private file memory to device private memory. + */ +TEST_F(hmm, migrate_file_private) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + /* * Migrate anonymous memory to device private memory and fault some of it back * to system memory, then try migrating the resulting mix of system and device