From 394bfac1c7f7b701c2c93834c5761b9c9ceeebcf Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Fri, 22 Aug 2025 06:33:18 +0000
Subject: [PATCH 01/34] mm/khugepaged: fix the address passed to notifier on testing young

Commit 8ee53820edfd ("thp: mmu_notifier_test_young") introduced
mmu_notifier_test_young(), but we are passing the wrong address.
In xxx_scan_pmd(), the actual iteration address is "_address", not
"address". It seems we have misused the variable from the very
beginning. Change it to the right one.

[akpm@linux-foundation.org: fix whitespace, per everyone]
Link: https://lkml.kernel.org/r/20250822063318.11644-1-richard.weiyang@gmail.com
Fixes: 8ee53820edfd ("thp: mmu_notifier_test_young")
Signed-off-by: Wei Yang
Reviewed-by: Dev Jain
Reviewed-by: Zi Yan
Acked-by: David Hildenbrand
Reviewed-by: Lorenzo Stoakes
Cc: Baolin Wang
Cc: Liam R. Howlett
Cc: Nico Pache
Cc: Ryan Roberts
Cc: Barry Song
Cc:
Signed-off-by: Andrew Morton
---
 mm/khugepaged.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6b40bdfd224c..b486c1d19b2d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1417,8 +1417,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		 */
 		if (cc->is_khugepaged &&
 		    (pte_young(pteval) || folio_test_young(folio) ||
-		     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
-								     address)))
+		     folio_test_referenced(folio) ||
+		     mmu_notifier_test_young(vma->vm_mm, _address)))
 			referenced++;
 	}
 	if (!writable) {

From 397f6d14f9c370e4910e6885294c340f39dedbf5 Mon Sep 17 00:00:00 2001
From: Jinjiang Tu
Date: Fri, 27 Jun 2025 20:57:47 +0800
Subject: [PATCH 02/34] mm/memory_hotplug: fix hwpoisoned large folio handling in do_migrate_range()

In do_migrate_range(), the hwpoisoned folio may be a large folio, which
can't be handled by unmap_poisoned_folio(). I can reproduce this issue in
qemu after adding a delay in memory_failure():

BUG: kernel NULL pointer dereference, address: 0000000000000000
Workqueue: kacpi_hotplug acpi_hotplug_work_fn
RIP: 0010:try_to_unmap_one+0x16a/0xfc0
 rmap_walk_anon+0xda/0x1f0
 try_to_unmap+0x78/0x80
 ? __pfx_try_to_unmap_one+0x10/0x10
 ? __pfx_folio_not_mapped+0x10/0x10
 ? __pfx_folio_lock_anon_vma_read+0x10/0x10
 unmap_poisoned_folio+0x60/0x140
 do_migrate_range+0x4d1/0x600
 ? slab_memory_callback+0x6a/0x190
 ? notifier_call_chain+0x56/0xb0
 offline_pages+0x3e6/0x460
 memory_subsys_offline+0x130/0x1f0
 device_offline+0xba/0x110
 acpi_bus_offline+0xb7/0x130
 acpi_scan_hot_remove+0x77/0x290
 acpi_device_hotplug+0x1e0/0x240
 acpi_hotplug_work_fn+0x1a/0x30
 process_one_work+0x186/0x340

Besides, do_migrate_range() may be called between memory_failure()
setting the hwpoison flag and isolating the folio from the LRU, so remove
the WARN_ON(). Everywhere else, unmap_poisoned_folio() is called with the
folio already isolated; follow that in do_migrate_range() too.
[david@redhat.com: don't abort offlining, fixed typo, add comment] Link: https://lkml.kernel.org/r/3c214dff-9649-4015-840f-10de0e03ebe4@redhat.com Fixes: b15c87263a69 ("hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined") Signed-off-by: Jinjiang Tu Signed-off-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Miaohe Lin Cc: Kefeng Wang Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f15af712bc3..74318c787715 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1815,8 +1815,14 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1; if (folio_contain_hwpoisoned_page(folio)) { - if (WARN_ON(folio_test_lru(folio))) - folio_isolate_lru(folio); + /* + * unmap_poisoned_folio() cannot handle large folios + * in all cases yet. + */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + goto put_folio; + if (folio_test_lru(folio) && !folio_isolate_lru(folio)) + goto put_folio; if (folio_mapped(folio)) { folio_lock(folio); unmap_poisoned_folio(folio, pfn, false); From 669602b5b7386e4fa00fc67b045ca3fd816e685d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 24 Aug 2025 16:07:59 +0300 Subject: [PATCH 03/34] init/main.c: fix boot time tracing crash Steven Rostedt reported a crash with "ftrace=function" kernel command line: [ 0.159269] BUG: kernel NULL pointer dereference, address: 000000000000001c [ 0.160254] #PF: supervisor read access in kernel mode [ 0.160975] #PF: error_code(0x0000) - not-present page [ 0.161697] PGD 0 P4D 0 [ 0.162055] Oops: Oops: 0000 [#1] SMP PTI [ 0.162619] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.17.0-rc2-test-00006-g48d06e78b7cb-dirty #9 PREEMPT(undef) [ 0.164141] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 0.165439] RIP: 0010:kmem_cache_alloc_noprof (mm/slub.c:4237) [ 0.166186] Code: 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 41 54 49 89 fc 53 48 83 e4 f0 48 83 ec 20 8b 05 c9 b6 7e 01 <44> 8b 77 1c 65 4c 8b 2d b5 ea 20 02 4c 89 6c 24 18 41 89 f5 21 f0 [ 0.168811] RSP: 0000:ffffffffb2e03b30 EFLAGS: 00010086 [ 0.169545] RAX: 0000000001fff33f RBX: 0000000000000000 RCX: 0000000000000000 [ 0.170544] RDX: 0000000000002800 RSI: 0000000000002800 RDI: 0000000000000000 [ 0.171554] RBP: ffffffffb2e03b80 R08: 0000000000000004 R09: ffffffffb2e03c90 [ 0.172549] R10: ffffffffb2e03c90 R11: 0000000000000000 R12: 0000000000000000 [ 0.173544] R13: ffffffffb2e03c90 R14: ffffffffb2e03c90 R15: 0000000000000001 [ 0.174542] FS: 0000000000000000(0000) GS:ffff9d2808114000(0000) knlGS:0000000000000000 [ 0.175684] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 0.176486] CR2: 000000000000001c CR3: 000000007264c001 CR4: 00000000000200b0 [ 0.177483] Call Trace: [ 0.177828] [ 0.178123] mas_alloc_nodes (lib/maple_tree.c:176 (discriminator 2) lib/maple_tree.c:1255 (discriminator 2)) [ 0.178692] mas_store_gfp (lib/maple_tree.c:5468) [ 0.179223] execmem_cache_add_locked (mm/execmem.c:207) [ 0.179870] execmem_alloc (mm/execmem.c:213 mm/execmem.c:313 mm/execmem.c:335 mm/execmem.c:475) [ 0.180397] ? ftrace_caller (arch/x86/kernel/ftrace_64.S:169) [ 0.180922] ? 
__pfx_ftrace_caller (arch/x86/kernel/ftrace_64.S:158)
[ 0.181517] execmem_alloc_rw (mm/execmem.c:487)
[ 0.182052] arch_ftrace_update_trampoline (arch/x86/kernel/ftrace.c:266 arch/x86/kernel/ftrace.c:344 arch/x86/kernel/ftrace.c:474)
[ 0.182778] ? ftrace_caller_op_ptr (arch/x86/kernel/ftrace_64.S:182)
[ 0.183388] ftrace_update_trampoline (kernel/trace/ftrace.c:7947)
[ 0.184024] __register_ftrace_function (kernel/trace/ftrace.c:368)
[ 0.184682] ftrace_startup (kernel/trace/ftrace.c:3048)
[ 0.185205] ? __pfx_function_trace_call (kernel/trace/trace_functions.c:210)
[ 0.185877] register_ftrace_function_nolock (kernel/trace/ftrace.c:8717)
[ 0.186595] register_ftrace_function (kernel/trace/ftrace.c:8745)
[ 0.187254] ? __pfx_function_trace_call (kernel/trace/trace_functions.c:210)
[ 0.187924] function_trace_init (kernel/trace/trace_functions.c:170)
[ 0.188499] tracing_set_tracer (kernel/trace/trace.c:5916 kernel/trace/trace.c:6349)
[ 0.189088] register_tracer (kernel/trace/trace.c:2391)
[ 0.189642] early_trace_init (kernel/trace/trace.c:11075 kernel/trace/trace.c:11149)
[ 0.190204] start_kernel (init/main.c:970)
[ 0.190732] x86_64_start_reservations (arch/x86/kernel/head64.c:307)
[ 0.191381] x86_64_start_kernel (??:?)
[ 0.191955] common_startup_64 (arch/x86/kernel/head_64.S:419)
[ 0.192534]
[ 0.192839] Modules linked in:
[ 0.193267] CR2: 000000000000001c
[ 0.193730] ---[ end trace 0000000000000000 ]---

The crash happens because on x86 ftrace allocations from execmem require
the maple tree to be initialized.

Move the maple tree initialization, which depends only on slab
availability, earlier in boot so that it happens right after
mm_core_init().

Link: https://lkml.kernel.org/r/20250824130759.1732736-1-rppt@kernel.org
Fixes: 5d79c2be5081 ("x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations")
Signed-off-by: Mike Rapoport (Microsoft)
Reported-by: Steven Rostedt (Google)
Tested-by: Steven Rostedt (Google)
Closes: https://lore.kernel.org/all/20250820184743.0302a8b5@gandalf.local.home/
Reviewed-by: Masami Hiramatsu (Google)
Reviewed-by: Liam R. Howlett
Cc: Borislav Petkov
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
---
 init/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/main.c b/init/main.c
index 0ee0ee7b7c2c..5753e9539ae6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -956,6 +956,7 @@ void start_kernel(void)
 	sort_main_extable();
 	trap_init();
 	mm_core_init();
+	maple_tree_init();
 	poking_init();
 	ftrace_init();
@@ -973,7 +974,6 @@
 		     "Interrupts were enabled *very* early, fixing it\n"))
 		local_irq_disable();
 	radix_tree_init();
-	maple_tree_init();

 	/*
 	 * Set up housekeeping before setting up workqueues to allow the unbound

From 21cc2b5c5062a256ae9064442d37ebbc23f5aef7 Mon Sep 17 00:00:00 2001
From: Jeongjun Park
Date: Sun, 24 Aug 2025 03:21:15 +0900
Subject: [PATCH 04/34] mm/hugetlb: add missing hugetlb_lock in __unmap_hugepage_range()

When restoring a reservation for an anonymous page, we need to check
whether we are freeing a surplus. However, __unmap_hugepage_range()
causes a data race because it reads h->surplus_huge_pages without the
protection of hugetlb_lock.

In addition, adjust_reservation is a boolean variable that indicates
whether the reservation for the anonymous pages in each folio should be
restored, so it should be initialized to false for each round of the
loop. However, the variable is currently only initialized once, where it
is defined.
This means that once adjust_reservation is set to true even once within the loop, reservations for anonymous pages will be restored unconditionally in all subsequent rounds, regardless of the folio's state. To fix this, we need to add the missing hugetlb_lock, unlock the page_table_lock earlier so that we don't lock the hugetlb_lock inside the page_table_lock lock, and initialize adjust_reservation to false on each round within the loop. Link: https://lkml.kernel.org/r/20250823182115.1193563-1-aha310510@gmail.com Fixes: df7a6d1f6405 ("mm/hugetlb: restore the reservation if needed") Signed-off-by: Jeongjun Park Reported-by: syzbot+417aeb05fd190f3a6da9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=417aeb05fd190f3a6da9 Reviewed-by: Sidhartha Kumar Cc: Breno Leitao Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 753f99b4c718..eed59cfb5d21 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5851,7 +5851,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, spinlock_t *ptl; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - bool adjust_reservation = false; + bool adjust_reservation; unsigned long last_addr_mask; bool force_flush = false; @@ -5944,6 +5944,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, sz); hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(folio); + spin_unlock(ptl); /* * Restore the reservation for anonymous page, otherwise the @@ -5951,14 +5952,16 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * If there we are freeing a surplus, do not set the restore * reservation bit. */ + adjust_reservation = false; + + spin_lock_irq(&hugetlb_lock); if (!h->surplus_huge_pages && __vma_private_lock(vma) && folio_test_anon(folio)) { folio_set_hugetlb_restore_reserve(folio); /* Reservation to be adjusted after the spin lock */ adjust_reservation = true; } - - spin_unlock(ptl); + spin_unlock_irq(&hugetlb_lock); /* * Adjust the reservation for the region that will have the From ce652aac9c90a96c6536681d17518efb1f660fb8 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Fri, 22 Aug 2025 11:50:57 +0900 Subject: [PATCH 05/34] mm/damon/core: set quota->charged_from to jiffies at first charge window Kernel initializes the "jiffies" timer as 5 minutes below zero, as shown in include/linux/jiffies.h /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) And jiffies comparison help functions cast unsigned value to signed to cover wraparound #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) When quota->charged_from is initialized to 0, time_after_eq() can incorrectly return FALSE even after reset_interval has elapsed. This occurs when (jiffies - reset_interval) produces a value with MSB=1, which is interpreted as negative in signed arithmetic. This issue primarily affects 32-bit systems because: On 64-bit systems: MSB=1 values occur after ~292 million years from boot (assuming HZ=1000), almost impossible. 
On 32-bit systems: MSB=1 values occur during the first 5 minutes after boot, and the second half of every jiffies wraparound cycle, starting from day 25 (assuming HZ=1000) When above unexpected FALSE return from time_after_eq() occurs, the charging window will not reset. The user impact depends on esz value at that time. If esz is 0, scheme ignores configured quotas and runs without any limits. If esz is not 0, scheme stops working once the quota is exhausted. It remains until the charging window finally resets. So, change quota->charged_from to jiffies at damos_adjust_quota() when it is considered as the first charge window. By this change, we can avoid unexpected FALSE return from time_after_eq() Link: https://lkml.kernel.org/r/20250822025057.1740854-1-ekffu200098@gmail.com Fixes: 2b8a248d5873 ("mm/damon/schemes: implement size quota for schemes application speed control") # 5.16 Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 106ee8b0f2d5..c2e0b469fd43 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2111,6 +2111,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) if (!quota->ms && !quota->sz && list_empty("a->goals)) return; + /* First charge window */ + if (!quota->total_charged_sz && !quota->charged_from) + quota->charged_from = jiffies; + /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies(quota->reset_interval))) { From 711f19dfd783ffb37ca4324388b9c4cb87e71363 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 27 Aug 2025 19:58:57 +0800 Subject: [PATCH 06/34] mm/damon/lru_sort: avoid divide-by-zero in damon_lru_sort_apply_parameters() Patch series "mm/damon: avoid divide-by-zero in DAMON module's parameters application". DAMON's RECLAIM and LRU_SORT modules perform no validation on user-configured parameters during application, which may lead to division-by-zero errors. Avoid the divide-by-zero by adding validation checks when DAMON modules attempt to apply the parameters. This patch (of 2): During the calculation of 'hot_thres' and 'cold_thres', either 'sample_interval' or 'aggr_interval' is used as the divisor, which may lead to division-by-zero errors. Fix it by directly returning -EINVAL when such a case occurs. Additionally, since 'aggr_interval' is already required to be set no smaller than 'sample_interval' in damon_set_attrs(), only the case where 'sample_interval' is zero needs to be checked. 
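The failure mode is ordinary integer division. Below is a minimal
userspace sketch of it (illustrative only: the struct is a cut-down
stand-in and the numbers are invented; the real threshold math lives in
mm/damon/lru_sort.c and differs in detail):

  #include <stdio.h>

  /* Simplified stand-in for the DAMON attributes involved. */
  struct damon_attrs {
          unsigned long sample_interval;  /* microseconds */
          unsigned long aggr_interval;    /* microseconds */
  };

  int main(void)
  {
          struct damon_attrs attrs = { .sample_interval = 0,
                                       .aggr_interval = 100000 };

          /* The added validation: reject before any division happens. */
          if (!attrs.sample_interval) {
                  fprintf(stderr, "EINVAL: sample_interval is zero\n");
                  return 1;
          }
          /* Roughly the shape of the hot_thres derivation; this divide
           * would fault if sample_interval were 0. */
          printf("max_nr_accesses = %lu\n",
                 attrs.aggr_interval / attrs.sample_interval);
          return 0;
  }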
Link: https://lkml.kernel.org/r/20250827115858.1186261-2-yanquanmin1@huawei.com Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 151a9de5ad8b..b5a5ed16a7a5 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -198,6 +198,11 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + if (!damon_lru_sort_mon_attrs.sample_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) goto out; From e6b543ca9806d7bced863f43020e016ee996c057 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 27 Aug 2025 19:58:58 +0800 Subject: [PATCH 07/34] mm/damon/reclaim: avoid divide-by-zero in damon_reclaim_apply_parameters() When creating a new scheme of DAMON_RECLAIM, the calculation of 'min_age_region' uses 'aggr_interval' as the divisor, which may lead to division-by-zero errors. Fix it by directly returning -EINVAL when such a case occurs. Link: https://lkml.kernel.org/r/20250827115858.1186261-3-yanquanmin1@huawei.com Fixes: f5a79d7c0c87 ("mm/damon: introduce struct damos_access_pattern") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [6.1+] Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3c71b4596676..fb7c982a0018 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -194,6 +194,11 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + if (!damon_reclaim_mon_attrs.aggr_interval) { + err = -EINVAL; + goto out; + } + err = damon_set_attrs(param_ctx, &damon_reclaim_mon_attrs); if (err) goto out; From 04d3cd43700a2d0fe4bfb1012a8ec7f2e34a3507 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:21 -0700 Subject: [PATCH 08/34] arm64: kexec: initialize kexec_buf struct in load_other_segments() Patch series "kexec: Fix invalid field access". The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. An initial fix was already landed for arm64[0], and this patchset fixes the problem on the remaining arm64 code and on riscv, as raised by Mark. Discussions about this problem could be found at[1][2]. This patch (of 3): The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. 
This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-0-1df9882bb01a@debian.org Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-1-1df9882bb01a@debian.org Link: https://lore.kernel.org/all/20250826180742.f2471131255ec1c43683ea07@linux-foundation.org/ [0] Link: https://lore.kernel.org/all/oninomspajhxp4omtdapxnckxydbk2nzmrix7rggmpukpnzadw@c67o7njgdgm3/ [1] Link: https://lore.kernel.org/all/20250826-akpm-v1-1-3c831f0e3799@debian.org/ [2] Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Acked-by: Baoquan He Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/arm64/kernel/machine_kexec_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index af1ca875c52c..410060ebd86d 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -94,7 +94,7 @@ int load_other_segments(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline) { - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; void *dtb = NULL; unsigned long initrd_load_addr = 0, dtb_len, orig_segments = image->nr_segments; From 8afbd0045922b8146acf1a78ae818693e0468dbd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:22 -0700 Subject: [PATCH 09/34] riscv: kexec: initialize kexec_buf struct The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. 
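The fix pattern itself is plain C. A minimal sketch of why "= {}" matters
(the struct here is a cut-down stand-in, not the real kexec_buf):

  #include <stdbool.h>
  #include <stdio.h>

  /* Cut-down stand-in for struct kexec_buf. */
  struct kbuf_demo {
          void *image;
          unsigned long buf_min;
          bool random;    /* field added later; old callers never set it */
  };

  int main(void)
  {
          struct kbuf_demo uninit;        /* 'uninit.random' is stack garbage */
          struct kbuf_demo zeroed = {};   /* all members zeroed, incl. 'random' */

          printf("zeroed.random = %d\n", zeroed.random);  /* always 0 */
          /* Reading 'uninit.random' here would be undefined behaviour,
           * which is exactly what UBSAN catches in the kexec path. */
          (void)uninit;
          return 0;
  }

The empty-brace initializer zeroes every member, so later additions to the
structure stay safe even if a caller is never updated to set them.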
Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-2-1df9882bb01a@debian.org Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Baoquan He Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/riscv/kernel/kexec_elf.c | 4 ++-- arch/riscv/kernel/kexec_image.c | 2 +- arch/riscv/kernel/machine_kexec_file.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index 56444c7bd34e..531d348db84d 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -28,7 +28,7 @@ static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, int i; int ret = 0; size_t size; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; const struct elf_phdr *phdr; kbuf.image = image; @@ -66,7 +66,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, { int i; int ret; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; const struct elf_phdr *phdr; unsigned long lowest_paddr = ULONG_MAX; unsigned long lowest_vaddr = ULONG_MAX; diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c index 26a81774a78a..8f2eb900910b 100644 --- a/arch/riscv/kernel/kexec_image.c +++ b/arch/riscv/kernel/kexec_image.c @@ -41,7 +41,7 @@ static void *image_load(struct kimage *image, struct riscv_image_header *h; u64 flags; bool be_image, be_kernel; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; int ret; /* Check Image header */ diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index e36104af2e24..b9eb41b0a975 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -261,7 +261,7 @@ int load_extra_segments(struct kimage *image, unsigned long kernel_start, int ret; void *fdt; unsigned long initrd_pbase = 0UL; - struct kexec_buf kbuf; + struct kexec_buf kbuf = {}; char *modified_cmdline = NULL; kbuf.image = image; From e67f0bd05519012eaabaae68618ffc4ed30ab680 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 27 Aug 2025 03:42:23 -0700 Subject: [PATCH 10/34] s390: kexec: initialize kexec_buf struct The kexec_buf structure was previously declared without initialization. commit bf454ec31add ("kexec_file: allow to place kexec_buf randomly") added a field that is always read but not consistently populated by all architectures. This un-initialized field will contain garbage. This is also triggering a UBSAN warning when the uninitialized data was accessed: ------------[ cut here ]------------ UBSAN: invalid-load in ./include/linux/kexec.h:210:10 load of value 252 is not a valid value for type '_Bool' Zero-initializing kexec_buf at declaration ensures all fields are cleanly set, preventing future instances of uninitialized memory being used. 
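For readers unfamiliar with this class of UBSAN report, the same
diagnostic can be reproduced standalone (a sketch; build with gcc or
clang and -fsanitize=bool):

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
          unsigned char garbage = 252;    /* the value from the report */
          bool b;

          /* Plant a bit pattern that is neither 0 nor 1. */
          memcpy(&b, &garbage, sizeof(b));

          /* UBSAN flags this load: "load of value 252, which is not a
           * valid value for type 'bool'" */
          if (b)
                  puts("branch taken");
          return 0;
  }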
Link: https://lkml.kernel.org/r/20250827-kbuf_all-v1-3-1df9882bb01a@debian.org Fixes: bf454ec31add ("kexec_file: allow to place kexec_buf randomly") Signed-off-by: Breno Leitao Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Baoquan He Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Coiby Xu Cc: Heiko Carstens Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Signed-off-by: Andrew Morton --- arch/s390/kernel/kexec_elf.c | 2 +- arch/s390/kernel/kexec_image.c | 2 +- arch/s390/kernel/machine_kexec_file.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 4d364de43799..143e34a4eca5 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_elf(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index a32ce8bea745..9a439175723c 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -16,7 +16,7 @@ static int kexec_file_add_kernel_image(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; buf.image = image; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index c2bac14dd668..a36d7311c668 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -129,7 +129,7 @@ static int kexec_file_update_purgatory(struct kimage *image, static int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -152,7 +152,7 @@ static int kexec_file_add_purgatory(struct kimage *image, static int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data) { - struct kexec_buf buf; + struct kexec_buf buf = {}; int ret; buf.image = image; @@ -184,7 +184,7 @@ static int kexec_file_add_ipl_report(struct kimage *image, { __u32 *lc_ipl_parmblock_ptr; unsigned int len, ncerts; - struct kexec_buf buf; + struct kexec_buf buf = {}; unsigned long addr; void *ptr, *end; int ret; From 3be306cccdccede13e3cefd0c14e430cc2b7c9c7 Mon Sep 17 00:00:00 2001 From: Kyle Meyer Date: Thu, 28 Aug 2025 13:38:20 -0500 Subject: [PATCH 11/34] mm/memory-failure: fix redundant updates for already poisoned pages Duplicate memory errors can be reported by multiple sources. Passing an already poisoned page to action_result() causes issues: * The amount of hardware corrupted memory is incorrectly updated. * Per NUMA node MF stats are incorrectly updated. * Redundant "already poisoned" messages are printed. Avoid those issues by: * Skipping hardware corrupted memory updates for already poisoned pages. * Skipping per NUMA node MF stats updates for already poisoned pages. * Dropping redundant "already poisoned" messages. Make MF_MSG_ALREADY_POISONED consistent with other action_page_types and make calls to action_result() consistent for already poisoned normal pages and huge pages. 
Link: https://lkml.kernel.org/r/aLCiHMy12Ck3ouwC@hpe.com
Fixes: b8b9488d50b7 ("mm/memory-failure: improve memory failure action_result messages")
Signed-off-by: Kyle Meyer
Reviewed-by: Jiaqi Yan
Acked-by: David Hildenbrand
Reviewed-by: Jane Chu
Acked-by: Miaohe Lin
Cc: Borislav Petkov
Cc: Kyle Meyer
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: "Luck, Tony"
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Naoya Horiguchi
Cc: Oscar Salvador
Cc: Russ Anderson
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 mm/memory-failure.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fc30ca4804bf..10b3c281c2ae 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -956,7 +956,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_BUDDY] = "free buddy page",
 	[MF_MSG_DAX] = "dax page",
 	[MF_MSG_UNSPLIT_THP] = "unsplit thp",
-	[MF_MSG_ALREADY_POISONED] = "already poisoned",
+	[MF_MSG_ALREADY_POISONED] = "already poisoned page",
 	[MF_MSG_UNKNOWN] = "unknown page",
 };
 
@@ -1349,9 +1349,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
 {
 	trace_memory_failure_event(pfn, type, result);
 
-	num_poisoned_pages_inc(pfn);
-
-	update_per_node_mf_stats(pfn, result);
+	if (type != MF_MSG_ALREADY_POISONED) {
+		num_poisoned_pages_inc(pfn);
+		update_per_node_mf_stats(pfn, result);
+	}
 
 	pr_err("%#lx: recovery action for %s: %s\n",
 	       pfn, action_page_types[type], action_name[result]);
@@ -2094,12 +2095,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
 		*hugetlb = 0;
 		return 0;
 	} else if (res == -EHWPOISON) {
-		pr_err("%#lx: already hardware poisoned\n", pfn);
 		if (flags & MF_ACTION_REQUIRED) {
 			folio = page_folio(p);
 			res = kill_accessing_process(current, folio_pfn(folio), flags);
-			action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
 		}
+		action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
 		return res;
 	} else if (res == -EBUSY) {
 		if (!(flags & MF_NO_RETRY)) {
@@ -2285,7 +2285,6 @@ int memory_failure(unsigned long pfn, int flags)
 		goto unlock_mutex;
 
 	if (TestSetPageHWPoison(p)) {
-		pr_err("%#lx: already hardware poisoned\n", pfn);
 		res = -EHWPOISON;
 		if (flags & MF_ACTION_REQUIRED)
 			res = kill_accessing_process(current, pfn, flags);

From 7989fdce69ec0a928e136477da2aa208a191fba2 Mon Sep 17 00:00:00 2001
From: Vlad Dumitrescu
Date: Fri, 22 Aug 2025 15:55:16 -0700
Subject: [PATCH 12/34] percpu: fix race on alloc failed warning limit

The 'allocation failed, ...' warning messages can cause unlimited log
spam, contrary to the implementation's intent.

The warn_limit variable is accessed without synchronization. If more
threads than the remaining warn_limit enter the warning path at the same
time, the variable will get decremented past 0. Once it becomes negative,
the non-zero check will always return true, leading to unlimited log
spam.

Use an atomic operation to access warn_limit and change the condition to
test for non-negative (>= 0) - atomic_dec_if_positive() will return -1
once warn_limit becomes 0. Continue to print the disable message
alongside the last warning.

While the change cited in Fixes is only adjacent, the warning limit
implementation was correct before it. Only non-atomic allocations were
considered for warnings, and those happened to hold pcpu_alloc_mutex
while accessing warn_limit.
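For reference, the semantics relied upon here can be modelled in
userspace with a CAS loop (a sketch of atomic_dec_if_positive() and the
warning path, not the kernel implementation):

  #include <stdatomic.h>
  #include <stdio.h>

  static atomic_int warn_limit = 10;

  /* Decrement only if the result stays >= 0; return the would-be new
   * value, i.e. -1 once the counter has reached 0. */
  static int dec_if_positive(atomic_int *v)
  {
          int old = atomic_load(v);
          int new;

          do {
                  new = old - 1;
                  if (new < 0)
                          break;          /* counter is 0: do not store */
          } while (!atomic_compare_exchange_weak(v, &old, new));

          return new;
  }

  static void maybe_warn(const char *err)
  {
          int remaining = dec_if_positive(&warn_limit);

          if (remaining >= 0) {           /* a slot was actually consumed */
                  fprintf(stderr, "allocation failed, %s\n", err);
                  if (remaining == 0)
                          fprintf(stderr, "limit reached, disable warning\n");
          }
  }

  int main(void)
  {
          for (int i = 0; i < 12; i++)
                  maybe_warn("size=64 align=8 atomic=0");
          return 0;
  }

Because each thread either consumes a slot or sees -1, at most ten
warnings are printed no matter how many threads race.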
[vdumitrescu@nvidia.com: prevent warn_limit from going negative, per Christoph Lameter] Link: https://lkml.kernel.org/r/ee87cc59-2717-4dbb-8052-1d2692c5aaaa@nvidia.com Link: https://lkml.kernel.org/r/ab22061a-a62f-4429-945b-744e5cc4ba35@nvidia.com Fixes: f7d77dfc91f7 ("mm/percpu.c: print error message too if atomic alloc failed") Signed-off-by: Vlad Dumitrescu Reviewed-by: Baoquan He Cc: Christoph Lameter (Ampere) Cc: Dennis Zhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a56f35dcc417..81462ce5866e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1734,7 +1734,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, bool is_atomic; bool do_warn; struct obj_cgroup *objcg = NULL; - static int warn_limit = 10; + static atomic_t warn_limit = ATOMIC_INIT(10); struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; @@ -1904,13 +1904,17 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (do_warn && warn_limit) { - pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); - if (!is_atomic) - dump_stack(); - if (!--warn_limit) - pr_info("limit reached, disable warning\n"); + if (do_warn) { + int remaining = atomic_dec_if_positive(&warn_limit); + + if (remaining >= 0) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + if (!is_atomic) + dump_stack(); + if (remaining == 0) + pr_info("limit reached, disable warning\n"); + } } if (is_atomic) { From 78d2d32f0b789d67cbe5cfea0c0714cb2446c37e Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 28 Aug 2025 14:26:56 +0000 Subject: [PATCH 13/34] mm/mremap: fix regression in vrm->new_addr check Commit 3215eaceca87 ("mm/mremap: refactor initial parameter sanity checks") moved the sanity check for vrm->new_addr from mremap_to() to check_mremap_params(). However, this caused a regression as vrm->new_addr is now checked even when MREMAP_FIXED and MREMAP_DONTUNMAP flags are not specified. In this case, vrm->new_addr can be garbage and create unexpected failures. Fix this by moving the new_addr check after the vrm_implies_new_addr() guard. This ensures that the new_addr is only checked when the user has specified one explicitly. Link: https://lkml.kernel.org/r/20250828142657.770502-1-cmllamas@google.com Fixes: 3215eaceca87 ("mm/mremap: refactor initial parameter sanity checks") Signed-off-by: Carlos Llamas Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Carlos Llamas Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mremap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e618a706aff5..35de0a7b910e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1774,15 +1774,18 @@ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) if (!vrm->new_len) return -EINVAL; - /* Is the new length or address silly? */ - if (vrm->new_len > TASK_SIZE || - vrm->new_addr > TASK_SIZE - vrm->new_len) + /* Is the new length silly? */ + if (vrm->new_len > TASK_SIZE) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; + /* Is the new address silly? 
*/ + if (vrm->new_addr > TASK_SIZE - vrm->new_len) + return -EINVAL; + /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; From d613f53c83ec47089c4e25859d5e8e0359f6f8da Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 28 Aug 2025 10:46:18 +0800 Subject: [PATCH 14/34] mm/memory-failure: fix VM_BUG_ON_PAGE(PagePoisoned(page)) when unpoison memory When I did memory failure tests, below panic occurs: page dumped because: VM_BUG_ON_PAGE(PagePoisoned(page)) kernel BUG at include/linux/page-flags.h:616! Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI CPU: 3 PID: 720 Comm: bash Not tainted 6.10.0-rc1-00195-g148743902568 #40 RIP: 0010:unpoison_memory+0x2f3/0x590 RSP: 0018:ffffa57fc8787d60 EFLAGS: 00000246 RAX: 0000000000000037 RBX: 0000000000000009 RCX: ffff9be25fcdc9c8 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff9be25fcdc9c0 RBP: 0000000000300000 R08: ffffffffb4956f88 R09: 0000000000009ffb R10: 0000000000000284 R11: ffffffffb4926fa0 R12: ffffe6b00c000000 R13: ffff9bdb453dfd00 R14: 0000000000000000 R15: fffffffffffffffe FS: 00007f08f04e4740(0000) GS:ffff9be25fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564787a30410 CR3: 000000010d4e2000 CR4: 00000000000006f0 Call Trace: unpoison_memory+0x2f3/0x590 simple_attr_write_xsigned.constprop.0.isra.0+0xb3/0x110 debugfs_attr_write+0x42/0x60 full_proxy_write+0x5b/0x80 vfs_write+0xd5/0x540 ksys_write+0x64/0xe0 do_syscall_64+0xb9/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f08f0314887 RSP: 002b:00007ffece710078 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000009 RCX: 00007f08f0314887 RDX: 0000000000000009 RSI: 0000564787a30410 RDI: 0000000000000001 RBP: 0000564787a30410 R08: 000000000000fefe R09: 000000007fffffff R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000009 R13: 00007f08f041b780 R14: 00007f08f0417600 R15: 00007f08f0416a00 Modules linked in: hwpoison_inject ---[ end trace 0000000000000000 ]--- RIP: 0010:unpoison_memory+0x2f3/0x590 RSP: 0018:ffffa57fc8787d60 EFLAGS: 00000246 RAX: 0000000000000037 RBX: 0000000000000009 RCX: ffff9be25fcdc9c8 RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff9be25fcdc9c0 RBP: 0000000000300000 R08: ffffffffb4956f88 R09: 0000000000009ffb R10: 0000000000000284 R11: ffffffffb4926fa0 R12: ffffe6b00c000000 R13: ffff9bdb453dfd00 R14: 0000000000000000 R15: fffffffffffffffe FS: 00007f08f04e4740(0000) GS:ffff9be25fcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564787a30410 CR3: 000000010d4e2000 CR4: 00000000000006f0 Kernel panic - not syncing: Fatal exception Kernel Offset: 0x31c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception ]--- The root cause is that unpoison_memory() tries to check the PG_HWPoison flags of an uninitialized page. So VM_BUG_ON_PAGE(PagePoisoned(page)) is triggered. This can be reproduced by below steps: 1.Offline memory block: echo offline > /sys/devices/system/memory/memory12/state 2.Get offlined memory pfn: page-types -b n -rlN 3.Write pfn to unpoison-pfn echo > /sys/kernel/debug/hwpoison/unpoison-pfn This scenario can be identified by pfn_to_online_page() returning NULL. And ZONE_DEVICE pages are never expected, so we can simply fail if pfn_to_online_page() == NULL to fix the bug. 
Link: https://lkml.kernel.org/r/20250828024618.1744895-1-linmiaohe@huawei.com Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton --- mm/memory-failure.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 10b3c281c2ae..df6ee59527dd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2568,10 +2568,9 @@ int unpoison_memory(unsigned long pfn) static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (!pfn_valid(pfn)) - return -ENXIO; - - p = pfn_to_page(pfn); + p = pfn_to_online_page(pfn); + if (!p) + return -EIO; folio = page_folio(p); mutex_lock(&mf_mutex); From 04100f775c2ea501927f508f17ad824ad1f23c8d Mon Sep 17 00:00:00 2001 From: Mark Tinguely Date: Fri, 29 Aug 2025 10:18:15 -0500 Subject: [PATCH 15/34] ocfs2: fix recursive semaphore deadlock in fiemap call syzbot detected a OCFS2 hang due to a recursive semaphore on a FS_IOC_FIEMAP of the extent list on a specially crafted mmap file. context_switch kernel/sched/core.c:5357 [inline] __schedule+0x1798/0x4cc0 kernel/sched/core.c:6961 __schedule_loop kernel/sched/core.c:7043 [inline] schedule+0x165/0x360 kernel/sched/core.c:7058 schedule_preempt_disabled+0x13/0x30 kernel/sched/core.c:7115 rwsem_down_write_slowpath+0x872/0xfe0 kernel/locking/rwsem.c:1185 __down_write_common kernel/locking/rwsem.c:1317 [inline] __down_write kernel/locking/rwsem.c:1326 [inline] down_write+0x1ab/0x1f0 kernel/locking/rwsem.c:1591 ocfs2_page_mkwrite+0x2ff/0xc40 fs/ocfs2/mmap.c:142 do_page_mkwrite+0x14d/0x310 mm/memory.c:3361 wp_page_shared mm/memory.c:3762 [inline] do_wp_page+0x268d/0x5800 mm/memory.c:3981 handle_pte_fault mm/memory.c:6068 [inline] __handle_mm_fault+0x1033/0x5440 mm/memory.c:6195 handle_mm_fault+0x40a/0x8e0 mm/memory.c:6364 do_user_addr_fault+0x764/0x1390 arch/x86/mm/fault.c:1387 handle_page_fault arch/x86/mm/fault.c:1476 [inline] exc_page_fault+0x76/0xf0 arch/x86/mm/fault.c:1532 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:623 RIP: 0010:copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline] RIP: 0010:raw_copy_to_user arch/x86/include/asm/uaccess_64.h:147 [inline] RIP: 0010:_inline_copy_to_user include/linux/uaccess.h:197 [inline] RIP: 0010:_copy_to_user+0x85/0xb0 lib/usercopy.c:26 Code: e8 00 bc f7 fc 4d 39 fc 72 3d 4d 39 ec 77 38 e8 91 b9 f7 fc 4c 89 f7 89 de e8 47 25 5b fd 0f 01 cb 4c 89 ff 48 89 d9 4c 89 f6 a4 0f 1f 00 48 89 cb 0f 01 ca 48 89 d8 5b 41 5c 41 5d 41 5e 41 RSP: 0018:ffffc9000403f950 EFLAGS: 00050256 RAX: ffffffff84c7f101 RBX: 0000000000000038 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffffc9000403f9e0 RDI: 0000200000000060 RBP: ffffc9000403fa90 R08: ffffc9000403fa17 R09: 1ffff92000807f42 R10: dffffc0000000000 R11: fffff52000807f43 R12: 0000200000000098 R13: 00007ffffffff000 R14: ffffc9000403f9e0 R15: 0000200000000060 copy_to_user include/linux/uaccess.h:225 [inline] fiemap_fill_next_extent+0x1c0/0x390 fs/ioctl.c:145 ocfs2_fiemap+0x888/0xc90 fs/ocfs2/extent_map.c:806 ioctl_fiemap fs/ioctl.c:220 [inline] do_vfs_ioctl+0x1173/0x1430 fs/ioctl.c:532 __do_sys_ioctl fs/ioctl.c:596 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:584 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 
0033:0x7f5f13850fd9
RSP: 002b:00007ffe3b3518b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000200000000000 RCX: 00007f5f13850fd9
RDX: 0000200000000040 RSI: 00000000c020660b RDI: 0000000000000004
RBP: 6165627472616568 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe3b3518f0
R13: 00007ffe3b351b18 R14: 431bde82d7b634db R15: 00007f5f1389a03b

ocfs2_fiemap() takes a read lock of the ip_alloc_sem semaphore (since
v2.6.22-527-g7307de80510a) and calls fiemap_fill_next_extent() to read
the extent list of this running mmap executable. The user-supplied
buffer to hold the fiemap information page faults, calling
ocfs2_page_mkwrite(), which will take a write lock (since
v2.6.27-38-g00dc417fa3e7) of the same semaphore. This recursive
semaphore acquisition will hold filesystem locks and causes a hang of
the filesystem.

The ip_alloc_sem protects the inode extent list and size. Release the
read semaphore before calling fiemap_fill_next_extent() in
ocfs2_fiemap() and ocfs2_fiemap_inline(). This does an unnecessary
semaphore lock/unlock on the last extent but simplifies the error path.

Link: https://lkml.kernel.org/r/61d1a62b-2631-4f12-81e2-cd689914360b@oracle.com
Fixes: 00dc417fa3e7 ("ocfs2: fiemap support")
Signed-off-by: Mark Tinguely
Reported-by: syzbot+541dcc6ee768f77103e7@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=541dcc6ee768f77103e7
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Changwei Ge
Cc: Jun Piao
Cc:
Signed-off-by: Andrew Morton
---
 fs/ocfs2/extent_map.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 930150ed5db1..ef147e8b3271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -706,6 +706,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
  * it not only handles the fiemap for inlined files, but also deals
  * with the fast symlink, cause they have no difference for extent
  * mapping per se.
+ *
+ * Must be called with ip_alloc_sem semaphore held.
*/ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, @@ -717,6 +719,7 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, u64 phys; u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; struct ocfs2_inode_info *oi = OCFS2_I(inode); + lockdep_assert_held_read(&oi->ip_alloc_sem); di = (struct ocfs2_dinode *)di_bh->b_data; if (ocfs2_inode_is_fast_symlink(inode)) @@ -732,8 +735,11 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) return ret; } @@ -802,9 +808,11 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; - + /* Release the ip_alloc_sem to prevent deadlock on page fault */ + up_read(&OCFS2_I(inode)->ip_alloc_sem); ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, len_bytes, fe_flags); + down_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret) break; From 79357cd06d41d0f5a11b17d7c86176e395d10ef2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sun, 31 Aug 2025 14:10:58 +0200 Subject: [PATCH 16/34] mm/vmalloc, mm/kasan: respect gfp mask in kasan_populate_vmalloc() kasan_populate_vmalloc() and its helpers ignore the caller's gfp_mask and always allocate memory using the hardcoded GFP_KERNEL flag. This makes them inconsistent with vmalloc(), which was recently extended to support GFP_NOFS and GFP_NOIO allocations. Page table allocations performed during shadow population also ignore the external gfp_mask. To preserve the intended semantics of GFP_NOFS and GFP_NOIO, wrap the apply_to_page_range() calls into the appropriate memalloc scope. xfs calls vmalloc with GFP_NOFS, so this bug could lead to deadlock. There was a report here https://lkml.kernel.org/r/686ea951.050a0220.385921.0016.GAE@google.com This patch: - Extends kasan_populate_vmalloc() and helpers to take gfp_mask; - Passes gfp_mask down to alloc_pages_bulk() and __get_free_page(); - Enforces GFP_NOFS/NOIO semantics with memalloc_*_save()/restore() around apply_to_page_range(); - Updates vmalloc.c and percpu allocator call sites accordingly. 
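The NOFS/NOIO decision added in this patch is a pure bitmask test. A
userspace sketch of the decision table (the flag values are invented for
illustration; the real masks live in include/linux/gfp_types.h):

  #include <stdio.h>

  #define DEMO_GFP_IO     (1u << 0)
  #define DEMO_GFP_FS     (1u << 1)
  #define DEMO_GFP_KERNEL (DEMO_GFP_IO | DEMO_GFP_FS)
  #define DEMO_GFP_NOFS   (DEMO_GFP_IO)   /* I/O allowed, FS recursion not */
  #define DEMO_GFP_NOIO   (0u)            /* no I/O at all */

  /* Mirrors the checks wrapped around apply_to_page_range(). */
  static const char *scope_for(unsigned int gfp)
  {
          if ((gfp & (DEMO_GFP_FS | DEMO_GFP_IO)) == DEMO_GFP_IO)
                  return "memalloc_nofs_save()";
          if ((gfp & (DEMO_GFP_FS | DEMO_GFP_IO)) == 0)
                  return "memalloc_noio_save()";
          return "no scope needed";
  }

  int main(void)
  {
          printf("GFP_KERNEL -> %s\n", scope_for(DEMO_GFP_KERNEL));
          printf("GFP_NOFS   -> %s\n", scope_for(DEMO_GFP_NOFS));
          printf("GFP_NOIO   -> %s\n", scope_for(DEMO_GFP_NOIO));
          return 0;
  }

The scope API is used because page-table allocation deep inside
apply_to_page_range() does not take a gfp mask; the save/restore pair
clamps every allocation in between to the caller's constraints.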
Link: https://lkml.kernel.org/r/20250831121058.92971-1-urezki@gmail.com Fixes: 451769ebb7e7 ("mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: syzbot+3470c9ffee63e4abafeb@syzkaller.appspotmail.com Reviewed-by: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 6 +++--- mm/kasan/shadow.c | 31 ++++++++++++++++++++++++------- mm/vmalloc.c | 8 ++++---- 3 files changed, 31 insertions(+), 14 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b..fe5ce9215821 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -562,7 +562,7 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size); +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, @@ -574,7 +574,7 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } @@ -610,7 +610,7 @@ static __always_inline void kasan_poison_vmalloc(const void *start, static inline void kasan_populate_early_vm_area_shadow(void *start, unsigned long size) { } static inline int kasan_populate_vmalloc(unsigned long start, - unsigned long size) + unsigned long size, gfp_t gfp_mask) { return 0; } diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index e2ceebf737ef..11d472a5c4e8 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -336,13 +336,13 @@ static void ___free_pages_bulk(struct page **pages, int nr_pages) } } -static int ___alloc_pages_bulk(struct page **pages, int nr_pages) +static int ___alloc_pages_bulk(struct page **pages, int nr_pages, gfp_t gfp_mask) { unsigned long nr_populated, nr_total = nr_pages; struct page **page_array = pages; while (nr_pages) { - nr_populated = alloc_pages_bulk(GFP_KERNEL, nr_pages, pages); + nr_populated = alloc_pages_bulk(gfp_mask, nr_pages, pages); if (!nr_populated) { ___free_pages_bulk(page_array, nr_total - nr_pages); return -ENOMEM; @@ -354,25 +354,42 @@ static int ___alloc_pages_bulk(struct page **pages, int nr_pages) return 0; } -static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) +static int __kasan_populate_vmalloc(unsigned long start, unsigned long end, gfp_t gfp_mask) { unsigned long nr_pages, nr_total = PFN_UP(end - start); struct vmalloc_populate_data data; + unsigned int flags; int ret = 0; - data.pages = (struct page **)__get_free_page(GFP_KERNEL | __GFP_ZERO); + data.pages = (struct page **)__get_free_page(gfp_mask | __GFP_ZERO); if (!data.pages) return -ENOMEM; while (nr_total) { nr_pages = min(nr_total, PAGE_SIZE / sizeof(data.pages[0])); - ret = ___alloc_pages_bulk(data.pages, nr_pages); + ret = ___alloc_pages_bulk(data.pages, nr_pages, gfp_mask); if (ret) break; data.start = start; + + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + 
flags = memalloc_noio_save(); + ret = apply_to_page_range(&init_mm, start, nr_pages * PAGE_SIZE, kasan_populate_vmalloc_pte, &data); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + ___free_pages_bulk(data.pages, nr_pages); if (ret) break; @@ -386,7 +403,7 @@ static int __kasan_populate_vmalloc(unsigned long start, unsigned long end) return ret; } -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask) { unsigned long shadow_start, shadow_end; int ret; @@ -415,7 +432,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) shadow_start = PAGE_ALIGN_DOWN(shadow_start); shadow_end = PAGE_ALIGN(shadow_end); - ret = __kasan_populate_vmalloc(shadow_start, shadow_end); + ret = __kasan_populate_vmalloc(shadow_start, shadow_end, gfp_mask); if (ret) return ret; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..5edd536ba9d2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2026,6 +2026,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); + /* Only reclaim behaviour flags are relevant. */ + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; might_sleep(); /* @@ -2038,8 +2040,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, */ va = node_alloc(size, align, vstart, vend, &addr, &vn_id); if (!va) { - gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -2089,7 +2089,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); - ret = kasan_populate_vmalloc(addr, size); + ret = kasan_populate_vmalloc(addr, size, gfp_mask); if (ret) { free_vmap_area(va); return ERR_PTR(ret); @@ -4826,7 +4826,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, /* populate the kasan shadow space */ for (area = 0; area < nr_vms; area++) { - if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) goto err_free_shadow; } From 3fac212fe489aa0dbe8d80a42a7809840ca7b0f9 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 2 Sep 2025 15:49:26 -0700 Subject: [PATCH 17/34] compiler-clang.h: define __SANITIZE_*__ macros only when undefined Clang 22 recently added support for defining __SANITIZE__ macros similar to GCC [1], which causes warnings (or errors with CONFIG_WERROR=y or W=e) with the existing defines that the kernel creates to emulate this behavior with existing clang versions. In file included from :3: In file included from include/linux/compiler_types.h:171: include/linux/compiler-clang.h:37:9: error: '__SANITIZE_THREAD__' macro redefined [-Werror,-Wmacro-redefined] 37 | #define __SANITIZE_THREAD__ | ^ :352:9: note: previous definition is here 352 | #define __SANITIZE_THREAD__ 1 | ^ Refactor compiler-clang.h to only define the sanitizer macros when they are undefined and adjust the rest of the code to use these macros for checking if the sanitizers are enabled, clearing up the warnings and allowing the kernel to easily drop these defines when the minimum supported version of LLVM for building the kernel becomes 22.0.0 or newer. 
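The guard idiom generalizes beyond this header. A standalone sketch
(compile with clang -fsanitize=thread, or with gcc, to see both paths;
__has_feature is Clang's mechanism, while GCC defines __SANITIZE_THREAD__
natively):

  #include <stdio.h>

  #if defined(__has_feature)
  # if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__)
  #  define __SANITIZE_THREAD__   /* emulate GCC; the !defined() guard
                                     avoids -Wmacro-redefined on Clang 22+ */
  # endif
  #endif

  int main(void)
  {
  #ifdef __SANITIZE_THREAD__
          puts("TSan instrumented");
  #else
          puts("not instrumented");
  #endif
          return 0;
  }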
Link: https://lkml.kernel.org/r/20250902-clang-update-sanitize-defines-v1-1-cf3702ca3d92@kernel.org
Link: https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c [1]
Signed-off-by: Nathan Chancellor
Reviewed-by: Justin Stitt
Cc: Alexander Potapenko
Cc: Andrey Konovalov
Cc: Andrey Ryabinin
Cc: Bill Wendling
Cc: Dmitriy Vyukov
Cc: Marco Elver
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/compiler-clang.h | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index fa4ffe037bc7..8720a0705900 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -18,23 +18,42 @@
 #define KASAN_ABI_VERSION 5
 
 /*
+ * Clang 22 added preprocessor macros to match GCC, in hopes of eventually
+ * dropping __has_feature support for sanitizers:
+ * https://github.com/llvm/llvm-project/commit/568c23bbd3303518c5056d7f03444dae4fdc8a9c
+ * Create these macros for older versions of clang so that it is easy to clean
+ * up once the minimum supported version of LLVM for building the kernel always
+ * creates these macros.
+ *
  * Note: Checking __has_feature(*_sanitizer) is only true if the feature is
 * enabled. Therefore it is not required to additionally check defined(CONFIG_*)
 * to avoid adding redundant attributes in other configurations.
 */
-
-#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
-/* Emulate GCC's __SANITIZE_ADDRESS__ flag */
+#if __has_feature(address_sanitizer) && !defined(__SANITIZE_ADDRESS__)
 #define __SANITIZE_ADDRESS__
+#endif
+#if __has_feature(hwaddress_sanitizer) && !defined(__SANITIZE_HWADDRESS__)
+#define __SANITIZE_HWADDRESS__
+#endif
+#if __has_feature(thread_sanitizer) && !defined(__SANITIZE_THREAD__)
+#define __SANITIZE_THREAD__
+#endif
+
+/*
+ * Treat __SANITIZE_HWADDRESS__ the same as __SANITIZE_ADDRESS__ in the kernel.
+ */
+#ifdef __SANITIZE_HWADDRESS__
+#define __SANITIZE_ADDRESS__
+#endif
+
+#ifdef __SANITIZE_ADDRESS__
 #define __no_sanitize_address \
 	__attribute__((no_sanitize("address", "hwaddress")))
 #else
 #define __no_sanitize_address
 #endif
 
-#if __has_feature(thread_sanitizer)
-/* emulate gcc's __SANITIZE_THREAD__ flag */
-#define __SANITIZE_THREAD__
+#ifdef __SANITIZE_THREAD__
 #define __no_sanitize_thread \
 	__attribute__((no_sanitize("thread")))
 #else

From 0ce9398aa0830f15f92bbed73853f9861c3e74ff Mon Sep 17 00:00:00 2001
From: wangzijie
Date: Thu, 4 Sep 2025 21:57:15 +0800
Subject: [PATCH 18/34] proc: fix type confusion in pde_set_flags()

Commit 2ce3d282bd50 ("proc: fix missing pde_set_flags() for net proc
files") missed a key part in the definition of proc_dir_entry:

	union {
		const struct proc_ops *proc_ops;
		const struct file_operations *proc_dir_ops;
	};

So dereferencing ->proc_ops on the assumption that it is a proc_ops
structure results in type confusion, and makes the NULL check for
'proc_ops' not work for proc directories. Add a !S_ISDIR(dp->mode) test
before calling pde_set_flags() to fix it.
Link: https://lkml.kernel.org/r/20250904135715.3972782-1-wangzijie1@honor.com Fixes: 2ce3d282bd50 ("proc: fix missing pde_set_flags() for net proc files") Signed-off-by: wangzijie Reported-by: Brad Spengler Closes: https://lore.kernel.org/all/20250903065758.3678537-1-wangzijie1@honor.com/ Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian Brauner Cc: Jiri Slaby Cc: Stefano Brivio Cc: Signed-off-by: Andrew Morton --- fs/proc/generic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bd0c099cfdd2..176281112273 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -393,7 +393,8 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; - pde_set_flags(dp); + if (!S_ISDIR(dp->mode)) + pde_set_flags(dp); write_lock(&proc_subdir_lock); dp->parent = dir; From 3260a3f0828e06f5f13fac69fb1999a6d60d9cff Mon Sep 17 00:00:00 2001 From: Stanislav Fort Date: Fri, 5 Sep 2025 13:10:46 +0300 Subject: [PATCH 19/34] mm/damon/sysfs: fix use-after-free in state_show() state_show() reads kdamond->damon_ctx without holding damon_sysfs_lock. This allows a use-after-free race: CPU 0 CPU 1 ----- ----- state_show() damon_sysfs_turn_damon_on() ctx = kdamond->damon_ctx; mutex_lock(&damon_sysfs_lock); damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; mutex_unlock(&damon_sysfs_lock); damon_is_running(ctx); /* ctx is freed */ mutex_lock(&ctx->kdamond_lock); /* UAF */ (The race can also occur with damon_sysfs_kdamonds_rm_dirs() and damon_sysfs_kdamond_release(), which free or replace the context under damon_sysfs_lock.) Fix by taking damon_sysfs_lock before dereferencing the context, mirroring the locking used in pid_show(). The bug has existed since state_show() first accessed kdamond->damon_ctx. Link: https://lkml.kernel.org/r/20250905101046.2288-1-disclosure@aisle.com Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring") Signed-off-by: Stanislav Fort Reported-by: Stanislav Fort Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cb..7b9254cadd5f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1260,14 +1260,18 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); - struct damon_ctx *ctx = kdamond->damon_ctx; - bool running; + struct damon_ctx *ctx; + bool running = false; - if (!ctx) - running = false; - else + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + + ctx = kdamond->damon_ctx; + if (ctx) running = damon_is_running(ctx); + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%s\n", running ? damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); From a68172d95c2845d2b5455b072b4ff51ba32650e9 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Fri, 5 Sep 2025 12:15:57 +0300 Subject: [PATCH 20/34] MAINTAINERS: add tree entry to numa memblocks and emulation block Link: https://lkml.kernel.org/r/20250905091557.3529937-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Vlastimil Babka Acked-by: Lorenzo Stoakes Acked-by: Liam R. 
Howlett Cc: David Hildenbrand Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..fbdbf7c012a0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16127,6 +16127,7 @@ M: Andrew Morton M: Mike Rapoport L: linux-mm@kvack.org S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git F: include/linux/numa_memblks.h F: mm/numa.c F: mm/numa_emulation.c From 98c6d259319ecf6e8d027abd3f14b81324b8c0ad Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:15:03 -0700 Subject: [PATCH 21/34] mm/gup: check ref_count instead of lru before migration Patch series "mm: better GUP pin lru_add_drain_all()", v2. Series of lru_add_drain_all()-related patches, arising from recent mm/gup migration report from Will Deacon. This patch (of 5): Will Deacon reports:- When taking a longterm GUP pin via pin_user_pages(), __gup_longterm_locked() tries to migrate target folios that should not be longterm pinned, for example because they reside in a CMA region or movable zone. This is done by first pinning all of the target folios anyway, collecting all of the longterm-unpinnable target folios into a list, dropping the pins that were just taken and finally handing the list off to migrate_pages() for the actual migration. It is critically important that no unexpected references are held on the folios being migrated, otherwise the migration will fail and pin_user_pages() will return -ENOMEM to its caller. Unfortunately, it is relatively easy to observe migration failures when running pKVM (which uses pin_user_pages() on crosvm's virtual address space to resolve stage-2 page faults from the guest) on a 6.15-based Pixel 6 device and this results in the VM terminating prematurely. In the failure case, 'crosvm' has called mlock(MLOCK_ONFAULT) on its mapping of guest memory prior to the pinning. Subsequently, when pin_user_pages() walks the page-table, the relevant 'pte' is not present and so the faulting logic allocates a new folio, mlocks it with mlock_folio() and maps it in the page-table. Since commit 2fbb0c10d1e8 ("mm/munlock: mlock_page() munlock_page() batch by pagevec"), mlock/munlock operations on a folio (formerly page), are deferred. For example, mlock_folio() takes an additional reference on the target folio before placing it into a per-cpu 'folio_batch' for later processing by mlock_folio_batch(), which drops the refcount once the operation is complete. Processing of the batches is coupled with the LRU batch logic and can be forcefully drained with lru_add_drain_all() but as long as a folio remains unprocessed on the batch, its refcount will be elevated. This deferred batching therefore interacts poorly with the pKVM pinning scenario as we can find ourselves in a situation where the migration code fails to migrate a folio due to the elevated refcount from the pending mlock operation. Hugh Dickins adds:- !folio_test_lru() has never been a very reliable way to tell if an lru_add_drain_all() is worth calling, to remove LRU cache references to make the folio migratable: the LRU flag may be set even while the folio is held with an extra reference in a per-CPU LRU cache. 5.18 commit 2fbb0c10d1e8 may have made it more unreliable. 
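In code terms, the replacement check (see the mm/gup.c hunk at the end
of this patch) boils down to the following sketch;
folio_may_have_extra_ref() is shorthand for this write-up, not a kernel
function:

	/*
	 * A folio parked in a per-CPU LRU (or mlock) batch holds one
	 * extra reference, whether or not its LRU flag is set.  Compare
	 * the actual refcount against the expected count plus the one
	 * pin GUP itself took: any surplus means a drain might help.
	 */
	static bool folio_may_have_extra_ref(struct folio *folio)
	{
		return folio_ref_count(folio) !=
			folio_expected_ref_count(folio) + 1;
	}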
Then 6.11 commit 33dfe9204f29 ("mm/gup: clear the LRU flag of a page before adding to LRU batch") tried to make it reliable, by moving LRU flag clearing; but missed the mlock/munlock batches, so still unreliable as reported. And it turns out to be difficult to extend 33dfe9204f29's LRU flag clearing to the mlock/munlock batches: if they do benefit from batching, mlock/munlock cannot be so effective when easily suppressed while !LRU. Instead, switch to an expected ref_count check, which was more reliable all along: some more false positives (unhelpful drains) than before, and never a guarantee that the folio will prove migratable, but better. Note on PG_private_2: ceph and nfs are still using the deprecated PG_private_2 flag, with the aid of netfs and filemap support functions. Although it is consistently matched by an increment of folio ref_count, folio_expected_ref_count() intentionally does not recognize it, and ceph folio migration currently depends on that for PG_private_2 folios to be rejected. New references to the deprecated flag are discouraged, so do not add it into the collect_longterm_unpinnable_folios() calculation: but longterm pinning of transiently PG_private_2 ceph and nfs folios (an uncommon case) may invoke a redundant lru_add_drain_all(). And this makes easy the backport to earlier releases: up to and including 6.12, btrfs also used PG_private_2, but without a ref_count increment. Note for stable backports: requires 6.16 commit 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation"). Link: https://lkml.kernel.org/r/41395944-b0e3-c3ac-d648-8ddd70451d28@google.com Link: https://lkml.kernel.org/r/bd1f314a-fca1-8f19-cac0-b936c9614557@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Reported-by: Will Deacon Closes: https://lore.kernel.org/linux-mm/20250815101858.24352-1-will@kernel.org/ Acked-by: Kiryl Shutsemau Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index adffe663594d..82aec6443c0a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,7 +2307,8 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (!folio_test_lru(folio) && drain_allow) { + if (drain_allow && folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); drain_allow = false; } From a09a8a1fbb374e0053b97306da9dbc05bd384685 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:16:53 -0700 Subject: [PATCH 22/34] mm/gup: local lru_add_drain() to avoid lru_add_drain_all() In many cases, if collect_longterm_unpinnable_folios() does need to drain the LRU cache to release a reference, the cache in question is on this same CPU, and much more efficiently drained by a preliminary local lru_add_drain(), than the later cross-CPU lru_add_drain_all(). Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Note for clean backports: can take 6.16 commit a03db236aebf ("gup: optimize longterm pin_user_pages() for large folio") first. 
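Schematically, the escalation added by the hunk below reads as follows
(reusing the folio_may_have_extra_ref() shorthand from the previous
patch's description; the kernel code open-codes the refcount
comparison):

	if (drained == 0 && folio_may_have_extra_ref(folio)) {
		lru_add_drain();	/* drain this CPU only: cheap */
		drained = 1;
	}
	if (drained == 1 && folio_may_have_extra_ref(folio)) {
		lru_add_drain_all();	/* drain every CPU: costly, at most once */
		drained = 2;
	}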
Link: https://lkml.kernel.org/r/66f2751f-283e-816d-9530-765db7edc465@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 82aec6443c0a..b47066a54f52 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2287,8 +2287,8 @@ static unsigned long collect_longterm_unpinnable_folios( struct pages_or_folios *pofs) { unsigned long collected = 0; - bool drain_allow = true; struct folio *folio; + int drained = 0; long i = 0; for (folio = pofs_get_folio(pofs, i); folio; @@ -2307,10 +2307,17 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drain_allow && folio_ref_count(folio) != - folio_expected_ref_count(folio) + 1) { + if (drained == 0 && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { + lru_add_drain(); + drained = 1; + } + if (drained == 1 && + folio_ref_count(folio) != + folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); - drain_allow = false; + drained = 2; } if (!folio_isolate_lru(folio)) From afb99e9f500485160f34b8cad6d3763ada3e80e8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:19:17 -0700 Subject: [PATCH 23/34] mm: revert "mm/gup: clear the LRU flag of a page before adding to LRU batch" This reverts commit 33dfe9204f29: now that collect_longterm_unpinnable_folios() is checking ref_count instead of lru, and mlock/munlock do not participate in the revised LRU flag clearing, those changes are misleading, and enlarge the window during which mlock/munlock may miss an mlock_count update. It is possible (I'd hesitate to claim probable) that the greater likelihood of missed mlock_count updates would explain the "Realtime threads delayed due to kcompactd0" observed on 6.12 in the Link below. If that is the case, this reversion will help; but a complete solution needs also a further patch, beyond the scope of this series. Included some 80-column cleanup around folio_batch_add_and_move(). The role of folio_test_clear_lru() (before taking per-memcg lru_lock) is questionable since 6.13 removed mem_cgroup_move_account() etc; but perhaps there are still some races which need it - not examined here. 
Link: https://lore.kernel.org/linux-mm/DU0PR01MB10385345F7153F334100981888259A@DU0PR01MB10385.eurprd01.prod.exchangelabs.com/ Link: https://lkml.kernel.org/r/05905d7b-ed14-68b1-79d8-bdec30367eba@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/swap.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3632dd061beb..6ae2d5680574 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -164,6 +164,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add && !folio_test_clear_lru(folio)) + continue; + folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); @@ -176,14 +180,10 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, - struct folio *folio, move_fn_t move_fn, - bool on_lru, bool disable_irq) + struct folio *folio, move_fn_t move_fn, bool disable_irq) { unsigned long flags; - if (on_lru && !folio_test_clear_lru(folio)) - return; - folio_get(folio); if (disable_irq) @@ -191,8 +191,8 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, else local_lock(&cpu_fbatches.lock); - if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || - lru_cache_disabled()) + if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || + folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) @@ -201,13 +201,13 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_unlock(&cpu_fbatches.lock); } -#define folio_batch_add_and_move(folio, op, on_lru) \ - __folio_batch_add_and_move( \ - &cpu_fbatches.op, \ - folio, \ - op, \ - on_lru, \ - offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ +#define folio_batch_add_and_move(folio, op) \ + __folio_batch_add_and_move( \ + &cpu_fbatches.op, \ + folio, \ + op, \ + offsetof(struct cpu_fbatches, op) >= \ + offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) @@ -231,10 +231,10 @@ static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || - folio_test_unevictable(folio)) + folio_test_unevictable(folio) || !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_move_tail, true); + folio_batch_add_and_move(folio, lru_move_tail); } void lru_note_cost_unlock_irq(struct lruvec *lruvec, bool file, @@ -328,10 +328,11 @@ static void folio_activate_drain(int cpu) void folio_activate(struct folio *folio) { - if (folio_test_active(folio) || folio_test_unevictable(folio)) + if (folio_test_active(folio) || folio_test_unevictable(folio) || + !folio_test_lru(folio)) return; - folio_batch_add_and_move(folio, lru_activate, 
true); + folio_batch_add_and_move(folio, lru_activate); } #else @@ -507,7 +508,7 @@ void folio_add_lru(struct folio *folio) lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); - folio_batch_add_and_move(folio, lru_add, false); + folio_batch_add_and_move(folio, lru_add); } EXPORT_SYMBOL(folio_add_lru); @@ -685,13 +686,13 @@ void lru_add_drain_cpu(int cpu) void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate_file, true); + folio_batch_add_and_move(folio, lru_deactivate_file); } /* @@ -704,13 +705,13 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio)) + if (folio_test_unevictable(folio) || !folio_test_lru(folio)) return; if (lru_gen_enabled() ? lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; - folio_batch_add_and_move(folio, lru_deactivate, true); + folio_batch_add_and_move(folio, lru_deactivate); } /** @@ -723,10 +724,11 @@ void folio_deactivate(struct folio *folio) void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || + !folio_test_lru(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; - folio_batch_add_and_move(folio, lru_lazyfree, true); + folio_batch_add_and_move(folio, lru_lazyfree); } void lru_add_drain(void) From 8d79ed36bfc83d0583ab72216b7980340478cdfb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:21:12 -0700 Subject: [PATCH 24/34] mm: revert "mm: vmscan.c: fix OOM on swap stress test" This reverts commit 0885ef470560: that was a fix to the reverted 33dfe9204f29b415bbc0abb1a50642d1ba94f5e9. Link: https://lkml.kernel.org/r/aa0e9d67-fbcd-9d79-88a1-641dfbe1d9d1@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a48aec8bfd92..674999999cd0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4507,7 +4507,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (!folio_test_lru(folio) || zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; From 2da6de30e60dd9bb14600eff1cc99df2fa2ddae3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:23:15 -0700 Subject: [PATCH 25/34] mm: folio_may_be_lru_cached() unless folio_test_large() mm/swap.c and mm/mlock.c agree to drain any per-CPU batch as soon as a large folio is added: so collect_longterm_unpinnable_folios() just wastes effort when calling lru_add_drain[_all]() on a large folio. 
But although there is good reason not to batch up PMD-sized folios, we might well benefit from batching a small number of low-order mTHPs (though unclear how that "small number" limitation will be implemented). So ask if folio_may_be_lru_cached() rather than !folio_test_large(), to insulate those particular checks from future change. Name preferred to "folio_is_batchable" because large folios can well be put on a batch: it's just the per-CPU LRU caches, drained much later, which need care. Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". Link: https://lkml.kernel.org/r/57d2eaf8-3607-f318-e0c5-be02dce61ad0@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 ++++++++++ mm/gup.c | 4 ++-- mm/mlock.c | 6 +++--- mm/swap.c | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..7012a0f758d8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. 
+ */
+	return !folio_test_large(folio);
+}
+
 extern atomic_t lru_disable_count;
 
 static inline bool lru_cache_disabled(void)
diff --git a/mm/gup.c b/mm/gup.c
index b47066a54f52..0bc4d140fc07 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2307,13 +2307,13 @@ static unsigned long collect_longterm_unpinnable_folios(
 			continue;
 		}
 
-		if (drained == 0 &&
+		if (drained == 0 && folio_may_be_lru_cached(folio) &&
 		    folio_ref_count(folio) !=
 		    folio_expected_ref_count(folio) + 1) {
 			lru_add_drain();
 			drained = 1;
 		}
-		if (drained == 1 &&
+		if (drained == 1 && folio_may_be_lru_cached(folio) &&
 		    folio_ref_count(folio) !=
 		    folio_expected_ref_count(folio) + 1) {
 			lru_add_drain_all();
diff --git a/mm/mlock.c b/mm/mlock.c
index a1d93ad33c6d..bb0776f5ef7c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio)
 
 	folio_get(folio);
 	if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
-	    folio_test_large(folio) || lru_cache_disabled())
+	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
 }
@@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio)
 
 	folio_get(folio);
 	if (!folio_batch_add(fbatch, mlock_new(folio)) ||
-	    folio_test_large(folio) || lru_cache_disabled())
+	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
 }
@@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio)
 	 */
 	folio_get(folio);
 	if (!folio_batch_add(fbatch, folio) ||
-	    folio_test_large(folio) || lru_cache_disabled())
+	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 6ae2d5680574..b74ebe865dd9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -192,7 +192,7 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
 		local_lock(&cpu_fbatches.lock);
 
 	if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
-	    folio_test_large(folio) || lru_cache_disabled())
+	    !folio_may_be_lru_cached(folio) || lru_cache_disabled())
 		folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
 
 	if (disable_irq)

From e6a0deb6fa5b0fc134ee2aa127d1cfc9456d8445 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 8 Sep 2025 13:15:12 -0700
Subject: [PATCH 26/34] mm/damon/core: introduce
 damon_call_control->dealloc_on_cancel

Patch series "mm/damon/sysfs: fix refresh_ms control overwriting on
multi-kdamonds usages".

The automatic essential DAMON/DAMOS status update feature of the DAMON
sysfs interface (refresh_ms) is broken [1] for the multiple DAMON
contexts (kdamonds) use case, since it uses a single global
damon_call_control object for all created DAMON contexts.  The fields
of the object, particularly the list field, are over-written by the
contexts, causing unexpected results including user-space hangups and
kernel crashes [2].  Fix it by extending damon_call_control for the use
case, and by updating the usage in the DAMON sysfs interface to use a
per-context, dynamically allocated damon_call_control object.

This patch (of 2):

When damon_call_control->repeat is set, damon_call() is executed
asynchronously, and is eventually canceled when kdamond finishes.  If
the damon_call_control object is dynamically allocated, finding the
place to deallocate the object is difficult.  Introduce a new
damon_call_control field, namely dealloc_on_cancel, to ask the kdamond
to deallocate such dynamically allocated objects when they are
canceled.
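A sketch of the resulting calling convention, condensed from the sysfs
patch that follows (my_repeat_fn and my_data are placeholders):

	struct damon_call_control *control;

	control = kmalloc(sizeof(*control), GFP_KERNEL);
	if (!control)
		return -ENOMEM;
	control->fn = my_repeat_fn;		/* invoked repeatedly by kdamond */
	control->data = my_data;
	control->repeat = true;
	control->dealloc_on_cancel = true;	/* kdamond kfree()s it on cancel */
	damon_call(ctx, control);		/* caller must not free it afterwards */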
Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org
Link: https://lkml.kernel.org/r/20250908201513.60802-2-sj@kernel.org
Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work")
Signed-off-by: SeongJae Park
Cc: Yunjeong Mun
Signed-off-by: Andrew Morton
---
 include/linux/damon.h | 2 ++
 mm/damon/core.c       | 8 ++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/damon.h b/include/linux/damon.h
index f13664c62ddd..9e62b2a85538 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -636,6 +636,7 @@ struct damon_operations {
  * @data:	Data that will be passed to @fn.
  * @repeat:	Repeat invocations.
  * @return_code:	Return code from @fn invocation.
+ * @dealloc_on_cancel:	De-allocate when canceled.
  *
  * Control damon_call(), which requests specific kdamond to invoke a given
  * function. Refer to damon_call() for more details.
@@ -645,6 +646,7 @@ struct damon_call_control {
 	void *data;
 	bool repeat;
 	int return_code;
+	bool dealloc_on_cancel;
 	/* private: internal use only */
 	/* informs if the kdamond finished handling of the request */
 	struct completion completion;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c2e0b469fd43..08065b363972 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -2479,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel)
 		mutex_lock(&ctx->call_controls_lock);
 		list_del(&control->list);
 		mutex_unlock(&ctx->call_controls_lock);
-		if (!control->repeat)
+		if (!control->repeat) {
 			complete(&control->completion);
-		else
+		} else if (control->canceled && control->dealloc_on_cancel) {
+			kfree(control);
+			continue;
+		} else {
 			list_add(&control->list, &repeat_controls);
+		}
 	}
 	control = list_first_entry_or_null(&repeat_controls,
 			struct damon_call_control, list);

From 04a06b139ec08aa63d7377f6d3e5218f8ddb1c5d Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 8 Sep 2025 13:15:13 -0700
Subject: [PATCH 27/34] mm/damon/sysfs: use dynamically allocated repeat mode
 damon_call_control

The DAMON sysfs interface is using a single global repeat mode
damon_call_control variable for refresh_ms handling, for all DAMON
contexts.  As a result, when there is more than one context, the single
global damon_call_control is unexpectedly over-written (corrupted).
Particularly the ->list field is overwritten by the multiple contexts,
and this can cause a user-space hangup and/or a kernel crash.

Fix it by using a dynamically allocated damon_call_control object per
DAMON context.
Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com [1] Link: https://lore.kernel.org/20250905035411.39501-1-sj@kernel.org [2] Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Reported-by: Yunjeong Mun Closes: https://lore.kernel.org/20250904011738.930-1-yunjeong.mun@sk.com Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7b9254cadd5f..c96c2154128f 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1534,14 +1534,10 @@ static int damon_sysfs_repeat_call_fn(void *data) return 0; } -static struct damon_call_control damon_sysfs_repeat_call_control = { - .fn = damon_sysfs_repeat_call_fn, - .repeat = true, -}; - static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx; + struct damon_call_control *repeat_call_control; int err; if (damon_sysfs_kdamond_running(kdamond)) @@ -1554,18 +1550,29 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) damon_destroy_ctx(kdamond->damon_ctx); kdamond->damon_ctx = NULL; + repeat_call_control = kmalloc(sizeof(*repeat_call_control), + GFP_KERNEL); + if (!repeat_call_control) + return -ENOMEM; + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); - if (IS_ERR(ctx)) + if (IS_ERR(ctx)) { + kfree(repeat_call_control); return PTR_ERR(ctx); + } err = damon_start(&ctx, 1, false); if (err) { + kfree(repeat_call_control); damon_destroy_ctx(ctx); return err; } kdamond->damon_ctx = ctx; - damon_sysfs_repeat_call_control.data = kdamond; - damon_call(ctx, &damon_sysfs_repeat_call_control); + repeat_call_control->fn = damon_sysfs_repeat_call_fn; + repeat_call_control->data = kdamond; + repeat_call_control->repeat = true; + repeat_call_control->dealloc_on_cancel = true; + damon_call(ctx, repeat_call_control); return err; } From 615cd3705d204680f4ae8d0ad0dec8b778dc2753 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 8 Sep 2025 20:49:59 +0100 Subject: [PATCH 28/34] MAINTAINERS: add Jann Horn as rmap reviewer Jann has been an excellent contributor in all areas of memory management, and has demonstrated great expertise in the reverse mapping. It's therefore appropriate for him to become a reviewer. Link: https://lkml.kernel.org/r/20250908194959.820913-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Harry Yoo Acked-by: SeongJae Park Acked-by: Vlastimil Babka Acked-by: Liam R. Howlett Cc: Jann Horn Cc: Rik van Riel Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fbdbf7c012a0..b71b8c6813aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16195,6 +16195,7 @@ R: Rik van Riel R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo +R: Jann Horn L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h From 72291a5a0ead8686e3cbfd35baa7d7b1cfdbf6ea Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 8 Sep 2025 18:48:57 +0800 Subject: [PATCH 29/34] MAINTAINERS: add Lance Yang as a THP reviewer I've been actively digging into the MM/THP subsystem for over a year now, and there's a real interest in contributing more and getting further involved. 
Well, missing out on any more cool THP things is really a pain ;)

Link: https://lkml.kernel.org/r/20250908104857.35397-1-lance.yang@linux.dev
Signed-off-by: Lance Yang
Acked-by: David Hildenbrand
Acked-by: Lorenzo Stoakes
Acked-by: Zi Yan
Acked-by: Barry Song
Acked-by: Baolin Wang
Cc: Liam R. Howlett
Cc: Nico Pache
Cc: Ryan Roberts
Cc: Dev Jain
Signed-off-by: Andrew Morton
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index b71b8c6813aa..03b433441836 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16240,6 +16240,7 @@ R: Nico Pache
 R: Ryan Roberts
 R: Dev Jain
 R: Barry Song
+R: Lance Yang
 L: linux-mm@kvack.org
 S: Maintained
 W: http://www.linux-mm.org

From f826edeb888c5a8bd1b6e95ae6a50b0db2b21902 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 8 Sep 2025 19:22:36 -0700
Subject: [PATCH 30/34] samples/damon/wsse: avoid starting DAMON before
 initialization

Patch series "samples/damon: fix boot time enable handling fixup merge
mistakes".

The first three patches of the patch series "mm/damon: fix misc bugs in
DAMON modules" [1] tried to fix issues with enabling the DAMON sample
modules at boot time.  The issue is that the modules can crash if they
are enabled before DAMON is initialized, e.g., via boot time parameter
options.  The three patches fixed the issues by avoiding starting DAMON
before the module initialization phase.

However, probably because of a mistake during a merge, only half of the
change was merged, and the part that avoids starting DAMON before the
module is initialized was missed.  So the problem is not solved, and the
modules can still crash if enabled before DAMON is initialized.  Fix
those by applying the unmerged parts again.

Note that the broken commits were merged into 6.17-rc1, but were also
backported to relevant stable kernels.  So this series also needs to be
merged into the stable kernels.  Hence Cc-ing stable@.

This patch (of 3):

Commit 0ed1165c3727 ("samples/damon/wsse: fix boot time enable handling")
somehow applied the original patch [2] incompletely.  It is missing the
part that avoids starting DAMON before module initialization.  Probably a
mistake happened during a merge.  Fix it by applying the missed part
again.

Link: https://lkml.kernel.org/r/20250909022238.2989-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250909022238.2989-2-sj@kernel.org
Link: https://lkml.kernel.org/r/20250706193207.39810-1-sj@kernel.org [1]
Link: https://lore.kernel.org/20250706193207.39810-2-sj@kernel.org [2]
Fixes: 0ed1165c3727 ("samples/damon/wsse: fix boot time enable handling")
Signed-off-by: SeongJae Park
Cc:
Signed-off-by: Andrew Morton
---
 samples/damon/wsse.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c
index da052023b099..21eaf15f987d 100644
--- a/samples/damon/wsse.c
+++ b/samples/damon/wsse.c
@@ -118,6 +118,9 @@ static int damon_sample_wsse_enable_store(
 		return 0;
 
 	if (enabled) {
+		if (!init_called)
+			return 0;
+
 		err = damon_sample_wsse_start();
 		if (err)
 			enabled = false;

From e6b733ca2f99e968d696c2e812c8eb8e090bf37b Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 8 Sep 2025 19:22:37 -0700
Subject: [PATCH 31/34] samples/damon/prcl: avoid starting DAMON before
 initialization

Commit 2780505ec2b4 ("samples/damon/prcl: fix boot time enable crash")
somehow applied the original patch [1] incompletely.  It is missing the
part that avoids starting DAMON before module initialization.  Probably a
mistake happened during a merge.  Fix it by applying the missed part
again.
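The missed part in all three samples is the same guard pattern; a
condensed sketch with placeholder names (init_called is the flag the
real modules use; example_start() stands in for the sample's
damon_sample_*_start()):

	static bool enabled;
	static bool init_called;

	static int example_start(void);	/* stands in for damon_sample_*_start() */

	static int example_enable_store(const char *val,
			const struct kernel_param *kp)
	{
		int err = param_set_bool(val, kp);

		if (err)
			return err;
		/* too early: let module_init() do the actual start */
		if (!init_called)
			return 0;
		return enabled ? example_start() : 0;
	}

	static int __init example_init(void)
	{
		int err = enabled ? example_start() : 0;

		init_called = true;
		return err;
	}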
Link: https://lkml.kernel.org/r/20250909022238.2989-3-sj@kernel.org
Link: https://lore.kernel.org/20250706193207.39810-3-sj@kernel.org [1]
Fixes: 2780505ec2b4 ("samples/damon/prcl: fix boot time enable crash")
Signed-off-by: SeongJae Park
Cc:
Signed-off-by: Andrew Morton
---
 samples/damon/prcl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c
index 1b839c06a612..0226652f94d5 100644
--- a/samples/damon/prcl.c
+++ b/samples/damon/prcl.c
@@ -137,6 +137,9 @@ static int damon_sample_prcl_enable_store(
 	if (enabled == is_enabled)
 		return 0;
 
+	if (!init_called)
+		return 0;
+
 	if (enabled) {
 		err = damon_sample_prcl_start();
 		if (err)

From c62cff40481c037307a13becbda795f7afdcfebd Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 8 Sep 2025 19:22:38 -0700
Subject: [PATCH 32/34] samples/damon/mtier: avoid starting DAMON before
 initialization

Commit 964314344eab ("samples/damon/mtier: support boot time enable
setup") somehow applied the original patch [1] incompletely.  It is
missing the part that avoids starting DAMON before module
initialization.  Probably a mistake happened during a merge.  Fix it by
applying the missed part again.

Link: https://lkml.kernel.org/r/20250909022238.2989-4-sj@kernel.org
Link: https://lore.kernel.org/20250706193207.39810-4-sj@kernel.org [1]
Fixes: 964314344eab ("samples/damon/mtier: support boot time enable setup")
Signed-off-by: SeongJae Park
Cc:
Signed-off-by: Andrew Morton
---
 samples/damon/mtier.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c
index 7ebd352138e4..beaf36657dea 100644
--- a/samples/damon/mtier.c
+++ b/samples/damon/mtier.c
@@ -208,6 +208,9 @@ static int damon_sample_mtier_enable_store(
 	if (enabled == is_enabled)
 		return 0;
 
+	if (!init_called)
+		return 0;
+
 	if (enabled) {
 		err = damon_sample_mtier_start();
 		if (err)

From 025e87f8ea2ae3a28bf1fe2b052bfa412c27ed4a Mon Sep 17 00:00:00 2001
From: Nathan Chancellor
Date: Sat, 6 Sep 2025 23:43:34 +0900
Subject: [PATCH 33/34] nilfs2: fix CFI failure when accessing
 /sys/fs/nilfs2/features/*

When accessing one of the files under /sys/fs/nilfs2/features when
CONFIG_CFI_CLANG is enabled, there is a CFI violation:

  CFI failure at kobj_attr_show+0x59/0x80 (target: nilfs_feature_revision_show+0x0/0x30; expected type: 0xfc392c4d)
  ...
  Call Trace:
    sysfs_kf_seq_show+0x2a6/0x390
    ? __cfi_kobj_attr_show+0x10/0x10
    kernfs_seq_show+0x104/0x15b
    seq_read_iter+0x580/0xe2b
  ...

When the kobject of the kset for /sys/fs/nilfs2 is initialized, its
ktype is set to kset_ktype, which has a ->sysfs_ops of kobj_sysfs_ops.
When nilfs_feature_attr_group is added to that kobject via
sysfs_create_group(), the kernfs_ops of each file is
sysfs_file_kfops_rw, which will call sysfs_kf_seq_show() when
->seq_show() is called.  sysfs_kf_seq_show() in turn calls
kobj_attr_show() through ->sysfs_ops->show().  kobj_attr_show() casts
the provided attribute out to a 'struct kobj_attribute' via
container_of() and calls ->show(), resulting in the CFI violation since
neither nilfs_feature_revision_show() nor nilfs_feature_README_show()
match the prototype of ->show() in 'struct kobj_attribute'.

Resolve the CFI violation by adjusting the second parameter in
nilfs_feature_{revision,README}_show() from 'struct attribute' to
'struct kobj_attribute' to match the expected prototype.
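For reference, the shape of the fix: with CONFIG_CFI_CLANG an indirect
call is only permitted when the callee's type exactly matches the
function pointer type at the call site.  kobj_attr_show() dispatches
through:

	ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf);

so the callback must take a 'struct kobj_attribute', as in the adjusted
nilfs2 handler below:

	static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
						   struct kobj_attribute *attr,
						   char *buf)
	{
		return sysfs_emit(buf, "%d.%d\n",
				NILFS_CURRENT_REV, NILFS_MINOR_REV);
	}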
Link: https://lkml.kernel.org/r/20250906144410.22511-1-konishi.ryusuke@gmail.com
Fixes: aebe17f68444 ("nilfs2: add /sys/fs/nilfs2/features group")
Signed-off-by: Nathan Chancellor
Signed-off-by: Ryusuke Konishi
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202509021646.bc78d9ef-lkp@intel.com/
Cc:
Signed-off-by: Andrew Morton
---
 fs/nilfs2/sysfs.c | 4 ++--
 fs/nilfs2/sysfs.h | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 14868a3dd592..bc52afbfc5c7 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -1075,7 +1075,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
  ************************************************************************/
 
 static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
-					    struct attribute *attr, char *buf)
+					    struct kobj_attribute *attr, char *buf)
 {
 	return sysfs_emit(buf, "%d.%d\n",
 			NILFS_CURRENT_REV, NILFS_MINOR_REV);
@@ -1087,7 +1087,7 @@ static const char features_readme_str[] =
 	"(1) revision\n\tshow current revision of NILFS file system driver.\n";
 
 static ssize_t nilfs_feature_README_show(struct kobject *kobj,
-					 struct attribute *attr,
+					 struct kobj_attribute *attr,
 					 char *buf)
 {
 	return sysfs_emit(buf, features_readme_str);
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
index 78a87a016928..d370cd5cce3f 100644
--- a/fs/nilfs2/sysfs.h
+++ b/fs/nilfs2/sysfs.h
@@ -50,16 +50,16 @@ struct nilfs_sysfs_dev_subgroups {
 	struct completion sg_segments_kobj_unregister;
 };
 
-#define NILFS_COMMON_ATTR_STRUCT(name) \
+#define NILFS_KOBJ_ATTR_STRUCT(name) \
 struct nilfs_##name##_attr { \
 	struct attribute attr; \
-	ssize_t (*show)(struct kobject *, struct attribute *, \
+	ssize_t (*show)(struct kobject *, struct kobj_attribute *, \
 			char *); \
-	ssize_t (*store)(struct kobject *, struct attribute *, \
+	ssize_t (*store)(struct kobject *, struct kobj_attribute *, \
 			const char *, size_t); \
 }
 
-NILFS_COMMON_ATTR_STRUCT(feature);
+NILFS_KOBJ_ATTR_STRUCT(feature);
 
 #define NILFS_DEV_ATTR_STRUCT(name) \
 struct nilfs_##name##_attr { \

From ce4be9e4307c5a60701ff6e0cafa74caffdc54ce Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky
Date: Tue, 9 Sep 2025 13:48:35 +0900
Subject: [PATCH 34/34] zram: fix slot write race condition

Parallel writes to the same zram index can result in leaked zsmalloc
handles.  Schematically, we can have something like this:

	CPU0			CPU1
	zram_slot_lock()
	zs_free(handle)
	zram_slot_unlock()
				zram_slot_lock()
				zs_free(handle)
				zram_slot_unlock()

	compress		compress
	handle = zs_malloc()	handle = zs_malloc()

	zram_slot_lock()
	zram_set_handle(handle)
	zram_slot_unlock()
				zram_slot_lock()
				zram_set_handle(handle)
				zram_slot_unlock()

Either CPU0's or CPU1's zsmalloc handle will leak because its zs_free()
was done too early.  Instead, we need to reset the zram entry right
before we set its new handle, all under the same slot lock scope.
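Condensed, the fix makes each writer's free-and-publish a single
critical section; a simplified sketch of the write path after the patch
(the diff below has the full three call sites):

	zram_slot_lock(zram, index);
	zram_free_page(zram, index);		/* reset the old slot state... */
	zram_set_handle(zram, index, handle);	/* ...then publish the new handle */
	zram_set_obj_size(zram, index, comp_len);
	zram_slot_unlock(zram, index);

With this ordering, the second writer's zram_free_page() frees the first
writer's just-stored handle instead of leaking it.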
Link: https://lkml.kernel.org/r/20250909045150.635345-1-senozhatsky@chromium.org Fixes: 71268035f5d7 ("zram: free slot memory early during write") Signed-off-by: Sergey Senozhatsky Reported-by: Changhui Zhong Closes: https://lore.kernel.org/all/CAGVVp+UtpGoW5WEdEU7uVTtsSCjPN=ksN6EcvyypAtFDOUf30A@mail.gmail.com/ Tested-by: Changhui Zhong Cc: Jens Axboe Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8acad3cc6e6e..f31652085adc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1795,6 +1795,7 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill, u32 index) { zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_SAME); zram_set_handle(zram, index, fill); zram_slot_unlock(zram, index); @@ -1832,6 +1833,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, kunmap_local(src); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_flag(zram, index, ZRAM_HUGE); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, PAGE_SIZE); @@ -1855,11 +1857,6 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) unsigned long element; bool same_filled; - /* First, free memory allocated to this slot (if any) */ - zram_slot_lock(zram, index); - zram_free_page(zram, index); - zram_slot_unlock(zram, index); - mem = kmap_local_page(page); same_filled = page_same_filled(mem, &element); kunmap_local(mem); @@ -1901,6 +1898,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zcomp_stream_put(zstrm); zram_slot_lock(zram, index); + zram_free_page(zram, index); zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); zram_slot_unlock(zram, index);