From ee4d098cbc9160f573b5c1b5a51d6158efdb2896 Mon Sep 17 00:00:00 2001 From: Yin Tirui Date: Tue, 19 Aug 2025 15:55:10 +0800 Subject: [PATCH 01/17] of_numa: fix uninitialized memory nodes causing kernel panic When there are memory-only nodes (nodes without CPUs), these nodes are not properly initialized, causing kernel panic during boot. of_numa_init of_numa_parse_cpu_nodes node_set(nid, numa_nodes_parsed); of_numa_parse_memory_nodes In of_numa_parse_cpu_nodes, numa_nodes_parsed gets updated only for nodes containing CPUs. Memory-only nodes should have been updated in of_numa_parse_memory_nodes, but they weren't. Subsequently, when free_area_init() attempts to access NODE_DATA() for these uninitialized memory nodes, the kernel panics due to NULL pointer dereference. This can be reproduced on ARM64 QEMU with 1 CPU and 2 memory nodes: qemu-system-aarch64 \ -cpu host -nographic \ -m 4G -smp 1 \ -machine virt,accel=kvm,gic-version=3,iommu=smmuv3 \ -object memory-backend-ram,size=2G,id=mem0 \ -object memory-backend-ram,size=2G,id=mem1 \ -numa node,nodeid=0,memdev=mem0 \ -numa node,nodeid=1,memdev=mem1 \ -kernel $IMAGE \ -hda $DISK \ -append "console=ttyAMA0 root=/dev/vda rw earlycon" [ 0.000000] Booting Linux on physical CPU 0x0000000000 [0x481fd010] [ 0.000000] Linux version 6.17.0-rc1-00001-gabb4b3daf18c-dirty (yintirui@local) (gcc (GCC) 12.3.1, GNU ld (GNU Binutils) 2.41) #52 SMP PREEMPT Mon Aug 18 09:49:40 CST 2025 [ 0.000000] KASLR enabled [ 0.000000] random: crng init done [ 0.000000] Machine model: linux,dummy-virt [ 0.000000] efi: UEFI not found. [ 0.000000] earlycon: pl11 at MMIO 0x0000000009000000 (options '') [ 0.000000] printk: legacy bootconsole [pl11] enabled [ 0.000000] OF: reserved mem: Reserved memory: No reserved-memory node in the DT [ 0.000000] NODE_DATA(0) allocated [mem 0xbfffd9c0-0xbfffffff] [ 0.000000] node 1 must be removed before remove section 23 [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000040000000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x000000013fffffff] [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000040000000-0x00000000bfffffff] [ 0.000000] node 1: [mem 0x00000000c0000000-0x000000013fffffff] [ 0.000000] Initmem setup node 0 [mem 0x0000000040000000-0x00000000bfffffff] [ 0.000000] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000a0 [ 0.000000] Mem abort info: [ 0.000000] ESR = 0x0000000096000004 [ 0.000000] EC = 0x25: DABT (current EL), IL = 32 bits [ 0.000000] SET = 0, FnV = 0 [ 0.000000] EA = 0, S1PTW = 0 [ 0.000000] FSC = 0x04: level 0 translation fault [ 0.000000] Data abort info: [ 0.000000] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 0.000000] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 0.000000] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 0.000000] [00000000000000a0] user address but active_mm is swapper [ 0.000000] Internal error: Oops: 0000000096000004 [#1] SMP [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 UID: 0 PID: 0 Comm: swapper Not tainted 6.17.0-rc1-00001-g760c6dabf762-dirty #54 PREEMPT [ 0.000000] Hardware name: linux,dummy-virt (DT) [ 0.000000] pstate: 800000c5 (Nzcv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 0.000000] pc : free_area_init+0x50c/0xf9c [ 0.000000] lr : free_area_init+0x5c0/0xf9c [ 0.000000] sp : ffffa02ca0f33c00 [ 0.000000] x29: ffffa02ca0f33cb0 x28: 0000000000000000 x27: 0000000000000000 [ 0.000000] x26: 4ec4ec4ec4ec4ec5 x25: 00000000000c0000 x24: 00000000000c0000 [ 
0.000000] x23: 0000000000040000 x22: 0000000000000000 x21: ffffa02ca0f3b368 [ 0.000000] x20: ffffa02ca14c7b98 x19: 0000000000000000 x18: 0000000000000002 [ 0.000000] x17: 000000000000cacc x16: 0000000000000001 x15: 0000000000000001 [ 0.000000] x14: 0000000080000000 x13: 0000000000000018 x12: 0000000000000002 [ 0.000000] x11: ffffa02ca0fd4f00 x10: ffffa02ca14bab20 x9 : ffffa02ca14bab38 [ 0.000000] x8 : 00000000000c0000 x7 : 0000000000000001 x6 : 0000000000000002 [ 0.000000] x5 : 0000000140000000 x4 : ffffa02ca0f33c90 x3 : ffffa02ca0f33ca0 [ 0.000000] x2 : ffffa02ca0f33c98 x1 : 0000000080000000 x0 : 0000000000000001 [ 0.000000] Call trace: [ 0.000000] free_area_init+0x50c/0xf9c (P) [ 0.000000] bootmem_init+0x110/0x1dc [ 0.000000] setup_arch+0x278/0x60c [ 0.000000] start_kernel+0x70/0x748 [ 0.000000] __primary_switched+0x88/0x90 [ 0.000000] Code: d503201f b98093e0 52800016 f8607a93 (f9405260) [ 0.000000] ---[ end trace 0000000000000000 ]--- [ 0.000000] Kernel panic - not syncing: Attempted to kill the idle task! [ 0.000000] ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]--- Link: https://lkml.kernel.org/r/20250819075510.2079961-1-yintirui@huawei.com Fixes: 767507654c22 ("arch_numa: switch over to numa_memblks") Signed-off-by: Yin Tirui Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Kefeng Wang Cc: Chen Jun Cc: Dan Williams Cc: Jonathan Cameron Cc: Rob Herring Cc: Saravana Kannan Cc: Signed-off-by: Andrew Morton --- drivers/of/of_numa.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/of/of_numa.c b/drivers/of/of_numa.c index 230d5f628c1b..cd2dc8e825c9 100644 --- a/drivers/of/of_numa.c +++ b/drivers/of/of_numa.c @@ -59,8 +59,11 @@ static int __init of_numa_parse_memory_nodes(void) r = -EINVAL; } - for (i = 0; !r && !of_address_to_resource(np, i, &rsrc); i++) + for (i = 0; !r && !of_address_to_resource(np, i, &rsrc); i++) { r = numa_add_memblk(nid, rsrc.start, rsrc.end + 1); + if (!r) + node_set(nid, numa_nodes_parsed); + } if (!i || r) { of_node_put(np); From 5cc5e030bce2ec97ae5cdb2c1b94a98b1047b3fa Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Tue, 12 Aug 2025 15:26:56 +0200 Subject: [PATCH 02/17] rust: mm: mark VmaNew as transparent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unsafe code in VmaNew's methods assumes that the type has the same layout as the inner `bindings::vm_area_struct`. This is not guaranteed by the default struct representation in Rust, but requires specifying the `transparent` representation. Link: https://lkml.kernel.org/r/20250812132712.61007-1-baptiste.lepers@gmail.com Fixes: dcb81aeab406 ("mm: rust: add VmaNew for f_ops->mmap()") Signed-off-by: Baptiste Lepers Reviewed-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miguel Ojeda Cc: Trevor Gross Cc: Signed-off-by: Andrew Morton --- rust/kernel/mm/virt.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index 6086ca981b06..a1bfa4e19293 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -209,6 +209,7 @@ pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result { /// /// For the duration of 'a, the referenced vma must be undergoing initialization in an /// `f_ops->mmap()` hook.
+#[repr(transparent)] pub struct VmaNew { vma: VmaRef, } From f46e8ef8bb7b452584f2e75337b619ac51a7cadf Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 19 Aug 2025 21:41:02 +0800 Subject: [PATCH 03/17] ocfs2: prevent release journal inode after journal shutdown Before calling ocfs2_delete_osb(), ocfs2_journal_shutdown() has already been executed in ocfs2_dismount_volume(), so osb->journal must be NULL. Therefore, the following call trace will inevitably fail when it reaches jbd2_journal_release_jbd_inode(). ocfs2_dismount_volume()-> ocfs2_delete_osb()-> ocfs2_free_slot_info()-> __ocfs2_free_slot_info()-> evict()-> ocfs2_evict_inode()-> ocfs2_clear_inode()-> jbd2_journal_release_jbd_inode(osb->journal->j_journal, Adding osb->journal checks will prevent the null-ptr-deref during the above execution path. Link: https://lkml.kernel.org/r/tencent_357489BEAEE4AED74CBD67D246DBD2C4C606@qq.com Fixes: da5e7c87827e ("ocfs2: cleanup journal init and shutdown") Signed-off-by: Edward Adam Davis Reported-by: syzbot+47d8cb2f2cc1517e515a@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=47d8cb2f2cc1517e515a Tested-by: syzbot+47d8cb2f2cc1517e515a@syzkaller.appspotmail.com Reviewed-by: Mark Tinguely Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 14bf440ea4df..6c4f78f473fb 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1281,6 +1281,9 @@ static void ocfs2_clear_inode(struct inode *inode) * the journal is flushed before journal shutdown. Thus it is safe to * have inodes get cleaned up after journal shutdown. */ + if (!osb->journal) + return; + jbd2_journal_release_jbd_inode(osb->journal->j_journal, &oi->ip_jinode); } From 9614d8bee66387501f48718fa306e17f2aa3f2f3 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 31 Jul 2025 10:44:31 -0400 Subject: [PATCH 04/17] mm/userfaultfd: fix kmap_local LIFO ordering for CONFIG_HIGHPTE With CONFIG_HIGHPTE on 32-bit ARM, move_pages_pte() maps PTE pages using kmap_local_page(), which requires unmapping in Last-In-First-Out order. The current code maps dst_pte first, then src_pte, but unmaps them in the same order (dst_pte, src_pte), violating the LIFO requirement. This causes the warning in kunmap_local_indexed(): WARNING: CPU: 0 PID: 604 at mm/highmem.c:622 kunmap_local_indexed+0x178/0x17c addr != __fix_to_virt(FIX_KMAP_BEGIN + idx) Fix this by reversing the unmap order to respect LIFO ordering. This issue follows the same pattern as similar fixes: - commit eca6828403b8 ("crypto: skcipher - fix mismatch between mapping and unmapping order") - commit 8cf57c6df818 ("nilfs2: eliminate staggered calls to kunmap in nilfs_rename"), both of which addressed the same fundamental requirement that kmap_local operations must follow LIFO ordering.
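
As an illustration of the rule (a sketch, not part of the patch; p1 and p2 are hypothetical pages):

    void *a = kmap_local_page(p1);  /* pushed onto the kmap stack first */
    void *b = kmap_local_page(p2);  /* pushed second */
    /* ... access both mappings ... */
    kunmap_local(b);                /* must be popped before a */
    kunmap_local(a);                /* popping a first trips kunmap_local_indexed() */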
Link: https://lkml.kernel.org/r/20250731144431.773923-1-sashal@kernel.org Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Sasha Levin Acked-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 45e6290e2e8b..aefdf3a812a1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1453,10 +1453,15 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, folio_unlock(src_folio); folio_put(src_folio); } - if (dst_pte) - pte_unmap(dst_pte); + /* + * Unmap in reverse order (LIFO) to maintain proper kmap_local + * index ordering when CONFIG_HIGHPTE is enabled. We mapped dst_pte + * first, then src_pte, so we must unmap src_pte first, then dst_pte. + */ if (src_pte) pte_unmap(src_pte); + if (dst_pte) + pte_unmap(dst_pte); mmu_notifier_invalidate_range_end(&range); if (si) put_swap_device(si); From 5bbc2b785e63699cfcaa7adbf739f6e9b771028a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 5 Aug 2025 13:51:40 -0400 Subject: [PATCH 05/17] selftests/mm: fix FORCE_READ to read input value correctly FORCE_READ() converts input value x to its pointer type then reads from address x. This is wrong. If x were a non-pointer, this would be caught easily. But all FORCE_READ() callers are trying to read from a pointer, and FORCE_READ() basically reads a pointer to a pointer instead of the original typed pointer. Almost no access violation was found, except the one from split_huge_page_test. Fix it by implementing a simplified READ_ONCE() instead. Link: https://lkml.kernel.org/r/20250805175140.241656-1-ziy@nvidia.com Fixes: 3f6bfd4789a0 ("selftests/mm: reuse FORCE_READ to replace "asm volatile("" : "+r" (XXX));"") Signed-off-by: Zi Yan Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: wang lian Reviewed-by: Wei Yang Cc: Christian Brauner Cc: Jann Horn Cc: Kairui Song Cc: Liam Howlett Cc: Mark Brown Cc: SeongJae Park Cc: Shuah Khan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 4 ++-- tools/testing/selftests/mm/guard-regions.c | 2 +- tools/testing/selftests/mm/hugetlb-madvise.c | 4 +++- tools/testing/selftests/mm/migration.c | 2 +- tools/testing/selftests/mm/pagemap_ioctl.c | 2 +- tools/testing/selftests/mm/split_huge_page_test.c | 7 +++++-- tools/testing/selftests/mm/vm_util.h | 2 +- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index d30625c18259..c744c603d688 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1554,8 +1554,8 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } /* Read from the page to populate the shared zeropage.
*/ - FORCE_READ(mem); - FORCE_READ(smem); + FORCE_READ(*mem); + FORCE_READ(*smem); fn(mem, smem, pagesize); munmap: diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index b0d42eb04e3a..8dd81c0a4a5a 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -145,7 +145,7 @@ static bool try_access_buf(char *ptr, bool write) if (write) *ptr = 'x'; else - FORCE_READ(ptr); + FORCE_READ(*ptr); } signal_jump_set = false; diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index 1afe14b9dc0c..c5940c0595be 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -50,8 +50,10 @@ void read_fault_pages(void *addr, unsigned long nr_pages) unsigned long i; for (i = 0; i < nr_pages; i++) { + unsigned long *addr2 = + ((unsigned long *)(addr + (i * huge_page_size))); /* Prevent the compiler from optimizing out the entire loop: */ - FORCE_READ(((unsigned long *)(addr + (i * huge_page_size)))); + FORCE_READ(*addr2); } } diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index c5a73617796a..ea945eebec2f 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -110,7 +110,7 @@ void *access_mem(void *ptr) * the memory access actually happens and prevents the compiler * from optimizing away this entire loop. */ - FORCE_READ((uint64_t *)ptr); + FORCE_READ(*(uint64_t *)ptr); } return NULL; diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 0d4209eef0c3..e6face7c0166 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -1525,7 +1525,7 @@ void zeropfn_tests(void) ret = madvise(mem, hpage_size, MADV_HUGEPAGE); if (!ret) { - FORCE_READ(mem); + FORCE_READ(*mem); ret = pagemap_ioctl(mem, hpage_size, &vec, 1, 0, 0, PAGE_IS_PFNZERO, 0, 0, PAGE_IS_PFNZERO); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 05de1fc0005b..44a3f8a58806 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -439,8 +439,11 @@ int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd, } madvise(*addr, fd_size, MADV_HUGEPAGE); - for (size_t i = 0; i < fd_size; i++) - FORCE_READ((*addr + i)); + for (size_t i = 0; i < fd_size; i++) { + char *addr2 = *addr + i; + + FORCE_READ(*addr2); + } if (!check_huge_file(*addr, fd_size / pmd_pagesize, pmd_pagesize)) { ksft_print_msg("No large pagecache folio generated, please provide a filesystem supporting large folio\n"); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c20298ae98ea..b55d1809debc 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -23,7 +23,7 @@ * anything with it in order to trigger a read page fault. We therefore must use * volatile to stop the compiler from optimising this away. 
*/ -#define FORCE_READ(x) (*(volatile typeof(x) *)x) +#define FORCE_READ(x) (*(const volatile typeof(x) *)&(x)) extern unsigned int __page_size; extern unsigned int __page_shift; From 7a19afee6fb39df63ddea7ce78976d8c521178c6 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Fri, 1 Aug 2025 13:02:36 +0100 Subject: [PATCH 06/17] kunit: kasan_test: disable fortify string checker on kasan_strings() test Similar to commit 09c6304e38e4 ("kasan: test: fix compatibility with FORTIFY_SOURCE"), the kernel is panicking in kasan_strings(). This is because `src` and `ptr` are not hidden from the optimizer; hiding them disables the runtime fortify string checker. Call trace: __fortify_panic+0x10/0x20 (P) kasan_strings+0x980/0x9b0 kunit_try_run_case+0x68/0x190 kunit_generic_run_threadfn_adapter+0x34/0x68 kthread+0x1c4/0x228 ret_from_fork+0x10/0x20 Code: d503233f a9bf7bfd 910003fd 9424b243 (d4210000) ---[ end trace 0000000000000000 ]--- note: kunit_try_catch[128] exited with irqs disabled note: kunit_try_catch[128] exited with preempt_count 1 # kasan_strings: try faulted: last ** replaying previous printk message ** # kasan_strings: try faulted: last line seen mm/kasan/kasan_test_c.c:1600 # kasan_strings: internal error occurred preventing test case from running: -4 Link: https://lkml.kernel.org/r/20250801120236.2962642-1-yeoreum.yun@arm.com Fixes: 73228c7ecc5e ("KASAN: port KASAN Tests to KUnit") Signed-off-by: Yeoreum Yun Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index e0968acc03aa..f4b17984b627 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1578,9 +1578,11 @@ static void kasan_strings(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO); strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE); + OPTIMIZER_HIDE_VAR(src); /* * Make sure that strscpy() does not trigger KASAN if it overreads into From 08c7c253e032863199da4f089bd0ccab5d1a4876 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 18 Aug 2025 18:39:12 +0200 Subject: [PATCH 07/17] mm/kasan: fix vmalloc shadow memory (de-)population races While working on the lazy MMU mode enablement for s390 I hit pretty curious issues in the kasan code. The first is related to a custom kasan-based sanitizer aimed at catching invalid accesses to PTEs and is inspired by the conversation at [1].
The kasan complains on valid PTE accesses, while the shadow memory is reported as unpoisoned: [ 102.783993] ================================================================== [ 102.784008] BUG: KASAN: out-of-bounds in set_pte_range+0x36c/0x390 [ 102.784016] Read of size 8 at addr 0000780084cf9608 by task vmalloc_test/0/5542 [ 102.784019] [ 102.784040] CPU: 1 UID: 0 PID: 5542 Comm: vmalloc_test/0 Kdump: loaded Tainted: G OE 6.16.0-gcc-ipte-kasan-11657-gb2d930c4950e #340 PREEMPT [ 102.784047] Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE [ 102.784049] Hardware name: IBM 8561 T01 703 (LPAR) [ 102.784052] Call Trace: [ 102.784054] [<00007fffe0147ac0>] dump_stack_lvl+0xe8/0x140 [ 102.784059] [<00007fffe0112484>] print_address_description.constprop.0+0x34/0x2d0 [ 102.784066] [<00007fffe011282c>] print_report+0x10c/0x1f8 [ 102.784071] [<00007fffe090785a>] kasan_report+0xfa/0x220 [ 102.784078] [<00007fffe01d3dec>] set_pte_range+0x36c/0x390 [ 102.784083] [<00007fffe01d41c2>] leave_ipte_batch+0x3b2/0xb10 [ 102.784088] [<00007fffe07d3650>] apply_to_pte_range+0x2f0/0x4e0 [ 102.784094] [<00007fffe07e62e4>] apply_to_pmd_range+0x194/0x3e0 [ 102.784099] [<00007fffe07e820e>] __apply_to_page_range+0x2fe/0x7a0 [ 102.784104] [<00007fffe07e86d8>] apply_to_page_range+0x28/0x40 [ 102.784109] [<00007fffe090a3ec>] __kasan_populate_vmalloc+0xec/0x310 [ 102.784114] [<00007fffe090aa36>] kasan_populate_vmalloc+0x96/0x130 [ 102.784118] [<00007fffe0833a04>] alloc_vmap_area+0x3d4/0xf30 [ 102.784123] [<00007fffe083a8ba>] __get_vm_area_node+0x1aa/0x4c0 [ 102.784127] [<00007fffe083c4f6>] __vmalloc_node_range_noprof+0x126/0x4e0 [ 102.784131] [<00007fffe083c980>] __vmalloc_node_noprof+0xd0/0x110 [ 102.784135] [<00007fffe083ca32>] vmalloc_noprof+0x32/0x40 [ 102.784139] [<00007fff608aa336>] fix_size_alloc_test+0x66/0x150 [test_vmalloc] [ 102.784147] [<00007fff608aa710>] test_func+0x2f0/0x430 [test_vmalloc] [ 102.784153] [<00007fffe02841f8>] kthread+0x3f8/0x7a0 [ 102.784159] [<00007fffe014d8b4>] __ret_from_fork+0xd4/0x7d0 [ 102.784164] [<00007fffe299c00a>] ret_from_fork+0xa/0x30 [ 102.784173] no locks held by vmalloc_test/0/5542. [ 102.784176] [ 102.784178] The buggy address belongs to the physical page: [ 102.784186] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x84cf9 [ 102.784198] flags: 0x3ffff00000000000(node=0|zone=1|lastcpupid=0x1ffff) [ 102.784212] page_type: f2(table) [ 102.784225] raw: 3ffff00000000000 0000000000000000 0000000000000122 0000000000000000 [ 102.784234] raw: 0000000000000000 0000000000000000 f200000000000001 0000000000000000 [ 102.784248] page dumped because: kasan: bad access detected [ 102.784250] [ 102.784252] Memory state around the buggy address: [ 102.784260] 0000780084cf9500: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 102.784274] 0000780084cf9580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 102.784277] >0000780084cf9600: fd 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 102.784290] ^ [ 102.784293] 0000780084cf9680: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 102.784303] 0000780084cf9700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 102.784306] ================================================================== The second issue hits when the custom sanitizer above is not implemented, but the kasan itself is still active: [ 1554.438028] Unable to handle kernel pointer dereference in virtual kernel address space [ 1554.438065] Failing address: 001c0ff0066f0000 TEID: 001c0ff0066f0403 [ 1554.438076] Fault in home space mode while using kernel ASCE. 
[ 1554.438103] AS:00000000059d400b R2:0000000ffec5c00b R3:00000000c6c9c007 S:0000000314470001 P:00000000d0ab413d [ 1554.438158] Oops: 0011 ilc:2 [#1]SMP [ 1554.438175] Modules linked in: test_vmalloc(E+) nft_fib_inet(E) nft_fib_ipv4(E) nft_fib_ipv6(E) nft_fib(E) nft_reject_inet(E) nf_reject_ipv4(E) nf_reject_ipv6(E) nft_reject(E) nft_ct(E) nft_chain_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) nf_tables(E) sunrpc(E) pkey_pckmo(E) uvdevice(E) s390_trng(E) rng_core(E) eadm_sch(E) vfio_ccw(E) mdev(E) vfio_iommu_type1(E) vfio(E) sch_fq_codel(E) drm(E) loop(E) i2c_core(E) drm_panel_orientation_quirks(E) nfnetlink(E) ctcm(E) fsm(E) zfcp(E) scsi_transport_fc(E) diag288_wdt(E) watchdog(E) ghash_s390(E) prng(E) aes_s390(E) des_s390(E) libdes(E) sha3_512_s390(E) sha3_256_s390(E) sha512_s390(E) sha1_s390(E) sha_common(E) pkey(E) autofs4(E) [ 1554.438319] Unloaded tainted modules: pkey_uv(E):1 hmac_s390(E):2 [ 1554.438354] CPU: 1 UID: 0 PID: 1715 Comm: vmalloc_test/0 Kdump: loaded Tainted: G E 6.16.0-gcc-ipte-kasan-11657-gb2d930c4950e #350 PREEMPT [ 1554.438368] Tainted: [E]=UNSIGNED_MODULE [ 1554.438374] Hardware name: IBM 8561 T01 703 (LPAR) [ 1554.438381] Krnl PSW : 0704e00180000000 00007fffe1d3d6ae (memset+0x5e/0x98) [ 1554.438396] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3 [ 1554.438409] Krnl GPRS: 0000000000000001 001c0ff0066f0000 001c0ff0066f0000 00000000000000f8 [ 1554.438418] 00000000000009fe 0000000000000009 0000000000000000 0000000000000002 [ 1554.438426] 0000000000005000 000078031ae655c8 00000feffdcf9f59 0000780258672a20 [ 1554.438433] 0000780243153500 00007f8033780000 00007fffe083a510 00007f7fee7cfa00 [ 1554.438452] Krnl Code: 00007fffe1d3d6a0: eb540008000c srlg %r5,%r4,8 00007fffe1d3d6a6: b9020055 ltgr %r5,%r5 #00007fffe1d3d6aa: a784000b brc 8,00007fffe1d3d6c0 >00007fffe1d3d6ae: 42301000 stc %r3,0(%r1) 00007fffe1d3d6b2: d2fe10011000 mvc 1(255,%r1),0(%r1) 00007fffe1d3d6b8: 41101100 la %r1,256(%r1) 00007fffe1d3d6bc: a757fff9 brctg %r5,00007fffe1d3d6ae 00007fffe1d3d6c0: 42301000 stc %r3,0(%r1) [ 1554.438539] Call Trace: [ 1554.438545] [<00007fffe1d3d6ae>] memset+0x5e/0x98 [ 1554.438552] ([<00007fffe083a510>] remove_vm_area+0x220/0x400) [ 1554.438562] [<00007fffe083a9d6>] vfree.part.0+0x26/0x810 [ 1554.438569] [<00007fff6073bd50>] fix_align_alloc_test+0x50/0x90 [test_vmalloc] [ 1554.438583] [<00007fff6073c73a>] test_func+0x46a/0x6c0 [test_vmalloc] [ 1554.438593] [<00007fffe0283ac8>] kthread+0x3f8/0x7a0 [ 1554.438603] [<00007fffe014d8b4>] __ret_from_fork+0xd4/0x7d0 [ 1554.438613] [<00007fffe299ac0a>] ret_from_fork+0xa/0x30 [ 1554.438622] INFO: lockdep is turned off. [ 1554.438627] Last Breaking-Event-Address: [ 1554.438632] [<00007fffe1d3d65c>] memset+0xc/0x98 [ 1554.438644] Kernel panic - not syncing: Fatal exception: panic_on_oops This series fixes the above issues and is a pre-requisite for the s390 lazy MMU mode implementation. test_vmalloc was used to stress-test the fixes. This patch (of 2): When vmalloc shadow memory is established the modification of the corresponding page tables is not protected by any locks. Instead, the locking is done per-PTE. This scheme however has defects. kasan_populate_vmalloc_pte() - while ptep_get() read is atomic the sequence pte_none(ptep_get()) is not. Doing that outside of the lock might lead to a concurrent PTE update and what could be seen as a shadow memory corruption as result. 
kasan_depopulate_vmalloc_pte() - a page whose address is extracted from a ptep_get() read and cached in a local variable outside of the lock could already be freed by the time the function attempts to free it. To avoid these problems, put ptep_get() itself and the code that manipulates the result of the read under the lock. In addition, move freeing of the page out of the atomic context. Link: https://lkml.kernel.org/r/cover.1755528662.git.agordeev@linux.ibm.com Link: https://lkml.kernel.org/r/adb258634194593db294c0d1fb35646e894d6ead.1755528662.git.agordeev@linux.ibm.com Link: https://lore.kernel.org/linux-mm/5b0609c9-95ee-4e48-bb6d-98f57c5d2c31@arm.com/ [1] Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Alexander Gordeev Cc: Andrey Ryabinin Cc: Daniel Axtens Cc: Mark Rutland Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/kasan/shadow.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index d2c70cd2afb1..4d846d146d02 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,9 +305,6 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; - if (likely(!pte_none(ptep_get(ptep)))) - return 0; - index = PFN_DOWN(addr - data->start); page = data->pages[index]; __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE); @@ -461,18 +458,19 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { - unsigned long page; - - page = (unsigned long)__va(pte_pfn(ptep_get(ptep)) << PAGE_SHIFT); + pte_t pte; + int none; spin_lock(&init_mm.page_table_lock); - - if (likely(!pte_none(ptep_get(ptep)))) { + pte = ptep_get(ptep); + none = pte_none(pte); + if (likely(!none)) pte_clear(&init_mm, addr, ptep); - free_page(page); - } spin_unlock(&init_mm.page_table_lock); + if (likely(!none)) + __free_page(pfn_to_page(pte_pfn(pte))); + return 0; } From c519c3c0a1133c408e83a383aa4dd30010aa5d71 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Mon, 18 Aug 2025 18:39:13 +0200 Subject: [PATCH 08/17] mm/kasan: avoid lazy MMU mode hazards Functions __kasan_populate_vmalloc() and __kasan_depopulate_vmalloc() use apply_to_pte_range(), which enters lazy MMU mode. In that mode, updates to PTEs may not be observed until the mode is left. That may lead to a situation in which otherwise correct reads and writes to a PTE using ptep_get(), set_pte(), pte_clear() and other access primitives bring wrong results when the vmalloc shadow memory is being (de-)populated. To avoid these hazards, leave the lazy MMU mode before and re-enter it after each PTE manipulation.
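
In sketch form, each of the two callbacks now brackets its PTE work like this (simplified from the diff below; the function name is illustrative, and the real code also handles the pte_none() checks and page freeing):

    static int example_vmalloc_pte_cb(pte_t *ptep, unsigned long addr, void *unused)
    {
        arch_leave_lazy_mmu_mode();     /* publish/observe PTE updates immediately */
        spin_lock(&init_mm.page_table_lock);
        /* ... ptep_get()/set_pte()/pte_clear() on *ptep ... */
        spin_unlock(&init_mm.page_table_lock);
        arch_enter_lazy_mmu_mode();     /* restore lazy mode for apply_to_pte_range() */
        return 0;
    }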
Link: https://lkml.kernel.org/r/0d2efb7ddddbff6b288fbffeeb10166e90771718.1755528662.git.agordeev@linux.ibm.com Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Alexander Gordeev Cc: Andrey Ryabinin Cc: Daniel Axtens Cc: Mark Rutland Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/kasan/shadow.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 4d846d146d02..e2ceebf737ef 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -305,6 +305,8 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int index; + arch_leave_lazy_mmu_mode(); + index = PFN_DOWN(addr - data->start); page = data->pages[index]; __memset(page_to_virt(page), KASAN_VMALLOC_INVALID, PAGE_SIZE); @@ -317,6 +319,8 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, } spin_unlock(&init_mm.page_table_lock); + arch_enter_lazy_mmu_mode(); + return 0; } @@ -461,6 +465,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, pte_t pte; int none; + arch_leave_lazy_mmu_mode(); + spin_lock(&init_mm.page_table_lock); pte = ptep_get(ptep); none = pte_none(pte); @@ -471,6 +477,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (likely(!none)) __free_page(pfn_to_page(pte_pfn(pte))); + arch_enter_lazy_mmu_mode(); + return 0; } From 51337a9a3a404fde0f5337662ffc7699793dfeb5 Mon Sep 17 00:00:00 2001 From: Ada Couprie Diaz Date: Thu, 21 Aug 2025 13:07:35 +0100 Subject: [PATCH 09/17] kasan: fix GCC mem-intrinsic prefix with sw tags GCC doesn't support "hwasan-kernel-mem-intrinsic-prefix", only "asan-kernel-mem-intrinsic-prefix"[0], while LLVM supports both. This is already taken into account when checking "CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX", but not in the KASAN Makefile adding those parameters when "CONFIG_KASAN_SW_TAGS" is enabled. Replace the version check with "CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX", which already validates that the mem-intrinsic prefix parameter can be used, and choose the correct name depending on the compiler. GCC 13 and above trigger "CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX" which prevents `mem{cpy,move,set}()` being redefined in "mm/kasan/shadow.c" since commit 36be5cba99f6 ("kasan: treat meminstrinsic as builtins in uninstrumented files"), as we expect the compiler to prefix those calls with `__(hw)asan_` instead. But as the option passed to GCC has been incorrect, the compiler has not been emitting those prefixes, effectively never calling the instrumented versions of `mem{cpy,move,set}()` with "CONFIG_KASAN_SW_TAGS" enabled. If "CONFIG_FORTIFY_SOURCE" is enabled, this issue would be mitigated as it redefines `mem{cpy,move,set}()` and properly aliases the `__underlying_mem*()` that will be called to the instrumented versions.
Link: https://lkml.kernel.org/r/20250821120735.156244-1-ada.coupriediaz@arm.com Link: https://gcc.gnu.org/onlinedocs/gcc-13.4.0/gcc/Optimize-Options.html [0] Signed-off-by: Ada Couprie Diaz Fixes: 36be5cba99f6 ("kasan: treat meminstrinsic as builtins in uninstrumented files") Reviewed-by: Yeoreum Yun Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Cc: Mark Rutland Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- scripts/Makefile.kasan | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 693dbbebebba..0ba2aac3b8dc 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -86,10 +86,14 @@ kasan_params += hwasan-instrument-stack=$(stack_enable) \ hwasan-use-short-granules=0 \ hwasan-inline-all-checks=0 -# Instrument memcpy/memset/memmove calls by using instrumented __hwasan_mem*(). -ifeq ($(call clang-min-version, 150000)$(call gcc-min-version, 130000),y) - kasan_params += hwasan-kernel-mem-intrinsic-prefix=1 -endif +# Instrument memcpy/memset/memmove calls by using instrumented __(hw)asan_mem*(). +ifdef CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX + ifdef CONFIG_CC_IS_GCC + kasan_params += asan-kernel-mem-intrinsic-prefix=1 + else + kasan_params += hwasan-kernel-mem-intrinsic-prefix=1 + endif +endif # CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX endif # CONFIG_KASAN_SW_TAGS From 6310c149e5dede74bb47110e0d7a38c78772c152 Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:26 -0700 Subject: [PATCH 10/17] kexec: add KEXEC_FILE_NO_CMA as a legal flag Commit 07d24902977e ("kexec: enable CMA based contiguous allocation") introduces logic to use CMA-based allocation in kexec by default. As part of the changes, it introduces a kexec_file_load flag to disable the use of CMA allocations from userspace. However, this flag is broken since it is missing from the list of legal flags for kexec_file_load. kexec_file_load returns EINVAL when attempting to use the flag. Fix this by adding the KEXEC_FILE_NO_CMA flag to the list of legal flags for kexec_file_load. Without this fix, the kexec_file_load syscall will fail and return -EINVAL when KEXEC_FILE_NO_CMA is specified. Link: https://lkml.kernel.org/r/20250805211527.122367-2-makb@juniper.net Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation") Signed-off-by: Brian Mak Acked-by: Baoquan He Cc: Alexander Graf Cc: Borislav Petkov Cc: Dave Young Cc: "H. Peter Anvin"
Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- include/linux/kexec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 1b10a5d84b68..39fe3e6cd282 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -460,7 +460,8 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ - KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG) + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ + KEXEC_FILE_NO_CMA) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; From 9f68eabab9d9aaa764a8d234c4170119e6518102 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Thu, 21 Aug 2025 20:55:55 +0800 Subject: [PATCH 11/17] mm/damon/core: prevent unnecessary overflow in damos_set_effective_quota() On 32-bit systems, the throughput calculation in damos_set_effective_quota() is prone to unnecessary multiplication overflow. Using mult_frac() to fix it. Andrew Paniakin also recently found and privately reported this issue, on 64 bit systems. This can also happen on 64-bit systems, once the charged size exceeds ~17 TiB. On systems running for long time in production, this issue can actually happen. More specifically, when a DAMOS scheme having the time quota run for longtime, throughput calculation can overflow and set esz too small. As a result, speed of the scheme get unexpectedly slow. Link: https://lkml.kernel.org/r/20250821125555.3020951-1-yanquanmin1@huawei.com Fixes: 1cd243030059 ("mm/damon/schemes: implement time quota") Signed-off-by: Quanmin Yan Reported-by: Andrew Paniakin Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Cc: [5.16+] Signed-off-by: Andrew Morton --- mm/damon/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 70eff5cbe6ee..106ee8b0f2d5 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2073,8 +2073,8 @@ static void damos_set_effective_quota(struct damos_quota *quota) if (quota->ms) { if (quota->total_charged_ns) - throughput = quota->total_charged_sz * 1000000 / - quota->total_charged_ns; + throughput = mult_frac(quota->total_charged_sz, 1000000, + quota->total_charged_ns); else throughput = PAGE_SIZE * 1024; esz = min(throughput * quota->ms, esz); From c3576889d87b603cb66b417e08844a53c1077a37 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Thu, 7 Aug 2025 20:35:45 +0200 Subject: [PATCH 12/17] mm: fix accounting of memmap pages For !CONFIG_SPARSEMEM_VMEMMAP, memmap page accounting is currently done upfront in sparse_buffer_init(). However, sparse_buffer_alloc() may return NULL in failure scenario. Also, memmap pages may be allocated either from the memblock allocator during early boot or from the buddy allocator. When removed via arch_remove_memory(), accounting of memmap pages must reflect the original allocation source. To ensure correctness: * Account memmap pages after successful allocation in sparse_init_nid() and section_activate(). * Account memmap pages in section_deactivate() based on allocation source. 
Link: https://lkml.kernel.org/r/20250807183545.1424509-1-sumanthk@linux.ibm.com Fixes: 15995a352474 ("mm: report per-page metadata information") Signed-off-by: Sumanth Korikkar Suggested-by: David Hildenbrand Reviewed-by: Wei Yang Cc: Alexander Gordeev Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 5 ----- mm/sparse.c | 15 +++++++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index fd2ab5118e13..41aa0493eb03 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -578,11 +578,6 @@ struct page * __meminit __populate_section_memmap(unsigned long pfn, if (r < 0) return NULL; - if (system_state == SYSTEM_BOOTING) - memmap_boot_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); - else - memmap_pages_add(DIV_ROUND_UP(end - start, PAGE_SIZE)); - return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index 3c012cf83cc2..e6075b622407 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -454,9 +454,6 @@ static void __init sparse_buffer_init(unsigned long size, int nid) */ sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); sparsemap_buf_end = sparsemap_buf + size; -#ifndef CONFIG_SPARSEMEM_VMEMMAP - memmap_boot_pages_add(DIV_ROUND_UP(size, PAGE_SIZE)); -#endif } static void __init sparse_buffer_fini(void) @@ -567,6 +564,8 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, sparse_buffer_fini(); goto failed; } + memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page), + PAGE_SIZE)); sparse_init_early_section(nid, map, pnum, 0); } } @@ -680,7 +679,6 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, unsigned long start = (unsigned long) pfn_to_page(pfn); unsigned long end = start + nr_pages * sizeof(struct page); - memmap_pages_add(-1L * (DIV_ROUND_UP(end - start, PAGE_SIZE))); vmemmap_free(start, end, altmap); } static void free_map_bootmem(struct page *memmap) @@ -856,10 +854,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, * The memmap of early sections is always fully populated. See * section_activate() and pfn_valid() . */ - if (!section_is_early) + if (!section_is_early) { + memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE))); depopulate_section_memmap(pfn, nr_pages, altmap); - else if (memmap) + } else if (memmap) { + memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), + PAGE_SIZE))); free_map_bootmem(memmap); + } if (empty) ms->section_mem_map = (unsigned long)NULL; @@ -904,6 +906,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, section_deactivate(pfn, nr_pages, altmap); return ERR_PTR(-ENOMEM); } + memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)); return memmap; } From 2ce3d282bd5050fca8577defeff08ada0d55d062 Mon Sep 17 00:00:00 2001 From: wangzijie Date: Mon, 18 Aug 2025 20:31:02 +0800 Subject: [PATCH 13/17] proc: fix missing pde_set_flags() for net proc files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid potential UAF issues during module removal races, we use pde_set_flags() to save proc_ops flags in PDE itself before proc_register(), and then use pde_has_proc_*() helpers instead of directly dereferencing pde->proc_ops->*. However, the pde_set_flags() call was missing when creating net related proc files. 
This omission caused incorrect behavior in which FMODE_LSEEK was cleared inappropriately in proc_reg_open() for net proc files. Lars reported it in [1]. Fix this by ensuring pde_set_flags() is called when registering a proc entry, and add a NULL check for proc_ops in pde_set_flags(). [wangzijie1@honor.com: stash pde->proc_ops in a local const variable, per Christian] Link: https://lkml.kernel.org/r/20250821105806.1453833-1-wangzijie1@honor.com Link: https://lkml.kernel.org/r/20250818123102.959595-1-wangzijie1@honor.com Link: https://lore.kernel.org/all/20250815195616.64497967@chagall.paradoxon.rec/ [1] Fixes: ff7ec8dc1b64 ("proc: use the same treatment to check proc_lseek as ones for proc_read_iter et.al") Signed-off-by: wangzijie Reported-by: Lars Wendler Tested-by: Stefano Brivio Tested-by: Petr Vaněk Tested-by: Lars Wendler Cc: Alexei Starovoitov Cc: Alexey Dobriyan Cc: Al Viro Cc: "Edgecombe, Rick P" Cc: Greg Kroah-Hartman Cc: Jiri Slaby Cc: Kirill A. Shutemov Cc: wangzijie Cc: Signed-off-by: Andrew Morton --- fs/proc/generic.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 76e800e38c8f..bd0c099cfdd2 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -367,6 +367,25 @@ static const struct inode_operations proc_dir_inode_operations = { .setattr = proc_notify_change, }; +static void pde_set_flags(struct proc_dir_entry *pde) +{ + const struct proc_ops *proc_ops = pde->proc_ops; + + if (!proc_ops) + return; + + if (proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; + if (proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif + if (proc_ops->proc_lseek) + pde->flags |= PROC_ENTRY_proc_lseek; +} + /* returns the registered entry, or frees dp and returns NULL on failure */ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp) @@ -374,6 +393,8 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; + pde_set_flags(dp); + write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { @@ -561,20 +582,6 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static void pde_set_flags(struct proc_dir_entry *pde) -{ - if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) - pde->flags |= PROC_ENTRY_PERMANENT; - if (pde->proc_ops->proc_read_iter) - pde->flags |= PROC_ENTRY_proc_read_iter; -#ifdef CONFIG_COMPAT - if (pde->proc_ops->proc_compat_ioctl) - pde->flags |= PROC_ENTRY_proc_compat_ioctl; -#endif - if (pde->proc_ops->proc_lseek) - pde->flags |= PROC_ENTRY_proc_lseek; -} - struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -585,7 +592,6 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -636,7 +642,6 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); @@ -667,7 +672,6 @@ struct proc_dir_entry
*proc_create_single_data(const char *name, umode_t mode, return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; - pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); From 7cc183f2e67d19b03ee5c13a6664b8c6cc37ff9d Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Mon, 18 Aug 2025 11:02:04 +0900 Subject: [PATCH 14/17] mm: move page table sync declarations to linux/pgtable.h During our internal testing, we started observing intermittent boot failures when the machine uses 4-level paging and has a large amount of persistent memory: BUG: unable to handle page fault for address: ffffe70000000034 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP NOPTI RIP: 0010:__init_single_page+0x9/0x6d Call Trace: __init_zone_device_page+0x17/0x5d memmap_init_zone_device+0x154/0x1bb pagemap_range+0x2e0/0x40f memremap_pages+0x10b/0x2f0 devm_memremap_pages+0x1e/0x60 dev_dax_probe+0xce/0x2ec [device_dax] dax_bus_probe+0x6d/0xc9 [... snip ...] It turns out that the kernel panics while initializing vmemmap (struct page array) when the vmemmap region spans two PGD entries, because the new PGD entry is only installed in init_mm.pgd, but not in the page tables of other tasks. And looking at __populate_section_memmap(): if (vmemmap_can_optimize(altmap, pgmap)) // does not sync top level page tables r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap); else // sync top level page tables in x86 r = vmemmap_populate(start, end, nid, altmap); In the normal path, vmemmap_populate() in arch/x86/mm/init_64.c synchronizes the top level page table (See commit 9b861528a801 ("x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes")) so that all tasks in the system can see the new vmemmap area. However, when vmemmap_can_optimize() returns true, the optimized path skips synchronization of top-level page tables. This is because vmemmap_populate_compound_pages() is implemented in core MM code, which does not handle synchronization of the top-level page tables. Instead, the core MM has historically relied on each architecture to perform this synchronization manually. We're not the first party to encounter a crash caused by not-sync'd top level page tables: earlier this year, Gwan-gyeong Mun attempted to address the issue [1] [2] after hitting a kernel panic when x86 code accessed the vmemmap area before the corresponding top-level entries were synced. At that time, the issue was believed to be triggered only when struct page was enlarged for debugging purposes, and the patch did not get further updates. It turns out that current approach of relying on each arch to handle the page table sync manually is fragile because 1) it's easy to forget to sync the top level page table, and 2) it's also easy to overlook that the kernel should not access the vmemmap and direct mapping areas before the sync. # The solution: Make page table sync more code robust and harder to miss To address this, Dave Hansen suggested [3] [4] introducing {pgd,p4d}_populate_kernel() for updating kernel portion of the page tables and allow each architecture to explicitly perform synchronization when installing top-level entries. With this approach, we no longer need to worry about missing the sync step, reducing the risk of future regressions. The new interface reuses existing ARCH_PAGE_TABLE_SYNC_MASK, PGTBL_P*D_MODIFIED and arch_sync_kernel_mappings() facility used by vmalloc and ioremap to synchronize page tables. 
pgd_populate_kernel() looks like this: static inline void pgd_populate_kernel(unsigned long addr, pgd_t *pgd, p4d_t *p4d) { pgd_populate(&init_mm, pgd, p4d); if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED) arch_sync_kernel_mappings(addr, addr); } It is worth noting that vmalloc() and apply_to_range() carefully synchronizes page tables by calling p*d_alloc_track() and arch_sync_kernel_mappings(), and thus they are not affected by this patch series. This series was hugely inspired by Dave Hansen's suggestion and hence added Suggested-by: Dave Hansen. Cc stable because lack of this series opens the door to intermittent boot failures. This patch (of 3): Move ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() to linux/pgtable.h so that they can be used outside of vmalloc and ioremap. Link: https://lkml.kernel.org/r/20250818020206.4517-1-harry.yoo@oracle.com Link: https://lkml.kernel.org/r/20250818020206.4517-2-harry.yoo@oracle.com Link: https://lore.kernel.org/linux-mm/20250220064105.808339-1-gwan-gyeong.mun@intel.com [1] Link: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@intel.com [2] Link: https://lore.kernel.org/linux-mm/d1da214c-53d3-45ac-a8b6-51821c5416e4@intel.com [3] Link: https://lore.kernel.org/linux-mm/4d800744-7b88-41aa-9979-b245e8bf794b@intel.com [4] Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges") Signed-off-by: Harry Yoo Acked-by: Kiryl Shutsemau Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: "Uladzislau Rezki (Sony)" Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Alexander Potapenko Cc: Alistair Popple Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: bibo mao Cc: Borislav Betkov Cc: Christoph Lameter (Ampere) Cc: Dennis Zhou Cc: Dev Jain Cc: Dmitriy Vyukov Cc: Gwan-gyeong Mun Cc: Ingo Molnar Cc: Jane Chu Cc: Joao Martins Cc: Joerg Roedel Cc: John Hubbard Cc: Kevin Brodsky Cc: Liam Howlett Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vincenzo Frascino Cc: Vlastimil Babka Cc: Dave Hansen Cc: Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 16 ++++++++++++++++ include/linux/vmalloc.h | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 4c035637eeb7..ba699df6ef69 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1467,6 +1467,22 @@ static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned } #endif +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() + * needs to be called. + */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). It is + * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK + * is 0. 
+ */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); + #endif /* CONFIG_MMU */ /* diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index fdc9aeb74a44..2759dac6be44 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -219,22 +219,6 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); -/* - * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values - * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() - * needs to be called. - */ -#ifndef ARCH_PAGE_TABLE_SYNC_MASK -#define ARCH_PAGE_TABLE_SYNC_MASK 0 -#endif - -/* - * There is no default implementation for arch_sync_kernel_mappings(). It is - * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK - * is 0. - */ -void arch_sync_kernel_mappings(unsigned long start, unsigned long end); - /* * Lowlevel-APIs (not for driver use!) */ From f2d2f9598ebb0158a3fe17cda0106d7752e654a2 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Mon, 18 Aug 2025 11:02:05 +0900 Subject: [PATCH 15/17] mm: introduce and use {pgd,p4d}_populate_kernel() Introduce and use {pgd,p4d}_populate_kernel() in core MM code when populating PGD and P4D entries for the kernel address space. These helpers ensure proper synchronization of page tables when updating the kernel portion of top-level page tables. Until now, the kernel has relied on each architecture to handle synchronization of top-level page tables in an ad-hoc manner. For example, see commit 9b861528a801 ("x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes"). However, this approach has proven fragile for the following reasons: 1) It is easy to forget to perform the necessary page table synchronization when introducing new changes. For instance, commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps") overlooked the need to synchronize page tables for the vmemmap area. 2) It is also easy to overlook that the vmemmap and direct mapping areas must not be accessed before explicit page table synchronization. For example, commit 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges") caused crashes by accessing the vmemmap area before calling sync_global_pgds(). To address this, as suggested by Dave Hansen, introduce _kernel() variants of the page table population helpers, which invoke architecture-specific hooks to properly synchronize page tables. These are introduced in a new header file, include/linux/pgalloc.h, so they can be called from common code. They reuse existing infrastructure for vmalloc and ioremap. Synchronization requirements are determined by ARCH_PAGE_TABLE_SYNC_MASK, and the actual synchronization is performed by arch_sync_kernel_mappings(). This change currently targets only x86_64, so only PGD and P4D level helpers are introduced. Currently, these helpers are no-ops since no architecture sets PGTBL_{PGD,P4D}_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK. In theory, PUD and PMD level helpers can be added later if needed by other architectures. For now, 32-bit architectures (x86-32 and arm) only handle PGTBL_PMD_MODIFIED, so p*d_populate_kernel() will never affect them unless we introduce a PMD level helper.
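
For illustration, an architecture would opt in roughly as follows (a sketch; sync_global_pgds() stands in for whatever mechanism the architecture already uses to propagate kernel PGD entries to all page tables):

    /* arch header (illustrative): */
    #define ARCH_PAGE_TABLE_SYNC_MASK  (PGTBL_PGD_MODIFIED | PGTBL_P4D_MODIFIED)

    /* arch code (illustrative): */
    void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
    {
        /* copy the touched kernel-space range into every PGD in the system */
        sync_global_pgds(start, end);
    }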
[harry.yoo@oracle.com: fix KASAN build error due to p*d_populate_kernel()]
Link: https://lkml.kernel.org/r/20250822020727.202749-1-harry.yoo@oracle.com
Link: https://lkml.kernel.org/r/20250818020206.4517-3-harry.yoo@oracle.com
Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
Signed-off-by: Harry Yoo
Suggested-by: Dave Hansen
Acked-by: Kiryl Shutsemau
Reviewed-by: Mike Rapoport (Microsoft)
Reviewed-by: Lorenzo Stoakes
Acked-by: David Hildenbrand
Cc: Alexander Potapenko
Cc: Alistair Popple
Cc: Andrey Konovalov
Cc: Andrey Ryabinin
Cc: Andy Lutomirski
Cc: "Aneesh Kumar K.V"
Cc: Anshuman Khandual
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: bibo mao
Cc: Borislav Betkov
Cc: Christoph Lameter (Ampere)
Cc: Dennis Zhou
Cc: Dev Jain
Cc: Dmitry Vyukov
Cc: Gwan-gyeong Mun
Cc: Ingo Molnar
Cc: Jane Chu
Cc: Joao Martins
Cc: Joerg Roedel
Cc: John Hubbard
Cc: Kevin Brodsky
Cc: Liam Howlett
Cc: Michal Hocko
Cc: Oscar Salvador
Cc: Peter Xu
Cc: Peter Zijlstra
Cc: Qi Zheng
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Tejun Heo
Cc: Thomas Gleixner
Cc: Thomas Huth
Cc: "Uladzislau Rezki (Sony)"
Cc: Vincenzo Frascino
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/pgalloc.h | 29 +++++++++++++++++++++++++++++
 include/linux/pgtable.h | 13 +++++++------
 mm/kasan/init.c         | 12 ++++++------
 mm/percpu.c             |  6 +++---
 mm/sparse-vmemmap.c     |  6 +++---
 5 files changed, 48 insertions(+), 18 deletions(-)
 create mode 100644 include/linux/pgalloc.h

diff --git a/include/linux/pgalloc.h b/include/linux/pgalloc.h
new file mode 100644
index 000000000000..9174fa59bbc5
--- /dev/null
+++ b/include/linux/pgalloc.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLOC_H
+#define _LINUX_PGALLOC_H
+
+#include <linux/pgtable.h>
+#include <asm/pgalloc.h>
+
+/*
+ * {pgd,p4d}_populate_kernel() are defined as macros to allow
+ * compile-time optimization based on the configured page table levels.
+ * Without this, linking may fail because callers (e.g., KASAN) may rely
+ * on calls to these functions being optimized away when passing symbols
+ * that exist only for certain page table levels.
+ */
+#define pgd_populate_kernel(addr, pgd, p4d)				\
+	do {								\
+		pgd_populate(&init_mm, pgd, p4d);			\
+		if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PGD_MODIFIED)	\
+			arch_sync_kernel_mappings(addr, addr);		\
+	} while (0)
+
+#define p4d_populate_kernel(addr, p4d, pud)				\
+	do {								\
+		p4d_populate(&init_mm, p4d, pud);			\
+		if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_P4D_MODIFIED)	\
+			arch_sync_kernel_mappings(addr, addr);		\
+	} while (0)
+
+#endif /* _LINUX_PGALLOC_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index ba699df6ef69..2b80fd456c8b 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1469,8 +1469,8 @@ static inline void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned
 
 /*
  * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values
- * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings()
- * needs to be called.
+ * and let generic vmalloc, ioremap and page table update code know when
+ * arch_sync_kernel_mappings() needs to be called.
  */
 #ifndef ARCH_PAGE_TABLE_SYNC_MASK
 #define ARCH_PAGE_TABLE_SYNC_MASK 0
@@ -1954,10 +1954,11 @@ static inline bool arch_has_pfn_modify_check(void)
 /*
  * Page Table Modification bits for pgtbl_mod_mask.
  *
- * These are used by the p?d_alloc_track*() set of functions an in the generic
- * vmalloc/ioremap code to track at which page-table levels entries have been
- * modified. Based on that the code can better decide when vmalloc and ioremap
- * mapping changes need to be synchronized to other page-tables in the system.
+ * These are used by the p?d_alloc_track*() and p*d_populate_kernel()
+ * functions in the generic vmalloc, ioremap and page table update code
+ * to track at which page-table levels entries have been modified.
+ * Based on that the code can better decide when page table changes need
+ * to be synchronized to other page-tables in the system.
  */
 #define __PGTBL_PGD_MODIFIED	0
 #define __PGTBL_P4D_MODIFIED	1
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ced6b29fcf76..8fce3370c84e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -13,9 +13,9 @@
 #include <linux/mm.h>
 #include <linux/pfn.h>
 #include <linux/slab.h>
+#include <linux/pgalloc.h>
 
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 
 #include "kasan.h"
@@ -191,7 +191,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 			pud_t *pud;
 			pmd_t *pmd;
 
-			p4d_populate(&init_mm, p4d,
+			p4d_populate_kernel(addr, p4d,
 					lm_alias(kasan_early_shadow_pud));
 			pud = pud_offset(p4d, addr);
 			pud_populate(&init_mm, pud,
@@ -212,7 +212,7 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
 		} else {
 			p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
 			pud_init(p);
-			p4d_populate(&init_mm, p4d, p);
+			p4d_populate_kernel(addr, p4d, p);
 		}
 	}
 	zero_pud_populate(p4d, addr, next);
@@ -251,10 +251,10 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
 			 * puds,pmds, so pgd_populate(), pud_populate()
 			 * is noops.
 			 */
-			pgd_populate(&init_mm, pgd,
+			pgd_populate_kernel(addr, pgd,
 					lm_alias(kasan_early_shadow_p4d));
 			p4d = p4d_offset(pgd, addr);
-			p4d_populate(&init_mm, p4d,
+			p4d_populate_kernel(addr, p4d,
 					lm_alias(kasan_early_shadow_pud));
 			pud = pud_offset(p4d, addr);
 			pud_populate(&init_mm, pud,
@@ -273,7 +273,7 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
 			if (!p)
 				return -ENOMEM;
 		} else {
-			pgd_populate(&init_mm, pgd,
+			pgd_populate_kernel(addr, pgd,
 				early_alloc(PAGE_SIZE, NUMA_NO_NODE));
 		}
 	}
diff --git a/mm/percpu.c b/mm/percpu.c
index d9cbaee92b60..a56f35dcc417 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3108,7 +3108,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 #endif /* BUILD_EMBED_FIRST_CHUNK */
 
 #ifdef BUILD_PAGE_FIRST_CHUNK
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
 
 #ifndef P4D_TABLE_SIZE
 #define P4D_TABLE_SIZE PAGE_SIZE
@@ -3134,13 +3134,13 @@ void __init __weak pcpu_populate_pte(unsigned long addr)
 
 	if (pgd_none(*pgd)) {
 		p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
-		pgd_populate(&init_mm, pgd, p4d);
+		pgd_populate_kernel(addr, pgd, p4d);
 	}
 
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d)) {
 		pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
-		p4d_populate(&init_mm, p4d, pud);
+		p4d_populate_kernel(addr, p4d, pud);
 	}
 
 	pud = pud_offset(p4d, addr);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 41aa0493eb03..dbd8daccade2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,9 +27,9 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/pgalloc.h>
 
 #include <asm/dma.h>
-#include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 
 #include "hugetlb_vmemmap.h"
@@ -229,7 +229,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
 		if (!p)
 			return NULL;
 		pud_init(p);
-		p4d_populate(&init_mm, p4d, p);
+		p4d_populate_kernel(addr, p4d, p);
 	}
 	return p4d;
 }
@@ -241,7 +241,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
 		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
 		if (!p)
 			return NULL;
-		pgd_populate(&init_mm, pgd, p);
+		pgd_populate_kernel(addr, pgd, p);
 	}
 	return pgd;
 }
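Why the helpers are macros matters at link time: an architecture that
leaves ARCH_PAGE_TABLE_SYNC_MASK at its default of 0 pays nothing for
them.  An illustrative hand-expansion of pgd_populate_kernel() on such
an architecture (not taken from the patch itself):

	do {
		pgd_populate(&init_mm, pgd, p4d);
		if (0 & PGTBL_PGD_MODIFIED)	/* constant-false, folded away */
			arch_sync_kernel_mappings(addr, addr);
	} while (0);

The compiler discards the dead branch, so the reference to
arch_sync_kernel_mappings() disappears and no out-of-line definition is
needed at link time, which is exactly the property the comment in
include/linux/pgalloc.h relies on.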
From 6659d027998083fbb6d42a165b0c90dc2e8ba989 Mon Sep 17 00:00:00 2001
From: Harry Yoo
Date: Mon, 18 Aug 2025 11:02:06 +0900
Subject: [PATCH 16/17] x86/mm/64: define ARCH_PAGE_TABLE_SYNC_MASK and
 arch_sync_kernel_mappings()

Define ARCH_PAGE_TABLE_SYNC_MASK and arch_sync_kernel_mappings() to
ensure page tables are properly synchronized when calling
p*d_populate_kernel().

For 5-level paging, synchronization is performed via
pgd_populate_kernel().  For 4-level paging, pgd_populate() is a no-op,
so synchronization is instead performed at the P4D level via
p4d_populate_kernel().

This fixes intermittent boot failures on systems using 4-level paging
and a large amount of persistent memory:

  BUG: unable to handle page fault for address: ffffe70000000034
  #PF: supervisor write access in kernel mode
  #PF: error_code(0x0002) - not-present page
  PGD 0 P4D 0
  Oops: 0002 [#1] SMP NOPTI
  RIP: 0010:__init_single_page+0x9/0x6d
  Call Trace:
   __init_zone_device_page+0x17/0x5d
   memmap_init_zone_device+0x154/0x1bb
   pagemap_range+0x2e0/0x40f
   memremap_pages+0x10b/0x2f0
   devm_memremap_pages+0x1e/0x60
   dev_dax_probe+0xce/0x2ec [device_dax]
   dax_bus_probe+0x6d/0xc9
   [... snip ...]

It also fixes a crash in vmemmap_set_pmd() caused by accessing vmemmap
before sync_global_pgds() [1]:

  BUG: unable to handle page fault for address: ffffeb3ff1200000
  #PF: supervisor write access in kernel mode
  #PF: error_code(0x0002) - not-present page
  PGD 0 P4D 0
  Oops: Oops: 0002 [#1] PREEMPT SMP NOPTI
  Tainted: [W]=WARN
  RIP: 0010:vmemmap_set_pmd+0xff/0x230
   vmemmap_populate_hugepages+0x176/0x180
   vmemmap_populate+0x34/0x80
   __populate_section_memmap+0x41/0x90
   sparse_add_section+0x121/0x3e0
   __add_pages+0xba/0x150
   add_pages+0x1d/0x70
   memremap_pages+0x3dc/0x810
   devm_memremap_pages+0x1c/0x60
   xe_devm_add+0x8b/0x100 [xe]
   xe_tile_init_noalloc+0x6a/0x70 [xe]
   xe_device_probe+0x48c/0x740 [xe]
   [... snip ...]
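To make the mechanism concrete, an illustrative call flow on a 4-level
kernel (pgtable_l5_enabled() == false), hand-annotated rather than taken
from the patch itself:

	vmemmap_p4d_populate(pgd, addr, node)
	  p4d_populate_kernel(addr, p4d, pud)
	    p4d_populate(&init_mm, p4d, pud);
	    /* ARCH_PAGE_TABLE_SYNC_MASK selects PGTBL_P4D_MODIFIED, so: */
	    arch_sync_kernel_mappings(addr, addr);
	      sync_global_pgds(addr, addr);	/* propagate to every PGD */

On a 5-level kernel the mask selects PGTBL_PGD_MODIFIED instead, and the
same synchronization fires from pgd_populate_kernel().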
Link: https://lkml.kernel.org/r/20250818020206.4517-4-harry.yoo@oracle.com
Fixes: 8d400913c231 ("x86/vmemmap: handle unpopulated sub-pmd ranges")
Signed-off-by: Harry Yoo
Closes: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@intel.com [1]
Suggested-by: Dave Hansen
Acked-by: Kiryl Shutsemau
Reviewed-by: Mike Rapoport (Microsoft)
Reviewed-by: Lorenzo Stoakes
Acked-by: David Hildenbrand
Cc: Alexander Potapenko
Cc: Alistair Popple
Cc: Andrey Konovalov
Cc: Andrey Ryabinin
Cc: Andy Lutomirski
Cc: "Aneesh Kumar K.V"
Cc: Anshuman Khandual
Cc: Ard Biesheuvel
Cc: Arnd Bergmann
Cc: bibo mao
Cc: Borislav Betkov
Cc: Christoph Lameter (Ampere)
Cc: Dennis Zhou
Cc: Dev Jain
Cc: Dmitry Vyukov
Cc: Ingo Molnar
Cc: Jane Chu
Cc: Joao Martins
Cc: Joerg Roedel
Cc: John Hubbard
Cc: Kevin Brodsky
Cc: Liam Howlett
Cc: Michal Hocko
Cc: Oscar Salvador
Cc: Peter Xu
Cc: Peter Zijlstra
Cc: Qi Zheng
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Tejun Heo
Cc: Thomas Gleixner
Cc: Thomas Huth
Cc: "Uladzislau Rezki (Sony)"
Cc: Vincenzo Frascino
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 arch/x86/include/asm/pgtable_64_types.h |  3 +++
 arch/x86/mm/init_64.c                   | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 4604f924d8b8..7eb61ef6a185 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -36,6 +36,9 @@ static inline bool pgtable_l5_enabled(void)
 #define pgtable_l5_enabled() cpu_feature_enabled(X86_FEATURE_LA57)
 #endif /* USE_EARLY_PGTABLE_L5 */
 
+#define ARCH_PAGE_TABLE_SYNC_MASK \
+	(pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED)
+
 extern unsigned int pgdir_shift;
 extern unsigned int ptrs_per_p4d;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 76e33bd7c556..b9426fce5f3e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -223,6 +223,24 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
 		sync_global_pgds_l4(start, end);
 }
 
+/*
+ * Make kernel mappings visible in all page tables in the system.
+ * This is necessary except when the init task populates kernel mappings
+ * during the boot process.  In that case, all processes originating from
+ * the init task copy the kernel mappings, so there is no issue.
+ * Otherwise, missing synchronization could lead to kernel crashes due
+ * to missing page table entries for certain kernel mappings.
+ *
+ * Synchronization is performed at the top level, which is the PGD in
+ * 5-level paging systems.  In 4-level paging systems, however,
+ * pgd_populate() is a no-op, so synchronization is done at the P4D level.
+ * sync_global_pgds() handles this difference between paging levels.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+	sync_global_pgds(start, end);
+}
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages).  It's safe to do it ONLY when after_bootmem == 0.

From c873ccbb2f8db46ad9b4a989ea924b6d8f19abf1 Mon Sep 17 00:00:00 2001
From: Gu Bowen
Date: Fri, 22 Aug 2025 15:35:41 +0800
Subject: [PATCH 17/17] mm: fix possible deadlock in kmemleak

There are some AA (recursive) deadlock issues in kmemleak, similar to
the situation reported by Breno [1].  The deadlock path is as follows:

mem_pool_alloc()
-> raw_spin_lock_irqsave(&kmemleak_lock, flags);
   -> pr_warn()
      -> netconsole subsystem
         -> netpoll
            -> __alloc_skb
               -> __create_object
                  -> raw_spin_lock_irqsave(&kmemleak_lock, flags);

To solve this problem, switch to printk_safe mode before printing the
warning message: this redirects all printk()-s to a special per-CPU
buffer, which will be flushed later from a safe context (irq work), so
the deadlock is avoided.  The proper API to use here is
printk_deferred_enter()/printk_deferred_exit() [2].

Another way is to place the warning print after kmemleak_lock is
released.
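Sketched, the pattern the patch adopts at each warning site looks like
this (illustrative; the actual call sites are in the diff below):

	raw_spin_lock_irqsave(&kmemleak_lock, flags);
	...
	/*
	 * Redirect printk() to the per-CPU deferred buffer while the
	 * lock is held, so a console path that allocates memory (e.g.
	 * netpoll) cannot re-enter kmemleak and retake kmemleak_lock.
	 */
	printk_deferred_enter();
	kmemleak_warn("...");
	printk_deferred_exit();	/* flushed later from irq work */
	...
	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);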
Link: https://lkml.kernel.org/r/20250822073541.1886469-1-gubowen5@huawei.com
Link: https://lore.kernel.org/all/20250731-kmemleak_lock-v1-1-728fd470198f@debian.org/#t [1]
Link: https://lore.kernel.org/all/5ca375cd-4a20-4807-b897-68b289626550@redhat.com/ [2]
Signed-off-by: Gu Bowen
Reviewed-by: Waiman Long
Reviewed-by: Catalin Marinas
Reviewed-by: Breno Leitao
Cc: Greg Kroah-Hartman
Cc: John Ogness
Cc: Lu Jialin
Cc: Petr Mladek
Cc:
Signed-off-by: Andrew Morton
---
 mm/kmemleak.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 84265983f239..1ac56ceb29b6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -437,9 +437,15 @@ static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
 		else if (untagged_objp == untagged_ptr || alias)
 			return object;
 		else {
+			/*
+			 * Defer printk() while kmemleak_lock is held to
+			 * avoid deadlock.
+			 */
+			printk_deferred_enter();
 			kmemleak_warn("Found object by alias at 0x%08lx\n", ptr);
 			dump_object_info(object);
+			printk_deferred_exit();
 			break;
 		}
 	}
@@ -736,6 +742,11 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
 		else if (untagged_objp + parent->size <= untagged_ptr)
 			link = &parent->rb_node.rb_right;
 		else {
+			/*
+			 * Defer printk() while kmemleak_lock is held to
+			 * avoid deadlock.
+			 */
+			printk_deferred_enter();
 			kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
 				      ptr);
 			/*
@@ -743,6 +754,7 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
 			 * be freed while the kmemleak_lock is held.
 			 */
 			dump_object_info(parent);
+			printk_deferred_exit();
 			return -EEXIST;
 		}
 	}
@@ -856,13 +868,8 @@ static void delete_object_part(unsigned long ptr, size_t size,
 
 	raw_spin_lock_irqsave(&kmemleak_lock, flags);
 	object = __find_and_remove_object(ptr, 1, objflags);
-	if (!object) {
-#ifdef DEBUG
-		kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
-			      ptr, size);
-#endif
+	if (!object)
 		goto unlock;
-	}
 
 	/*
 	 * Create one or two objects that may result from the memory block
@@ -882,8 +889,14 @@ static void delete_object_part(unsigned long ptr, size_t size,
 unlock:
 	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
-	if (object)
+	if (object) {
 		__delete_object(object);
+	} else {
+#ifdef DEBUG
+		kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+			      ptr, size);
+#endif
+	}
 
 out:
 	if (object_l)