arm64: Enable vmalloc-huge with ptdump

Our goal is to move towards enabling vmalloc-huge by default on arm64 so
as to reduce TLB pressure. Therefore, we need a way to analyze what
proportion of block mappings in vmalloc space we can get on a production
system; this can be done through ptdump, but currently we disable
vmalloc-huge if CONFIG_PTDUMP_DEBUGFS is enabled. The reason is that lazy
freeing of kernel pagetables via vmap_try_huge_pxd() may race with ptdump,
which may then dereference a bogus address.

To solve this, we need to synchronize ptdump_walk() and ptdump_check_wx()
with pud_free_pmd_page() and pmd_free_pte_page().

Since this race is very unlikely to happen in practice, we do not want to
penalize the vmalloc pagetable teardown path by unconditionally taking the
init_mm mmap_lock. Therefore, we use a static key. ptdump_walk() and
ptdump_check_wx() are the pagetable walkers; they enable the static key,
and upon observing that, the vmalloc pagetable teardown path executes a
patched-in mmap_read_lock/unlock sequence. The combination of the
patched-in mmap_read_lock/unlock, the acquire semantics of
static_branch_inc(), and the barriers in __flush_tlb_kernel_pgtable()
ensures that ptdump will never get hold of the address of a freed PMD
or PTE table.
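
In outline, the two sides look as follows (a condensed sketch of the code
added by this patch; it omits the __pmd_free_pte_page() plumbing and the
error paths):

	/* Walker side: ptdump_walk() / ptdump_check_wx() */
	static_branch_inc(&arm64_ptdump_lock_key);
	ptdump_walk_pgd(st, mm, NULL);		/* takes the mmap_lock for write */
	static_branch_dec(&arm64_ptdump_lock_key);

	/* Teardown side: pud_free_pmd_page() */
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);	/* orders the PUD clear against the key check */
	if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
		mmap_read_lock(&init_mm);	/* wait out a walker that saw the old PUD */
		mmap_read_unlock(&init_mm);
	}
	pmd_free(NULL, table);			/* PMD table is no longer reachable by ptdump */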

We can verify the correctness of the algorithm via the following litmus
test (thanks to James Houghton and Will Deacon):

AArch64 ptdump
Variant=Ifetch
{
uint64_t pud=0xa110c;
uint64_t pmd;

0:X0=label:"P1:L0"; 0:X1=instr:"NOP"; 0:X2=lock; 0:X3=pud; 0:X4=pmd;
                    1:X1=0xdead;      1:X2=lock; 1:X3=pud; 1:X4=pmd;
}
 P0				| P1				;
 (* static_key_enable *)	| (* pud_free_pmd_page *)	;
 STR	W1, [X0]		| LDR	X9, [X3]		;
 DC	CVAU,X0			| STR	XZR, [X3]		;
 DSB	ISH			| DSB	ISH			;
 IC	IVAU,X0			| ISB				;
 DSB	ISH			|				;
 ISB				| (* static key *)		;
				| L0:				;
 (* mmap_lock *)		| B	out1			;
 Lwlock:			|				;
 MOV	W7, #1			| (* mmap_lock *)		;
 SWPA	W7, W8, [X2]		| Lrlock:			;
				| MOV	W7, #1			;
				| SWPA	W7, W8, [X2]		;
 (* walk pgtable *)		|				;
 LDR	X9, [X3]		| (* mmap_unlock *)		;
 CBZ	X9, out0		| STLR	WZR, [X2]		;
 EOR	X10, X9, X9		|				;
 LDR	X11, [X4, X10]		| out1:				;
				| EOR	X10, X9, X9		;
 out0:				| STR	X1, [X4, X10]		;

exists (0:X8=0 /\ 1:X8=0 /\	(* Lock acquisitions succeed *)
	0:X9=0xa110c /\		(* P0 sees the valid PUD ...*)
	0:X11=0xdead)		(* ... but the freed PMD *)
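
The forbidden state corresponds to both lock acquisitions succeeding while
P0 still reads the old PUD value (0xa110c) and then loads the poisoned PMD
entry (0xdead). Assuming a herdtools7 build with the instruction-fetch
model available, an invocation along the lines of

	herd7 -variant ifetch ptdump.litmus

(the exact variant spelling may differ between herdtools7 versions) is
expected to show that the condition in the exists clause cannot be reached.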

For an approximate written proof of why this algorithm works, please read
the code comment in [1], which is now removed for the sake of simplicity.

mm-selftests pass. No issues were observed while running test_vmalloc.sh
(which stresses the vmalloc subsystem) in parallel with
cat /sys/kernel/debug/{kernel_page_tables,check_wx_pages} in a loop.

Link: https://lore.kernel.org/all/20250723161827.15802-1-dev.jain@arm.com/ [1]
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
Author: Dev Jain <dev.jain@arm.com>
Date:   2025-09-22 12:11:26 +05:30
Committed-by: Will Deacon <will@kernel.org>
Commit: fa93b45fd3 (parent: 3df6979d22)

 4 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h
--- a/arch/arm64/include/asm/ptdump.h
+++ b/arch/arm64/include/asm/ptdump.h
@@ -7,6 +7,8 @@
 
 #include <linux/ptdump.h>
 
+DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
 #ifdef CONFIG_PTDUMP
 
 #include <linux/mm_types.h>

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -9,18 +9,13 @@
 #define arch_vmap_pud_supported arch_vmap_pud_supported
 static inline bool arch_vmap_pud_supported(pgprot_t prot)
 {
-	/*
-	 * SW table walks can't handle removal of intermediate entries.
-	 */
-	return pud_sect_supported() &&
-	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+	return pud_sect_supported();
 }
 
 #define arch_vmap_pmd_supported arch_vmap_pmd_supported
 static inline bool arch_vmap_pmd_supported(pgprot_t prot)
 {
-	/* See arch_vmap_pud_supported() */
-	return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
+	return true;
 }
 
 #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -56,6 +56,8 @@ enum pgtable_type {
 	TABLE_P4D,
 };
 
+DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);
+
 u64 kimage_voffset __ro_after_init;
 EXPORT_SYMBOL(kimage_voffset);
 
@@ -1665,7 +1667,8 @@ int pmd_clear_huge(pmd_t *pmdp)
 	return 1;
 }
 
-int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr,
+			       bool acquire_mmap_lock)
 {
 	pte_t *table;
 	pmd_t pmd;
@@ -1677,13 +1680,25 @@ int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
 		return 1;
 	}
 
+	/* See comment in pud_free_pmd_page for static key logic */
 	table = pte_offset_kernel(pmdp, addr);
 	pmd_clear(pmdp);
 	__flush_tlb_kernel_pgtable(addr);
+	if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) {
+		mmap_read_lock(&init_mm);
+		mmap_read_unlock(&init_mm);
+	}
+
 	pte_free_kernel(NULL, table);
 	return 1;
 }
 
+int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
+{
+	/* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */
+	return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true);
+}
+
 int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 {
 	pmd_t *table;
@@ -1699,16 +1714,36 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 	}
 
 	table = pmd_offset(pudp, addr);
+
+	/*
+	 * Our objective is to prevent ptdump from reading a PMD table which has
+	 * been freed. In this race, if pud_free_pmd_page observes the key on
+	 * (which got flipped by ptdump) then the mmap lock sequence here will,
+	 * as a result of the mmap write lock/unlock sequence in ptdump, give
+	 * us the correct synchronization. If not, this means that ptdump has
+	 * yet not started walking the pagetables - the sequence of barriers
+	 * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will
+	 * observe an empty PUD.
+	 */
+	pud_clear(pudp);
+	__flush_tlb_kernel_pgtable(addr);
+	if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
+		mmap_read_lock(&init_mm);
+		mmap_read_unlock(&init_mm);
+	}
+
 	pmdp = table;
 	next = addr;
 	end = addr + PUD_SIZE;
 	do {
 		if (pmd_present(pmdp_get(pmdp)))
-			pmd_free_pte_page(pmdp, next);
+			/*
+			 * PMD has been isolated, so ptdump won't see it. No
+			 * need to acquire init_mm.mmap_lock.
+			 */
+			__pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
 	} while (pmdp++, next += PMD_SIZE, next != end);
 
-	pud_clear(pudp);
-	__flush_tlb_kernel_pgtable(addr);
 	pmd_free(NULL, table);
 	return 1;
 }

diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -283,6 +283,13 @@ void note_page_flush(struct ptdump_state *pt_st)
 	note_page(pt_st, 0, -1, pte_val(pte_zero));
 }
 
+static void arm64_ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm)
+{
+	static_branch_inc(&arm64_ptdump_lock_key);
+	ptdump_walk_pgd(st, mm, NULL);
+	static_branch_dec(&arm64_ptdump_lock_key);
+}
+
 void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 {
 	unsigned long end = ~0UL;
@@ -311,7 +318,7 @@ void ptdump_walk(struct seq_file *s, struct ptdump_info *info)
 		}
 	};
 
-	ptdump_walk_pgd(&st.ptdump, info->mm, NULL);
+	arm64_ptdump_walk_pgd(&st.ptdump, info->mm);
 }
 
 static void __init ptdump_initialize(void)
@@ -353,7 +360,7 @@ bool ptdump_check_wx(void)
 		}
 	};
 
-	ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+	arm64_ptdump_walk_pgd(&st.ptdump, &init_mm);
 
 	if (st.wx_pages || st.uxn_pages) {
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",