Our goal is to move towards enabling vmalloc-huge by default on arm64 so
as to reduce TLB pressure. Therefore, we need a way to analyze the
proportion of block mappings in vmalloc space that we get on a production
system; this can be done through ptdump, but currently we disable
vmalloc-huge if CONFIG_PTDUMP_DEBUGFS is on. The reason is that lazy
freeing of kernel pagetables via vmap_try_huge_pxd() may race with ptdump,
so ptdump may dereference a freed, and therefore bogus, pagetable address.
To solve this, we need to synchronize ptdump_walk() and ptdump_check_wx()
with pud_free_pmd_page() and pmd_free_pte_page().
Since this race is very unlikely to happen in practice, we do not want to
penalize the vmalloc pagetable teardown path by unconditionally taking the
init_mm mmap_lock. Therefore, we use static keys. ptdump_walk() and
ptdump_check_wx() are the pagetable walkers; they enable the static
key, and upon observing that, the vmalloc pagetable teardown path gets
patched in with an mmap_read_lock/unlock sequence. A combination of the
patched-in mmap_read_lock/unlock, the acquire semantics of
static_branch_inc(), and the barriers in __flush_tlb_kernel_pgtable()
ensures that ptdump will never get hold of the address of a freed PMD
or PTE table.
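As a rough sketch of that pattern (not the literal patch: the variables
pudp, addr and table are illustrative, and the walker is assumed to hold
the init_mm mmap_lock for write via the generic ptdump_walk_pgd()):

/* Walker side: ptdump_walk() / ptdump_check_wx(), simplified */
static_branch_inc(&arm64_ptdump_lock_key);  /* acquire semantics */
/* ... walk init_mm's pagetables with its mmap_lock held for write ... */
static_branch_dec(&arm64_ptdump_lock_key);

/* Teardown side: pud_free_pmd_page(), and analogously pmd_free_pte_page() */
pud_clear(pudp);
__flush_tlb_kernel_pgtable(addr);           /* DSB/ISB order the clear */
if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
        /*
         * Patched in only while a walker has the key enabled: taking and
         * dropping the read lock waits out any walker that could have
         * observed the old PUD.
         */
        mmap_read_lock(&init_mm);
        mmap_read_unlock(&init_mm);
}
pmd_free(&init_mm, table);                  /* now safe to free the table */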
We can verify the correctness of the algorithm via the following litmus
test (thanks to James Houghton and Will Deacon):
AArch64 ptdump
Variant=Ifetch
{
uint64_t pud=0xa110c;
uint64_t pmd;
0:X0=label:"P1:L0"; 0:X1=instr:"NOP"; 0:X2=lock; 0:X3=pud; 0:X4=pmd;
1:X1=0xdead; 1:X2=lock; 1:X3=pud; 1:X4=pmd;
}
 P0                       | P1                       ;
 (* static_key_enable *)  | (* pud_free_pmd_page *)  ;
 STR W1, [X0]             | LDR X9, [X3]             ;
 DC CVAU,X0               | STR XZR, [X3]            ;
 DSB ISH                  | DSB ISH                  ;
 IC IVAU,X0               | ISB                      ;
 DSB ISH                  |                          ;
 ISB                      | (* static key *)         ;
                          | L0:                      ;
 (* mmap_lock *)          | B out1                   ;
 Lwlock:                  |                          ;
 MOV W7, #1               | (* mmap_lock *)          ;
 SWPA W7, W8, [X2]        | Lrlock:                  ;
                          | MOV W7, #1               ;
                          | SWPA W7, W8, [X2]        ;
 (* walk pgtable *)       |                          ;
 LDR X9, [X3]             | (* mmap_unlock *)        ;
 CBZ X9, out0             | STLR WZR, [X2]           ;
 EOR X10, X9, X9          |                          ;
 LDR X11, [X4, X10]       | out1:                    ;
                          | EOR X10, X9, X9          ;
 out0:                    | STR X1, [X4, X10]        ;
exists (0:X8=0 /\ 1:X8=0 /\ (* Lock acquisitions succeed *)
        0:X9=0xa110c /\     (* P0 sees the valid PUD ... *)
        0:X11=0xdead)       (* ... but the freed PMD *)
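If saved as, say, ptdump.litmus, the test can be run with herd7's
instruction-fetch support (herd7 -variant ifetch ptdump.litmus); the
algorithm is correct iff the exists clause above never validates.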
For an approximate written proof of why this algorithm works, please read
the code comment in [1], which has since been removed for the sake of
simplicity.
mm selftests pass. No issues were observed while running test_vmalloc.sh
(which stresses the vmalloc subsystem) in parallel with reading
/sys/kernel/debug/{kernel_page_tables,check_wx_pages} in a loop.
Link: https://lore.kernel.org/all/20250723161827.15802-1-dev.jain@arm.com/ [1]
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014 ARM Ltd.
 */
#ifndef __ASM_PTDUMP_H
#define __ASM_PTDUMP_H

#include <linux/ptdump.h>

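/*
 * Enabled by the ptdump walkers (ptdump_walk()/ptdump_check_wx()); while it
 * is set, the vmalloc pagetable teardown paths (pud_free_pmd_page() and
 * pmd_free_pte_page()) synchronize with the walkers via the init_mm
 * mmap_lock.
 */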
DECLARE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);

#ifdef CONFIG_PTDUMP

#include <linux/mm_types.h>
#include <linux/seq_file.h>

struct addr_marker {
	unsigned long start_address;
	char *name;
};

struct ptdump_info {
	struct mm_struct *mm;
	const struct addr_marker *markers;
	unsigned long base_addr;
};

struct ptdump_prot_bits {
	ptdesc_t mask;
	ptdesc_t val;
	const char *set;
	const char *clear;
};

struct ptdump_pg_level {
	const struct ptdump_prot_bits *bits;
	char name[4];
	int num;
	ptdesc_t mask;
};

/*
 * The page dumper groups page table entries of the same type into a single
 * description. It uses pg_state to track the range information while
 * iterating over the pte entries. When the continuity is broken it then
 * dumps out a description of the range.
 */
struct ptdump_pg_state {
	struct ptdump_state ptdump;
	struct ptdump_pg_level *pg_level;
	struct seq_file *seq;
	const struct addr_marker *marker;
	const struct mm_struct *mm;
	unsigned long start_address;
	int level;
	ptdesc_t current_prot;
	bool check_wx;
	unsigned long wx_pages;
	unsigned long uxn_pages;
};

void ptdump_walk(struct seq_file *s, struct ptdump_info *info);
void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
	       pteval_t val);
void note_page_pte(struct ptdump_state *st, unsigned long addr, pte_t pte);
void note_page_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd);
void note_page_pud(struct ptdump_state *st, unsigned long addr, pud_t pud);
void note_page_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d);
void note_page_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd);
void note_page_flush(struct ptdump_state *st);
#ifdef CONFIG_PTDUMP_DEBUGFS
#define EFI_RUNTIME_MAP_END	DEFAULT_MAP_WINDOW_64
void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name);
#else
static inline void ptdump_debugfs_register(struct ptdump_info *info,
					   const char *name) { }
#endif /* CONFIG_PTDUMP_DEBUGFS */
#else
static inline void note_page(struct ptdump_state *pt_st, unsigned long addr,
			     int level, pteval_t val) { }
static inline void note_page_pte(struct ptdump_state *st, unsigned long addr, pte_t pte) { }
static inline void note_page_pmd(struct ptdump_state *st, unsigned long addr, pmd_t pmd) { }
static inline void note_page_pud(struct ptdump_state *st, unsigned long addr, pud_t pud) { }
static inline void note_page_p4d(struct ptdump_state *st, unsigned long addr, p4d_t p4d) { }
static inline void note_page_pgd(struct ptdump_state *st, unsigned long addr, pgd_t pgd) { }
static inline void note_page_flush(struct ptdump_state *st) { }
#endif /* CONFIG_PTDUMP */

#endif /* __ASM_PTDUMP_H */
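For context, here is a minimal sketch of how these hooks are typically
consumed, modelled loosely on arch/arm64/mm/ptdump.c; the marker table is
illustrative, but "kernel_page_tables" matches the debugfs file exercised
above:

#include <linux/init.h>
#include <linux/mm_types.h>
#include <asm/ptdump.h>

static struct addr_marker address_markers[] = {
	{ PAGE_OFFSET,	"Linear Mapping start" },	/* illustrative */
	{ -1UL,		NULL },				/* end of table */
};

static struct ptdump_info kernel_ptdump_info = {
	.mm		= &init_mm,
	.markers	= address_markers,
	.base_addr	= PAGE_OFFSET,
};

static int __init ptdump_init(void)
{
	/*
	 * Creates /sys/kernel/debug/kernel_page_tables when
	 * CONFIG_PTDUMP_DEBUGFS=y; falls back to the no-op stub otherwise.
	 */
	ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables");
	return 0;
}
device_initcall(ptdump_init);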