mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-01-05 15:21:53 -05:00
Our virtual KASLR displacement is a randomly chosen multiple of
2 MiB plus an offset that is equal to the physical placement modulo 2
MiB. This arrangement ensures that we can always use 2 MiB block
mappings (or contiguous PTE mappings for 16k or 64k pages) to map the
kernel.
This means that a KASLR offset of less than 2 MiB is simply the product
of this physical displacement, and no randomization has actually taken
place. Currently, we use 'kaslr_offset() > 0' to decide whether or not
randomization has occurred, and so we misidentify this case.
If the kernel image placement is not randomized, modules are allocated
from a dedicated region below the kernel mapping, which is only used for
modules and not for other vmalloc() or vmap() calls.
When randomization is enabled, the kernel image is vmap()'ed randomly
inside the vmalloc region, and modules are allocated in the vicinity of
this mapping to ensure that relative references are always in range.
However, unlike the dedicated module region below the vmalloc region,
this region is not reserved exclusively for modules, and so ordinary
vmalloc() calls may end up overlapping with it. This should rarely
happen, given that vmalloc allocates bottom up, although it cannot be
ruled out entirely.
The misidentified case results in a placement of the kernel image within
2 MiB of its default address. However, the logic that randomizes the
module region is still invoked, and this could result in the module
region overlapping with the start of the vmalloc region, instead of
using the dedicated region below it. If this happens, a single large
vmalloc() or vmap() call will use up the entire region, and leave no
space for loading modules after that.
Since commit 82046702e2 ("efi/libstub/arm64: Replace 'preferred'
offset with alignment check"), this is much more likely to occur on
systems that boot via EFI but lack an implementation of the EFI RNG
protocol, as in that case, the EFI stub will decide to leave the image
where it found it, and the EFI firmware uses 64k alignment only.
Fix this, by correctly identifying the case where the virtual
displacement is a result of the physical displacement only.
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20230223204101.1500373-1-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
407 lines
13 KiB
C
407 lines
13 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* Based on arch/arm/include/asm/memory.h
|
|
*
|
|
* Copyright (C) 2000-2002 Russell King
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*
|
|
* Note: this file should not be included by non-asm/.h files
|
|
*/
|
|
#ifndef __ASM_MEMORY_H
|
|
#define __ASM_MEMORY_H
|
|
|
|
#include <linux/const.h>
|
|
#include <linux/sizes.h>
|
|
#include <asm/page-def.h>
|
|
|
|
/*
|
|
* Size of the PCI I/O space. This must remain a power of two so that
|
|
* IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses.
|
|
*/
|
|
#define PCI_IO_SIZE SZ_16M
|
|
|
|
/*
|
|
* VMEMMAP_SIZE - allows the whole linear region to be covered by
|
|
* a struct page array
|
|
*
|
|
* If we are configured with a 52-bit kernel VA then our VMEMMAP_SIZE
|
|
* needs to cover the memory region from the beginning of the 52-bit
|
|
* PAGE_OFFSET all the way to PAGE_END for 48-bit. This allows us to
|
|
* keep a constant PAGE_OFFSET and "fallback" to using the higher end
|
|
* of the VMEMMAP where 52-bit support is not available in hardware.
|
|
*/
|
|
#define VMEMMAP_SHIFT (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT)
|
|
#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) >> VMEMMAP_SHIFT)
|
|
|
|
/*
|
|
* PAGE_OFFSET - the virtual address of the start of the linear map, at the
|
|
* start of the TTBR1 address space.
|
|
* PAGE_END - the end of the linear map, where all other kernel mappings begin.
|
|
* KIMAGE_VADDR - the virtual address of the start of the kernel image.
|
|
* VA_BITS - the maximum number of bits for virtual addresses.
|
|
*/
|
|
#define VA_BITS (CONFIG_ARM64_VA_BITS)
|
|
#define _PAGE_OFFSET(va) (-(UL(1) << (va)))
|
|
#define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS))
|
|
#define KIMAGE_VADDR (MODULES_END)
|
|
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
|
|
#define MODULES_VADDR (_PAGE_END(VA_BITS_MIN))
|
|
#define MODULES_VSIZE (SZ_128M)
|
|
#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT)))
|
|
#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE)
|
|
#define PCI_IO_END (VMEMMAP_START - SZ_8M)
|
|
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
|
|
#define FIXADDR_TOP (VMEMMAP_START - SZ_32M)
|
|
|
|
#if VA_BITS > 48
|
|
#define VA_BITS_MIN (48)
|
|
#else
|
|
#define VA_BITS_MIN (VA_BITS)
|
|
#endif
|
|
|
|
#define _PAGE_END(va) (-(UL(1) << ((va) - 1)))
|
|
|
|
#define KERNEL_START _text
|
|
#define KERNEL_END _end
|
|
|
|
/*
|
|
* Generic and tag-based KASAN require 1/8th and 1/16th of the kernel virtual
|
|
* address space for the shadow region respectively. They can bloat the stack
|
|
* significantly, so double the (minimum) stack size when they are in use.
|
|
*/
|
|
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
|
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
|
|
#define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \
|
|
+ KASAN_SHADOW_OFFSET)
|
|
#define PAGE_END (KASAN_SHADOW_END - (1UL << (vabits_actual - KASAN_SHADOW_SCALE_SHIFT)))
|
|
#define KASAN_THREAD_SHIFT 1
|
|
#else
|
|
#define KASAN_THREAD_SHIFT 0
|
|
#define PAGE_END (_PAGE_END(VA_BITS_MIN))
|
|
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
|
|
|
|
#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT)
|
|
|
|
/*
|
|
* VMAP'd stacks are allocated at page granularity, so we must ensure that such
|
|
* stacks are a multiple of page size.
|
|
*/
|
|
#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT)
|
|
#define THREAD_SHIFT PAGE_SHIFT
|
|
#else
|
|
#define THREAD_SHIFT MIN_THREAD_SHIFT
|
|
#endif
|
|
|
|
#if THREAD_SHIFT >= PAGE_SHIFT
|
|
#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT)
|
|
#endif
|
|
|
|
#define THREAD_SIZE (UL(1) << THREAD_SHIFT)
|
|
|
|
/*
|
|
* By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
|
|
* checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
|
|
* assembly.
|
|
*/
|
|
#ifdef CONFIG_VMAP_STACK
|
|
#define THREAD_ALIGN (2 * THREAD_SIZE)
|
|
#else
|
|
#define THREAD_ALIGN THREAD_SIZE
|
|
#endif
|
|
|
|
#define IRQ_STACK_SIZE THREAD_SIZE
|
|
|
|
#define OVERFLOW_STACK_SIZE SZ_4K
|
|
|
|
/*
|
|
* With the minimum frame size of [x29, x30], exactly half the combined
|
|
* sizes of the hyp and overflow stacks is the maximum size needed to
|
|
* save the unwound stacktrace; plus an additional entry to delimit the
|
|
* end.
|
|
*/
|
|
#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long))
|
|
|
|
/*
|
|
* Alignment of kernel segments (e.g. .text, .data).
|
|
*
|
|
* 4 KB granule: 16 level 3 entries, with contiguous bit
|
|
* 16 KB granule: 4 level 3 entries, without contiguous bit
|
|
* 64 KB granule: 1 level 3 entry
|
|
*/
|
|
#define SEGMENT_ALIGN SZ_64K
|
|
|
|
/*
|
|
* Memory types available.
|
|
*
|
|
* IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in
|
|
* the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note
|
|
* that protection_map[] only contains MT_NORMAL attributes.
|
|
*/
|
|
#define MT_NORMAL 0
|
|
#define MT_NORMAL_TAGGED 1
|
|
#define MT_NORMAL_NC 2
|
|
#define MT_DEVICE_nGnRnE 3
|
|
#define MT_DEVICE_nGnRE 4
|
|
|
|
/*
|
|
* Memory types for Stage-2 translation
|
|
*/
|
|
#define MT_S2_NORMAL 0xf
|
|
#define MT_S2_DEVICE_nGnRE 0x1
|
|
|
|
/*
|
|
* Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
|
|
* Stage-2 enforces Normal-WB and Device-nGnRE
|
|
*/
|
|
#define MT_S2_FWB_NORMAL 6
|
|
#define MT_S2_FWB_DEVICE_nGnRE 1
|
|
|
|
#ifdef CONFIG_ARM64_4K_PAGES
|
|
#define IOREMAP_MAX_ORDER (PUD_SHIFT)
|
|
#else
|
|
#define IOREMAP_MAX_ORDER (PMD_SHIFT)
|
|
#endif
|
|
|
|
/*
|
|
* Open-coded (swapper_pg_dir - reserved_pg_dir) as this cannot be calculated
|
|
* until link time.
|
|
*/
|
|
#define RESERVED_SWAPPER_OFFSET (PAGE_SIZE)
|
|
|
|
/*
|
|
* Open-coded (swapper_pg_dir - tramp_pg_dir) as this cannot be calculated
|
|
* until link time.
|
|
*/
|
|
#define TRAMP_SWAPPER_OFFSET (2 * PAGE_SIZE)
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
#include <linux/bitops.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/mmdebug.h>
|
|
#include <linux/types.h>
|
|
#include <asm/boot.h>
|
|
#include <asm/bug.h>
|
|
|
|
#if VA_BITS > 48
|
|
extern u64 vabits_actual;
|
|
#else
|
|
#define vabits_actual ((u64)VA_BITS)
|
|
#endif
|
|
|
|
extern s64 memstart_addr;
|
|
/* PHYS_OFFSET - the physical address of the start of memory. */
|
|
#define PHYS_OFFSET ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })
|
|
|
|
/* the virtual base of the kernel image */
|
|
extern u64 kimage_vaddr;
|
|
|
|
/* the offset between the kernel virtual and physical mappings */
|
|
extern u64 kimage_voffset;
|
|
|
|
static inline unsigned long kaslr_offset(void)
|
|
{
|
|
return kimage_vaddr - KIMAGE_VADDR;
|
|
}
|
|
|
|
static inline bool kaslr_enabled(void)
|
|
{
|
|
/*
|
|
* The KASLR offset modulo MIN_KIMG_ALIGN is taken from the physical
|
|
* placement of the image rather than from the seed, so a displacement
|
|
* of less than MIN_KIMG_ALIGN means that no seed was provided.
|
|
*/
|
|
return kaslr_offset() >= MIN_KIMG_ALIGN;
|
|
}
|
|
|
|
/*
|
|
* Allow all memory at the discovery stage. We will clip it later.
|
|
*/
|
|
#define MIN_MEMBLOCK_ADDR 0
|
|
#define MAX_MEMBLOCK_ADDR U64_MAX
|
|
|
|
/*
|
|
* PFNs are used to describe any physical page; this means
|
|
* PFN 0 == physical address 0.
|
|
*
|
|
* This is the PFN of the first RAM page in the kernel
|
|
* direct-mapped view. We assume this is the first page
|
|
* of RAM in the mem_map as well.
|
|
*/
|
|
#define PHYS_PFN_OFFSET (PHYS_OFFSET >> PAGE_SHIFT)
|
|
|
|
/*
|
|
* When dealing with data aborts, watchpoints, or instruction traps we may end
|
|
* up with a tagged userland pointer. Clear the tag to get a sane pointer to
|
|
* pass on to access_ok(), for instance.
|
|
*/
|
|
#define __untagged_addr(addr) \
|
|
((__force __typeof__(addr))sign_extend64((__force u64)(addr), 55))
|
|
|
|
#define untagged_addr(addr) ({ \
|
|
u64 __addr = (__force u64)(addr); \
|
|
__addr &= __untagged_addr(__addr); \
|
|
(__force __typeof__(addr))__addr; \
|
|
})
|
|
|
|
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
|
|
#define __tag_shifted(tag) ((u64)(tag) << 56)
|
|
#define __tag_reset(addr) __untagged_addr(addr)
|
|
#define __tag_get(addr) (__u8)((u64)(addr) >> 56)
|
|
#else
|
|
#define __tag_shifted(tag) 0UL
|
|
#define __tag_reset(addr) (addr)
|
|
#define __tag_get(addr) 0
|
|
#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
|
|
|
|
/*
 * Return @addr with the bits selected by __tag_shifted(0xff) cleared and
 * __tag_shifted(@tag) OR'ed in.
 *
 * When __tag_shifted() is defined as 0UL (no KASAN tag support configured),
 * the mask is empty and the address is returned unchanged.
 */
static inline const void *__tag_set(const void *addr, u8 tag)
{
	const u64 tag_mask = __tag_shifted(0xff);
	u64 stripped = (u64)addr & ~tag_mask;
	u64 tagged = stripped | __tag_shifted(tag);

	return (const void *)tagged;
}
|
|
|
|
#ifdef CONFIG_KASAN_HW_TAGS
|
|
#define arch_enable_tagging_sync() mte_enable_kernel_sync()
|
|
#define arch_enable_tagging_async() mte_enable_kernel_async()
|
|
#define arch_enable_tagging_asymm() mte_enable_kernel_asymm()
|
|
#define arch_force_async_tag_fault() mte_check_tfsr_exit()
|
|
#define arch_get_random_tag() mte_get_random_tag()
|
|
#define arch_get_mem_tag(addr) mte_get_mem_tag(addr)
|
|
#define arch_set_mem_tag_range(addr, size, tag, init) \
|
|
mte_set_mem_tag_range((addr), (size), (tag), (init))
|
|
#endif /* CONFIG_KASAN_HW_TAGS */
|
|
|
|
/*
|
|
* Physical vs virtual RAM address space conversion. These are
|
|
* private definitions which should NOT be used outside memory.h
|
|
* files. Use virt_to_phys/phys_to_virt/__pa/__va instead.
|
|
*/
|
|
|
|
|
|
/*
|
|
* Check whether an arbitrary address is within the linear map, which
|
|
* lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
|
|
* kernel's TTBR1 address range.
|
|
*/
|
|
#define __is_lm_address(addr) (((u64)(addr) - PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
|
|
|
|
#define __lm_to_phys(addr) (((addr) - PAGE_OFFSET) + PHYS_OFFSET)
|
|
#define __kimg_to_phys(addr) ((addr) - kimage_voffset)
|
|
|
|
/*
 * Translate a kernel virtual address to a physical address. The tag is
 * reset first; an address inside the linear map is converted with
 * __lm_to_phys(), anything else is treated as a kernel image address and
 * converted with __kimg_to_phys(). This macro is what __virt_to_phys()
 * expands to when CONFIG_DEBUG_VIRTUAL is not set.
 */
#define __virt_to_phys_nodebug(x) ({ \
|
|
phys_addr_t __x = (phys_addr_t)(__tag_reset(x)); \
|
|
__is_lm_address(__x) ? __lm_to_phys(__x) : __kimg_to_phys(__x); \
|
|
})
|
|
|
|
#define __pa_symbol_nodebug(x) __kimg_to_phys((phys_addr_t)(x))
|
|
|
|
#ifdef CONFIG_DEBUG_VIRTUAL
|
|
extern phys_addr_t __virt_to_phys(unsigned long x);
|
|
extern phys_addr_t __phys_addr_symbol(unsigned long x);
|
|
#else
|
|
#define __virt_to_phys(x) __virt_to_phys_nodebug(x)
|
|
#define __phys_addr_symbol(x) __pa_symbol_nodebug(x)
|
|
#endif /* CONFIG_DEBUG_VIRTUAL */
|
|
|
|
#define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
|
|
#define __phys_to_kimg(x) ((unsigned long)((x) + kimage_voffset))
|
|
|
|
/*
|
|
* Convert a page to/from a physical address
|
|
*/
|
|
#define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page)))
|
|
#define phys_to_page(phys) (pfn_to_page(__phys_to_pfn(phys)))
|
|
|
|
/*
|
|
* Note: Drivers should NOT use these. They are the wrong
|
|
* translation for translating DMA addresses. Use the driver
|
|
* DMA support - see dma-mapping.h.
|
|
*/
|
|
#define virt_to_phys virt_to_phys
|
|
/*
 * Translate the kernel virtual address @x into a physical address by
 * handing its integer value to __virt_to_phys().
 */
static inline phys_addr_t virt_to_phys(const volatile void *x)
{
	unsigned long vaddr = (unsigned long)x;

	return __virt_to_phys(vaddr);
}
|
|
|
|
#define phys_to_virt phys_to_virt
|
|
/*
 * Translate the physical address @x into a pointer, using the
 * __phys_to_virt() conversion.
 */
static inline void *phys_to_virt(phys_addr_t x)
{
	unsigned long vaddr = __phys_to_virt(x);

	return (void *)vaddr;
}
|
|
|
|
/*
|
|
* Drivers should NOT use these either.
|
|
*/
|
|
#define __pa(x) __virt_to_phys((unsigned long)(x))
|
|
#define __pa_symbol(x) __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
|
|
#define __pa_nodebug(x) __virt_to_phys_nodebug((unsigned long)(x))
|
|
#define __va(x) ((void *)__phys_to_virt((phys_addr_t)(x)))
|
|
#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
|
|
#define virt_to_pfn(x) __phys_to_pfn(__virt_to_phys((unsigned long)(x)))
|
|
#define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x))
|
|
|
|
/*
|
|
* virt_to_page(x) convert a _valid_ virtual address to struct page *
|
|
* virt_addr_valid(x) indicates whether a virtual address is valid
|
|
*/
|
|
#define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET)
|
|
|
|
#if defined(CONFIG_DEBUG_VIRTUAL)
|
|
#define page_to_virt(x) ({ \
|
|
__typeof__(x) __page = x; \
|
|
void *__addr = __va(page_to_phys(__page)); \
|
|
(void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\
|
|
})
|
|
#define virt_to_page(x) pfn_to_page(virt_to_pfn(x))
|
|
#else
|
|
/*
 * Convert a struct page pointer to a virtual address: the page's index in
 * the array starting at VMEMMAP_START is scaled by PAGE_SIZE and added to
 * PAGE_OFFSET, and the result is tagged with page_kasan_tag() of the page.
 * The argument is evaluated exactly once.
 */
#define page_to_virt(x)	({						\
	__typeof__(x) __page = (x);					\
	u64 __addr = PAGE_OFFSET +					\
		((((u64)__page - VMEMMAP_START) / sizeof(struct page)) *	\
		 PAGE_SIZE);						\
	(void *)__tag_set((const void *)__addr, page_kasan_tag(__page));	\
})
|
|
|
|
/*
 * Convert a virtual address to a struct page pointer: the tag is reset, the
 * offset from PAGE_OFFSET is divided by PAGE_SIZE to obtain the page index,
 * and that index selects the entry in the array starting at VMEMMAP_START.
 *
 * The argument is parenthesized before the cast so that an expression such
 * as 'ptr + 1' is evaluated as a whole (with pointer arithmetic applied)
 * instead of being split into '(u64)ptr + 1'.
 */
#define virt_to_page(x)	({						\
	u64 __idx = (__tag_reset((u64)(x)) - PAGE_OFFSET) / PAGE_SIZE;	\
	u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page));	\
	(struct page *)__addr;						\
})
|
|
#endif /* CONFIG_DEBUG_VIRTUAL */
|
|
|
|
/*
 * Evaluates to true only when addr, after its tag has been reset, lies in
 * the linear map (__is_lm_address()) and pfn_is_map_memory() accepts the
 * PFN it translates to. The PFN lookup is skipped for addresses outside the
 * linear map because of the && short-circuit.
 */
#define virt_addr_valid(addr) ({ \
|
|
__typeof__(addr) __addr = __tag_reset(addr); \
|
|
__is_lm_address(__addr) && pfn_is_map_memory(virt_to_pfn(__addr)); \
|
|
})
|
|
|
|
void dump_mem_limit(void);
|
|
|
|
static inline bool defer_reserve_crashkernel(void)
|
|
{
|
|
return IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32);
|
|
}
|
|
#endif /* !__ASSEMBLY__ */
|
|
|
|
/*
|
|
* Given that the GIC architecture permits ITS implementations that can only be
|
|
* configured with a LPI table address once, GICv3 systems with many CPUs may
|
|
* end up reserving a lot of different regions after a kexec for their LPI
|
|
* tables (one per CPU), as we are forced to reuse the same memory after kexec
|
|
* (and thus reserve it persistently with EFI beforehand)
|
|
*/
|
|
#if defined(CONFIG_EFI) && defined(CONFIG_ARM_GIC_V3_ITS)
|
|
# define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1)
|
|
#endif
|
|
|
|
/*
|
|
* Memory regions which are marked with flag MEMBLOCK_NOMAP (for example, the memory
|
|
* of the EFI_UNUSABLE_MEMORY type) may divide a continuous memory block into
|
|
* multiple parts. As a result, the number of memory regions is large.
|
|
*/
|
|
#ifdef CONFIG_EFI
|
|
#define INIT_MEMBLOCK_MEMORY_REGIONS (INIT_MEMBLOCK_REGIONS * 8)
|
|
#endif
|
|
|
|
#include <asm-generic/memory_model.h>
|
|
|
|
#endif /* __ASM_MEMORY_H */
|