x86/boot: Move early kernel mapping code into startup/
The startup code that constructs the kernel virtual mapping runs from
the 1:1 mapping of memory itself, and therefore cannot use absolute
symbol references. Before making changes in subsequent patches, move
this code into a separate source file under arch/x86/boot/startup/,
where all such code will be kept from now on.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Dionna Amalie Glaze <dionnaglaze@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kevin Loughlin <kevinloughlin@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: linux-efi@vger.kernel.org
Link: https://lore.kernel.org/r/20250410134117.3713574-16-ardb+git@google.com
commit dbe0ad775c
parent 4cecebf200
committed by: Ingo Molnar
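The constraint described in the commit message is worth unpacking: while
executing from the 1:1 (identity) mapping, the code runs at a physical
address, but global symbols are linked at kernel virtual addresses that
are not mapped yet, so an absolute symbol reference would dereference an
unmapped address. The startup code therefore reaches globals through
RIP-relative accessors. The following is an illustrative sketch modeled
on the kernel's rip_rel_ptr()/RIP_REL_REF() helpers, not a verbatim copy
of them:

/*
 * Illustrative sketch: the LEA computes the symbol's address relative
 * to the current instruction pointer, so it yields the symbol's
 * *runtime* address regardless of which mapping the code is executing
 * from, whereas an absolute 64-bit address baked in at link time would
 * point into the not-yet-installed kernel virtual mapping.
 */
static __always_inline void *rip_rel_ptr(void *p)
{
        asm("leaq %c1(%%rip), %0" : "=r"(p) : "i"(p));
        return p;
}

/* Read or write a global through its RIP-relative address */
#define RIP_REL_REF(var) (*(typeof(&(var)))rip_rel_ptr(&(var)))

This is why the moved code below writes, e.g., RIP_REL_REF(pgdir_shift) = 48
rather than assigning to pgdir_shift directly.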
arch/x86/boot/startup/Makefile

@@ -15,7 +15,7 @@ KMSAN_SANITIZE := n
 UBSAN_SANITIZE := n
 KCOV_INSTRUMENT := n

-obj-$(CONFIG_X86_64) += gdt_idt.o
+obj-$(CONFIG_X86_64) += gdt_idt.o map_kernel.o

 lib-$(CONFIG_X86_64) += la57toggle.o
 lib-$(CONFIG_EFI_MIXED) += efi-mixed.o

arch/x86/boot/startup/map_kernel.c (new file, 224 lines)
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pgtable.h>
+
+#include <asm/init.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/sev.h>
+
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+extern unsigned int next_early_pgt;
+
+static inline bool check_la57_support(void)
+{
+        if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+                return false;
+
+        /*
+         * 5-level paging is detected and enabled at kernel decompression
+         * stage. Only check if it has been enabled there.
+         */
+        if (!(native_read_cr4() & X86_CR4_LA57))
+                return false;
+
+        RIP_REL_REF(__pgtable_l5_enabled) = 1;
+        RIP_REL_REF(pgdir_shift)          = 48;
+        RIP_REL_REF(ptrs_per_p4d)         = 512;
+        RIP_REL_REF(page_offset_base)     = __PAGE_OFFSET_BASE_L5;
+        RIP_REL_REF(vmalloc_base)         = __VMALLOC_BASE_L5;
+        RIP_REL_REF(vmemmap_base)         = __VMEMMAP_BASE_L5;
+
+        return true;
+}
+
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+                                                    pmdval_t *pmd,
+                                                    unsigned long p2v_offset)
+{
+        unsigned long paddr, paddr_end;
+        int i;
+
+        /* Encrypt the kernel and related (if SME is active) */
+        sme_encrypt_kernel(bp);
+
+        /*
+         * Clear the memory encryption mask from the .bss..decrypted section.
+         * The bss section will be memset to zero later in the initialization so
+         * there is no need to zero it after changing the memory encryption
+         * attribute.
+         */
+        if (sme_get_me_mask()) {
+                paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
+                paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);
+
+                for (; paddr < paddr_end; paddr += PMD_SIZE) {
+                        /*
+                         * On SNP, transition the page to shared in the RMP table so that
+                         * it is consistent with the page table attribute change.
+                         *
+                         * __start_bss_decrypted has a virtual address in the high range
+                         * mapping (kernel .text). PVALIDATE, by way of
+                         * early_snp_set_memory_shared(), requires a valid virtual
+                         * address but the kernel is currently running off of the identity
+                         * mapping so use the PA to get a *currently* valid virtual address.
+                         */
+                        early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
+
+                        i = pmd_index(paddr - p2v_offset);
+                        pmd[i] -= sme_get_me_mask();
+                }
+        }
+
+        /*
+         * Return the SME encryption mask (if SME is active) to be used as a
+         * modifier for the initial pgdir entry programmed into CR3.
+         */
+        return sme_get_me_mask();
+}
+
+/* Code in __startup_64() can be relocated during execution, but the compiler
+ * doesn't have to generate PC-relative relocations when accessing globals from
+ * that function. Clang actually does not generate them, which leads to
+ * boot-time crashes. To work around this problem, every global pointer must
+ * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
+ * by subtracting p2v_offset from the RIP-relative address.
+ */
+unsigned long __head __startup_64(unsigned long p2v_offset,
+                                  struct boot_params *bp)
+{
+        pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
+        unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
+        unsigned long va_text, va_end;
+        unsigned long pgtable_flags;
+        unsigned long load_delta;
+        pgdval_t *pgd;
+        p4dval_t *p4d;
+        pudval_t *pud;
+        pmdval_t *pmd, pmd_entry;
+        bool la57;
+        int i;
+
+        la57 = check_la57_support();
+
+        /* Is the address too large? */
+        if (physaddr >> MAX_PHYSMEM_BITS)
+                for (;;);
+
+        /*
+         * Compute the delta between the address I am compiled to run at
+         * and the address I am actually running at.
+         */
+        load_delta = __START_KERNEL_map + p2v_offset;
+        RIP_REL_REF(phys_base) = load_delta;
+
+        /* Is the address not 2M aligned? */
+        if (load_delta & ~PMD_MASK)
+                for (;;);
+
+        va_text = physaddr - p2v_offset;
+        va_end  = (unsigned long)rip_rel_ptr(_end) - p2v_offset;
+
+        /* Include the SME encryption mask in the fixup value */
+        load_delta += sme_get_me_mask();
+
+        /* Fixup the physical addresses in the page table */
+
+        pgd = rip_rel_ptr(early_top_pgt);
+        pgd[pgd_index(__START_KERNEL_map)] += load_delta;
+
+        if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
+                p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
+                p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+                pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
+        }
+
+        RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
+        RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
+
+        for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+                RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
+
+        /*
+         * Set up the identity mapping for the switchover.  These
+         * entries should *NOT* have the global bit set!  This also
+         * creates a bunch of nonsense entries but that is fine --
+         * it avoids problems around wraparound.
+         */
+
+        pud = &early_pgts[0]->pmd;
+        pmd = &early_pgts[1]->pmd;
+        RIP_REL_REF(next_early_pgt) = 2;
+
+        pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
+
+        if (la57) {
+                p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
+
+                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+                pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
+                pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
+
+                i = physaddr >> P4D_SHIFT;
+                p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+                p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
+        } else {
+                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
+                pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
+                pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
+        }
+
+        i = physaddr >> PUD_SHIFT;
+        pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+        pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
+
+        pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
+        /* Filter out unsupported __PAGE_KERNEL_* bits: */
+        pmd_entry &= RIP_REL_REF(__supported_pte_mask);
+        pmd_entry += sme_get_me_mask();
+        pmd_entry += physaddr;
+
+        for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
+                int idx = i + (physaddr >> PMD_SHIFT);
+
+                pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
+        }
+
+        /*
+         * Fixup the kernel text+data virtual addresses. Note that
+         * we might write invalid pmds, when the kernel is relocated
+         * cleanup_highmap() fixes this up along with the mappings
+         * beyond _end.
+         *
+         * Only the region occupied by the kernel image has so far
+         * been checked against the table of usable memory regions
+         * provided by the firmware, so invalidate pages outside that
+         * region. A page table entry that maps to a reserved area of
+         * memory would allow processor speculation into that area,
+         * and on some hardware (particularly the UV platform) even
+         * speculative access to some reserved areas is caught as an
+         * error, causing the BIOS to halt the system.
+         */
+
+        pmd = rip_rel_ptr(level2_kernel_pgt);
+
+        /* invalidate pages before the kernel image */
+        for (i = 0; i < pmd_index(va_text); i++)
+                pmd[i] &= ~_PAGE_PRESENT;
+
+        /* fixup pages that are part of the kernel image */
+        for (; i <= pmd_index(va_end); i++)
+                if (pmd[i] & _PAGE_PRESENT)
+                        pmd[i] += load_delta;
+
+        /* invalidate pages after the kernel image */
+        for (; i < PTRS_PER_PMD; i++)
+                pmd[i] &= ~_PAGE_PRESENT;
+
+        return sme_postprocess_startup(bp, pmd, p2v_offset);
+}
arch/x86/kernel/head64.c

@@ -47,7 +47,7 @@
  * Manage page tables very early on.
  */
 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
-static unsigned int __initdata next_early_pgt;
+unsigned int __initdata next_early_pgt;
 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);

 #ifdef CONFIG_X86_5LEVEL
@@ -67,215 +67,6 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
 EXPORT_SYMBOL(vmemmap_base);
 #endif

-static inline bool check_la57_support(void)
-{
-        if (!IS_ENABLED(CONFIG_X86_5LEVEL))
-                return false;
-
-        /*
-         * 5-level paging is detected and enabled at kernel decompression
-         * stage. Only check if it has been enabled there.
-         */
-        if (!(native_read_cr4() & X86_CR4_LA57))
-                return false;
-
-        RIP_REL_REF(__pgtable_l5_enabled) = 1;
-        RIP_REL_REF(pgdir_shift)          = 48;
-        RIP_REL_REF(ptrs_per_p4d)         = 512;
-        RIP_REL_REF(page_offset_base)     = __PAGE_OFFSET_BASE_L5;
-        RIP_REL_REF(vmalloc_base)         = __VMALLOC_BASE_L5;
-        RIP_REL_REF(vmemmap_base)         = __VMEMMAP_BASE_L5;
-
-        return true;
-}
-
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
-                                                    pmdval_t *pmd,
-                                                    unsigned long p2v_offset)
-{
-        unsigned long paddr, paddr_end;
-        int i;
-
-        /* Encrypt the kernel and related (if SME is active) */
-        sme_encrypt_kernel(bp);
-
-        /*
-         * Clear the memory encryption mask from the .bss..decrypted section.
-         * The bss section will be memset to zero later in the initialization so
-         * there is no need to zero it after changing the memory encryption
-         * attribute.
-         */
-        if (sme_get_me_mask()) {
-                paddr = (unsigned long)rip_rel_ptr(__start_bss_decrypted);
-                paddr_end = (unsigned long)rip_rel_ptr(__end_bss_decrypted);
-
-                for (; paddr < paddr_end; paddr += PMD_SIZE) {
-                        /*
-                         * On SNP, transition the page to shared in the RMP table so that
-                         * it is consistent with the page table attribute change.
-                         *
-                         * __start_bss_decrypted has a virtual address in the high range
-                         * mapping (kernel .text). PVALIDATE, by way of
-                         * early_snp_set_memory_shared(), requires a valid virtual
-                         * address but the kernel is currently running off of the identity
-                         * mapping so use the PA to get a *currently* valid virtual address.
-                         */
-                        early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
-
-                        i = pmd_index(paddr - p2v_offset);
-                        pmd[i] -= sme_get_me_mask();
-                }
-        }
-
-        /*
-         * Return the SME encryption mask (if SME is active) to be used as a
-         * modifier for the initial pgdir entry programmed into CR3.
-         */
-        return sme_get_me_mask();
-}
-
-/* Code in __startup_64() can be relocated during execution, but the compiler
- * doesn't have to generate PC-relative relocations when accessing globals from
- * that function. Clang actually does not generate them, which leads to
- * boot-time crashes. To work around this problem, every global pointer must
- * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
- * by subtracting p2v_offset from the RIP-relative address.
- */
-unsigned long __head __startup_64(unsigned long p2v_offset,
-                                  struct boot_params *bp)
-{
-        pmd_t (*early_pgts)[PTRS_PER_PMD] = rip_rel_ptr(early_dynamic_pgts);
-        unsigned long physaddr = (unsigned long)rip_rel_ptr(_text);
-        unsigned long va_text, va_end;
-        unsigned long pgtable_flags;
-        unsigned long load_delta;
-        pgdval_t *pgd;
-        p4dval_t *p4d;
-        pudval_t *pud;
-        pmdval_t *pmd, pmd_entry;
-        bool la57;
-        int i;
-
-        la57 = check_la57_support();
-
-        /* Is the address too large? */
-        if (physaddr >> MAX_PHYSMEM_BITS)
-                for (;;);
-
-        /*
-         * Compute the delta between the address I am compiled to run at
-         * and the address I am actually running at.
-         */
-        load_delta = __START_KERNEL_map + p2v_offset;
-        RIP_REL_REF(phys_base) = load_delta;
-
-        /* Is the address not 2M aligned? */
-        if (load_delta & ~PMD_MASK)
-                for (;;);
-
-        va_text = physaddr - p2v_offset;
-        va_end  = (unsigned long)rip_rel_ptr(_end) - p2v_offset;
-
-        /* Include the SME encryption mask in the fixup value */
-        load_delta += sme_get_me_mask();
-
-        /* Fixup the physical addresses in the page table */
-
-        pgd = rip_rel_ptr(early_top_pgt);
-        pgd[pgd_index(__START_KERNEL_map)] += load_delta;
-
-        if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
-                p4d = (p4dval_t *)rip_rel_ptr(level4_kernel_pgt);
-                p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
-
-                pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
-        }
-
-        RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
-        RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
-
-        for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
-                RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
-
-        /*
-         * Set up the identity mapping for the switchover.  These
-         * entries should *NOT* have the global bit set!  This also
-         * creates a bunch of nonsense entries but that is fine --
-         * it avoids problems around wraparound.
-         */
-
-        pud = &early_pgts[0]->pmd;
-        pmd = &early_pgts[1]->pmd;
-        RIP_REL_REF(next_early_pgt) = 2;
-
-        pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
-
-        if (la57) {
-                p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
-
-                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-                pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
-                pgd[i + 1] = (pgdval_t)p4d + pgtable_flags;
-
-                i = physaddr >> P4D_SHIFT;
-                p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
-                p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags;
-        } else {
-                i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
-                pgd[i + 0] = (pgdval_t)pud + pgtable_flags;
-                pgd[i + 1] = (pgdval_t)pud + pgtable_flags;
-        }
-
-        i = physaddr >> PUD_SHIFT;
-        pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
-        pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags;
-
-        pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
-        /* Filter out unsupported __PAGE_KERNEL_* bits: */
-        pmd_entry &= RIP_REL_REF(__supported_pte_mask);
-        pmd_entry += sme_get_me_mask();
-        pmd_entry += physaddr;
-
-        for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
-                int idx = i + (physaddr >> PMD_SHIFT);
-
-                pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
-        }
-
-        /*
-         * Fixup the kernel text+data virtual addresses. Note that
-         * we might write invalid pmds, when the kernel is relocated
-         * cleanup_highmap() fixes this up along with the mappings
-         * beyond _end.
-         *
-         * Only the region occupied by the kernel image has so far
-         * been checked against the table of usable memory regions
-         * provided by the firmware, so invalidate pages outside that
-         * region. A page table entry that maps to a reserved area of
-         * memory would allow processor speculation into that area,
-         * and on some hardware (particularly the UV platform) even
-         * speculative access to some reserved areas is caught as an
-         * error, causing the BIOS to halt the system.
-         */
-
-        pmd = rip_rel_ptr(level2_kernel_pgt);
-
-        /* invalidate pages before the kernel image */
-        for (i = 0; i < pmd_index(va_text); i++)
-                pmd[i] &= ~_PAGE_PRESENT;
-
-        /* fixup pages that are part of the kernel image */
-        for (; i <= pmd_index(va_end); i++)
-                if (pmd[i] & _PAGE_PRESENT)
-                        pmd[i] += load_delta;
-
-        /* invalidate pages after the kernel image */
-        for (; i < PTRS_PER_PMD; i++)
-                pmd[i] &= ~_PAGE_PRESENT;
-
-        return sme_postprocess_startup(bp, pmd, p2v_offset);
-}
-
 /* Wipe all early page tables except for the kernel symbol map */
 static void __init reset_early_page_tables(void)
 {
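To make the load_delta fixup in __startup_64() concrete, here is a worked
example with hypothetical numbers (they are not taken from the commit).
The caller passes p2v_offset such that va = physaddr - p2v_offset (see the
va_text assignment above), so if the kernel was linked to run at
__START_KERNEL_map + 0x1000000 but was actually loaded at physical address
0x5000000:

        p2v_offset = 0x5000000 - (__START_KERNEL_map + 0x1000000)
        load_delta = __START_KERNEL_map + p2v_offset
                   = 0x5000000 - 0x1000000
                   = 0x4000000

load_delta is thus the distance between the actual and the link-time load
address (plus the SME mask, when active), and it is exactly the amount
added to every page table entry that stores a physical address:
early_top_pgt, level3_kernel_pgt, level2_fixmap_pgt, and the kernel image
pmds in level2_kernel_pgt.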