diff --git a/Documentation/arch/x86/tdx.rst b/Documentation/arch/x86/tdx.rst
index 719043cd8b46..61670e7df2f7 100644
--- a/Documentation/arch/x86/tdx.rst
+++ b/Documentation/arch/x86/tdx.rst
@@ -142,13 +142,6 @@ but depends on the BIOS to behave correctly.
 Note TDX works with CPU logical online/offline, thus the kernel still
 allows to offline logical CPU and online it again.
 
-Kexec()
-~~~~~~~
-
-TDX host support currently lacks the ability to handle kexec. For
-simplicity only one of them can be enabled in the Kconfig. This will be
-fixed in the future.
-
 Erratum
 ~~~~~~~
 
@@ -171,6 +164,13 @@ If the platform has such erratum, the kernel prints additional message in
 machine check handler to tell user the machine check may be caused by
 kernel bug on TDX private memory.
 
+Kexec
+~~~~~~~
+
+Currently kexec doesn't work on the TDX platforms with the aforementioned
+erratum. It fails when loading the kexec kernel image. Otherwise it
+works normally.
+
 Interaction vs S3 and deeper states
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 9b3db4f797c6..3b192f9a4785 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -27723,17 +27723,14 @@ F:	arch/x86/kernel/unwind_*.c
 X86 TRUST DOMAIN EXTENSIONS (TDX)
 M:	Kirill A. Shutemov
 R:	Dave Hansen
+R:	Rick Edgecombe
 L:	x86@kernel.org
 L:	linux-coco@lists.linux.dev
+L:	kvm@vger.kernel.org
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/tdx
-F:	Documentation/ABI/testing/sysfs-devices-virtual-misc-tdx_guest
-F:	arch/x86/boot/compressed/tdx*
-F:	arch/x86/coco/tdx/
-F:	arch/x86/include/asm/shared/tdx.h
-F:	arch/x86/include/asm/tdx.h
-F:	arch/x86/virt/vmx/tdx/
-F:	drivers/virt/coco/tdx-guest
+N:	tdx
+K:	\b(tdx)
 
 X86 VDSO
 M:	Andy Lutomirski
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e3cccf4256ca..9d034a987c6e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1902,7 +1902,6 @@ config INTEL_TDX_HOST
 	depends on X86_X2APIC
 	select ARCH_KEEP_MEMBLOCK
 	depends on CONTIG_ALLOC
-	depends on !KEXEC_CORE
 	depends on X86_MCE
 	help
 	  Intel Trust Domain Extensions (TDX) protects guest VMs from malicious
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f2ad77929d6e..5cfb27f26583 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -13,6 +13,15 @@
 # define KEXEC_DEBUG_EXC_HANDLER_SIZE	6 /* PUSHI, PUSHI, 2-byte JMP */
 #endif
 
+#ifdef CONFIG_X86_64
+
+#include 
+
+#define RELOC_KERNEL_PRESERVE_CONTEXT	BIT(0)
+#define RELOC_KERNEL_CACHE_INCOHERENT	BIT(1)
+
+#endif
+
 # define KEXEC_CONTROL_PAGE_SIZE	4096
 # define KEXEC_CONTROL_CODE_MAX_SIZE	2048
 
@@ -121,8 +130,7 @@
 typedef unsigned long relocate_kernel_fn(unsigned long indirection_page,
 					  unsigned long pa_control_page,
 					  unsigned long start_address,
-					  unsigned int preserve_context,
-					  unsigned int host_mem_enc_active);
+					  unsigned int flags);
 #endif
 extern relocate_kernel_fn relocate_kernel;
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index bde58f6510ac..a24c7805acdb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -731,6 +731,8 @@ void __noreturn stop_this_cpu(void *dummy);
 void microcode_check(struct cpuinfo_x86 *prev_info);
 void store_cpu_caps(struct cpuinfo_x86 *info);
 
+DECLARE_PER_CPU(bool, cache_state_incoherent);
+
 enum l1tf_mitigations {
 	L1TF_MITIGATION_OFF,
 	L1TF_MITIGATION_AUTO,
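(Illustrative aside, not part of the patch: a minimal sketch of how a caller combines the new RELOC_KERNEL_* flags with the per-CPU cache_state_incoherent flag. It simply mirrors the machine_kexec() hunk further down; the variable names are taken from that hunk.)

	unsigned int relocate_kernel_flags = 0;

	if (image->preserve_context)
		relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;

	/* Must be read before load_segments() resets GS and breaks percpu access. */
	if (this_cpu_read(cache_state_incoherent))
		relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;

	image->start = relocate_kernel_ptr((unsigned long)image->head,
					   virt_to_phys(control_page),
					   image->start, relocate_kernel_flags);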
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 7ddef3a69866..6b338d7f01b7 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -102,10 +102,31 @@ u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
 u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
 void tdx_init(void);
 
+#include 
 #include 
+#include 
 
 typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
 
+static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
+						  struct tdx_module_args *args)
+{
+	lockdep_assert_preemption_disabled();
+
+	/*
+	 * SEAMCALLs are made to the TDX module and can generate dirty
+	 * cachelines of TDX private memory. Mark cache state incoherent
+	 * so that the cache can be flushed during kexec.
+	 *
+	 * This needs to be done before actually making the SEAMCALL,
+	 * because the kexec-ing CPU could send NMIs to stop remote CPUs,
+	 * in which case even disabling IRQs won't help here.
+	 */
+	this_cpu_write(cache_state_incoherent, true);
+
+	return func(fn, args);
+}
+
 static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
 				    struct tdx_module_args *args)
 {
@@ -113,7 +134,9 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
 	u64 ret;
 
 	do {
-		ret = func(fn, args);
+		preempt_disable();
+		ret = __seamcall_dirty_cache(func, fn, args);
+		preempt_enable();
 	} while (ret == TDX_RND_NO_ENTROPY && --retry);
 
 	return ret;
@@ -131,6 +154,8 @@ int tdx_guest_keyid_alloc(void);
 u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
 
+void tdx_quirk_reset_page(struct page *page);
+
 struct tdx_td {
 	/* TD root structure: */
 	struct page *tdr_page;
@@ -146,6 +171,8 @@ struct tdx_td {
 struct tdx_vp {
 	/* TDVP root page */
 	struct page *tdvpr_page;
+	/* precalculated page_to_phys(tdvpr_page) for use in noinstr code */
+	phys_addr_t tdvpr_pa;
 
 	/* TD vCPU control structure: */
 	struct page **tdcx_pages;
@@ -203,5 +230,11 @@ static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
 static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
 #endif	/* CONFIG_INTEL_TDX_HOST */
 
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void);
+#else
+static inline void tdx_cpu_flush_cache_for_kexec(void) { }
+#endif
+
 #endif /* !__ASSEMBLER__ */
 #endif /* _ASM_X86_TDX_H */
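(Illustrative aside, not part of the patch: roughly how the sc_retry() path is consumed. The seamcall()/seamcall_ret() wrappers in arch/x86/virt/vmx/tdx/tdx.c follow this pattern; the helper below is a paraphrase for illustration, not the exact in-tree definition.)

	/*
	 * Every SEAMCALL is funnelled through sc_retry() and therefore through
	 * __seamcall_dirty_cache(), so the per-CPU cache_state_incoherent flag
	 * is set before the TDX module can dirty any private cachelines.
	 */
	static inline u64 seamcall(u64 fn, struct tdx_module_args *args)
	{
		return sc_retry(__seamcall, fn, args);
	}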
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a6f88ca1a6b4..5398db4dedb4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -545,6 +545,23 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
 {
 	u64 msr;
 
+	/*
+	 * Mark that WBINVD is needed during kexec on processors that
+	 * support SME. This provides support for performing a successful
+	 * kexec when going from SME inactive to SME active (or vice-versa).
+	 *
+	 * The cache must be cleared so that if there are entries with the
+	 * same physical address, both with and without the encryption bit,
+	 * they don't race each other when flushed and potentially end up
+	 * with the wrong entry being committed to memory.
+	 *
+	 * Test the CPUID bit directly because with mem_encrypt=off the
+	 * BSP will clear the X86_FEATURE_SME bit and the APs will not
+	 * see it set after that.
+	 */
+	if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+		__this_cpu_write(cache_state_incoherent, true);
+
 	/*
 	 * BIOS support is required for SME and SEV.
 	 *   For SME: If BIOS has enabled SME then adjust x86_phys_bits by
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 697fb99406e6..15088d14904f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_ACPI
 /*
@@ -346,6 +347,22 @@ int machine_kexec_prepare(struct kimage *image)
 	unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
 	int result;
 
+	/*
+	 * Some early TDX-capable platforms have an erratum. A kernel
+	 * partial write (a write transaction of less than a cacheline that
+	 * lands at the memory controller) to TDX private memory poisons
+	 * that memory, and a subsequent read triggers a machine check.
+	 *
+	 * On those platforms the old kernel must reset TDX private
+	 * memory before jumping to the new kernel, otherwise the new
+	 * kernel may see unexpected machine checks. For simplicity
+	 * just fail kexec/kdump on those platforms.
+	 */
+	if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE)) {
+		pr_info_once("Not allowed on platform with tdx_pw_mce bug\n");
+		return -EOPNOTSUPP;
+	}
+
 	/* Setup the identity mapped 64bit page table */
 	result = init_pgtable(image, __pa(control_page));
 	if (result)
@@ -384,16 +401,10 @@ void __nocfi machine_kexec(struct kimage *image)
 {
 	unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
 	relocate_kernel_fn *relocate_kernel_ptr;
-	unsigned int host_mem_enc_active;
+	unsigned int relocate_kernel_flags;
 	int save_ftrace_enabled;
 	void *control_page;
 
-	/*
-	 * This must be done before load_segments() since if call depth tracking
-	 * is used then GS must be valid to make any function calls.
-	 */
-	host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
-
 #ifdef CONFIG_KEXEC_JUMP
 	if (image->preserve_context)
 		save_processor_state();
@@ -427,6 +438,17 @@ void __nocfi machine_kexec(struct kimage *image)
 	 */
 	relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
 
+	relocate_kernel_flags = 0;
+	if (image->preserve_context)
+		relocate_kernel_flags |= RELOC_KERNEL_PRESERVE_CONTEXT;
+
+	/*
+	 * This must be done before load_segments() since it resets
+	 * GS to 0 and percpu data needs the correct GS to work.
+	 */
+	if (this_cpu_read(cache_state_incoherent))
+		relocate_kernel_flags |= RELOC_KERNEL_CACHE_INCOHERENT;
+
 	/*
 	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part. Whenever the visible part is
 	 *
 	 * Take advantage of this here by force loading the segments,
 	 * before the GDT is zapped with an invalid value.
+	 *
+	 * load_segments() resets GS to 0. Don't make any function call
+	 * after here since call depth tracking uses percpu variables to
+	 * operate (relocate_kernel() is explicitly ignored by call depth
+	 * tracking).
 	 */
 	load_segments();
 
@@ -443,8 +470,7 @@
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
 					   virt_to_phys(control_page),
 					   image->start,
-					   image->preserve_context,
-					   host_mem_enc_active);
+					   relocate_kernel_flags);
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (image->preserve_context)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e3a3987b0c4f..4c718f8adc59 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -88,6 +88,16 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
 
+/*
+ * The cache may be in an incoherent state and needs flushing during kexec.
+ * E.g., on SME/TDX platforms, dirty cacheline aliases with and without
+ * encryption bit(s) can coexist and the cache needs to be flushed before
+ * booting to the new kernel to avoid silent memory corruption due to
+ * dirty cachelines with different encryption properties being written
+ * back to memory.
+ */
+DEFINE_PER_CPU(bool, cache_state_incoherent);
+
 /*
  * this gets called so that we can store lazy state into memory and copy the
  * current task into the new thread.
@@ -827,19 +837,7 @@ void __noreturn stop_this_cpu(void *dummy)
 	disable_local_APIC();
 	mcheck_cpu_clear(c);
 
-	/*
-	 * Use wbinvd on processors that support SME. This provides support
-	 * for performing a successful kexec when going from SME inactive
-	 * to SME active (or vice-versa). The cache must be cleared so that
-	 * if there are entries with the same physical address, both with and
-	 * without the encryption bit, they don't race each other when flushed
-	 * and potentially end up with the wrong entry being committed to
-	 * memory.
-	 *
-	 * Test the CPUID bit directly because the machine might've cleared
-	 * X86_FEATURE_SME due to cmdline options.
-	 */
-	if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+	if (this_cpu_read(cache_state_incoherent))
 		wbinvd();
 
 	/*
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index ea604f4d0b52..11e20bb13aca 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -66,8 +66,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	 * %rdi indirection_page
 	 * %rsi pa_control_page
 	 * %rdx start address
-	 * %rcx preserve_context
-	 * %r8  host_mem_enc_active
+	 * %rcx flags: RELOC_KERNEL_*
 	 */
 
 	/* Save the CPU context, used for jumping back */
@@ -111,7 +110,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
 	/* save indirection list for jumping back */
 	movq	%rdi, pa_backup_pages_map(%rip)
 
-	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+	/* Save the flags to %r11 as swap_pages clobbers %rcx. */
 	movq	%rcx, %r11
 
 	/* setup a new stack at the end of the physical control page */
@@ -129,9 +128,8 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	/*
 	 * %rdi indirection page
 	 * %rdx start address
-	 * %r8  host_mem_enc_active
 	 * %r9  page table page
-	 * %r11 preserve_context
+	 * %r11 flags: RELOC_KERNEL_*
 	 * %r13 original CR4 when relocate_kernel() was invoked
 	 */
 
@@ -200,14 +198,21 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	movq	%r9, %cr3
 
 	/*
+	 * If the memory cache is in an incoherent state, e.g., due to
+	 * memory encryption, do WBINVD to flush the cache.
+	 *
 	 * If SME is active, there could be old encrypted cache line
 	 * entries that will conflict with the now unencrypted memory
 	 * used by kexec. Flush the caches before copying the kernel.
+	 *
+	 * Note SME sets this flag to true when the platform supports
+	 * SME, so the WBINVD is performed even if SME is not activated
+	 * by the kernel. But this does no harm.
 	 */
-	testq	%r8, %r8
-	jz .Lsme_off
+	testb	$RELOC_KERNEL_CACHE_INCOHERENT, %r11b
+	jz	.Lnowbinvd
 	wbinvd
-.Lsme_off:
+.Lnowbinvd:
 
 	call	swap_pages
 
@@ -220,7 +225,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	movq	%cr3, %rax
 	movq	%rax, %cr3
 
-	testq	%r11, %r11	/* preserve_context */
+	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
 	jnz	.Lrelocate
 
 	/*
@@ -273,7 +278,13 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
 	ANNOTATE_NOENDBR
 	andq	$PAGE_MASK, %r8
 	lea	PAGE_SIZE(%r8), %rsp
-	movl	$1, %r11d	/* Ensure preserve_context flag is set */
+	/*
+	 * Ensure the RELOC_KERNEL_PRESERVE_CONTEXT flag is set so that
+	 * swap_pages() can swap pages correctly. Note all other
+	 * RELOC_KERNEL_* flags passed to relocate_kernel() are not
+	 * restored.
+	 */
+	movl	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11d
 	call	swap_pages
 	movq	kexec_va_control_page(%rip), %rax
 0:	addq	$virtual_mapped - 0b, %rax
@@ -321,7 +332,7 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
 	UNWIND_HINT_END_OF_STACK
 	/*
 	 * %rdi indirection page
-	 * %r11 preserve_context
+	 * %r11 flags: RELOC_KERNEL_*
 	 */
 	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
 	xorl	%edi, %edi
@@ -357,7 +368,8 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
 	movq	%rdi, %rdx	/* Save destination page to %rdx */
 	movq	%rsi, %rax	/* Save source page to %rax */
 
-	testq	%r11, %r11	/* Only actually swap for ::preserve_context */
+	/* Only actually swap for ::preserve_context */
+	testb	$RELOC_KERNEL_PRESERVE_CONTEXT, %r11b
 	jz	.Lnoswap
 
 	/* copy source page to swap page */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index ca9c8ec7dd01..00f8bfd2330d 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -281,25 +281,6 @@ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
-static void tdx_clear_page(struct page *page)
-{
-	const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
-	void *dest = page_to_virt(page);
-	unsigned long i;
-
-	/*
-	 * The page could have been poisoned. MOVDIR64B also clears
-	 * the poison bit so the kernel can safely use the page again.
-	 */
-	for (i = 0; i < PAGE_SIZE; i += 64)
-		movdir64b(dest + i, zero_page);
-	/*
-	 * MOVDIR64B store uses WC buffer. Prevent following memory reads
-	 * from seeing potentially poisoned cache.
-	 */
-	__mb();
-}
-
 static void tdx_no_vcpus_enter_start(struct kvm *kvm)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
@@ -345,7 +326,7 @@ static int tdx_reclaim_page(struct page *page)
 
 	r = __tdx_reclaim_page(page);
 	if (!r)
-		tdx_clear_page(page);
+		tdx_quirk_reset_page(page);
 	return r;
 }
 
@@ -442,6 +423,16 @@ void tdx_disable_virtualization_cpu(void)
 		tdx_flush_vp(&arg);
 	}
 	local_irq_restore(flags);
+
+	/*
+	 * Flush cache now if kexec is possible: this is necessary to avoid
+	 * having dirty private memory cachelines when the new kernel boots,
+	 * but WBINVD is a relatively expensive operation and doing it during
+	 * kexec can exacerbate races in native_stop_other_cpus(). Do it
+	 * now, since this is a safe moment and there is going to be no more
+	 * TDX activity on this CPU from this point on.
+	 */
+	tdx_cpu_flush_cache_for_kexec();
 }
 
 #define TDX_SEAMCALL_RETRIES 10000
@@ -593,7 +584,7 @@ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
 		return;
 	}
-	tdx_clear_page(kvm_tdx->td.tdr_page);
+	tdx_quirk_reset_page(kvm_tdx->td.tdr_page);
 	__free_page(kvm_tdx->td.tdr_page);
 
 	kvm_tdx->td.tdr_page = NULL;
@@ -861,6 +852,7 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
 	if (tdx->vp.tdvpr_page) {
 		tdx_reclaim_control_page(tdx->vp.tdvpr_page);
 		tdx->vp.tdvpr_page = 0;
+		tdx->vp.tdvpr_pa = 0;
 	}
 
 	tdx->state = VCPU_TD_STATE_UNINITIALIZED;
@@ -1714,7 +1706,7 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 		pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
 		return -EIO;
 	}
-	tdx_clear_page(page);
+	tdx_quirk_reset_page(page);
 	tdx_unpin(kvm, page);
 	return 0;
 }
@@ -2940,6 +2932,13 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
 		return -ENOMEM;
 
 	tdx->vp.tdvpr_page = page;
+	/*
+	 * page_to_phys() does not work in 'noinstr' code, like guest
+	 * entry via tdh_vp_enter(). Precalculate and store it instead
+	 * of doing it at runtime later.
+	 */
+	tdx->vp.tdvpr_pa = page_to_phys(tdx->vp.tdvpr_page);
+
 	tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
 				     GFP_KERNEL);
 	if (!tdx->vp.tdcx_pages) {
@@ -3002,6 +3001,7 @@ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
 	if (tdx->vp.tdvpr_page)
 		__free_page(tdx->vp.tdvpr_page);
 	tdx->vp.tdvpr_page = 0;
+	tdx->vp.tdvpr_pa = 0;
 
 	return ret;
 }
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c7a9a087ccaf..eac403248462 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -633,15 +633,19 @@ static int tdmrs_set_up_pamt_all(struct tdmr_info_list *tdmr_list,
 }
 
 /*
- * Convert TDX private pages back to normal by using MOVDIR64B to
- * clear these pages. Note this function doesn't flush cache of
- * these TDX private pages. The caller should make sure of that.
+ * Convert TDX private pages back to normal by using MOVDIR64B to clear these
+ * pages. Typically, any write to the page will convert it from TDX private back
+ * to normal kernel memory. Systems with the X86_BUG_TDX_PW_MCE erratum need to
+ * do the conversion explicitly via MOVDIR64B.
  */
-static void reset_tdx_pages(unsigned long base, unsigned long size)
+static void tdx_quirk_reset_paddr(unsigned long base, unsigned long size)
 {
 	const void *zero_page = (const void *)page_address(ZERO_PAGE(0));
 	unsigned long phys, end;
 
+	if (!boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+		return;
+
 	end = base + size;
 	for (phys = base; phys < end; phys += 64)
 		movdir64b(__va(phys), zero_page);
@@ -654,17 +658,23 @@ static void reset_tdx_pages(unsigned long base, unsigned long size)
 	mb();
 }
 
-static void tdmr_reset_pamt(struct tdmr_info *tdmr)
+void tdx_quirk_reset_page(struct page *page)
 {
-	tdmr_do_pamt_func(tdmr, reset_tdx_pages);
+	tdx_quirk_reset_paddr(page_to_phys(page), PAGE_SIZE);
+}
+EXPORT_SYMBOL_GPL(tdx_quirk_reset_page);
+
+static void tdmr_quirk_reset_pamt(struct tdmr_info *tdmr)
+{
+	tdmr_do_pamt_func(tdmr, tdx_quirk_reset_paddr);
 }
 
-static void tdmrs_reset_pamt_all(struct tdmr_info_list *tdmr_list)
+static void tdmrs_quirk_reset_pamt_all(struct tdmr_info_list *tdmr_list)
 {
 	int i;
 
 	for (i = 0; i < tdmr_list->nr_consumed_tdmrs; i++)
-		tdmr_reset_pamt(tdmr_entry(tdmr_list, i));
+		tdmr_quirk_reset_pamt(tdmr_entry(tdmr_list, i));
 }
 
 static unsigned long tdmrs_count_pamt_kb(struct tdmr_info_list *tdmr_list)
@@ -1136,15 +1146,7 @@ static int init_tdx_module(void)
 	 * to the kernel.
 	 */
 	wbinvd_on_all_cpus();
-	/*
-	 * According to the TDX hardware spec, if the platform
-	 * doesn't have the "partial write machine check"
-	 * erratum, any kernel read/write will never cause #MC
-	 * in kernel space, thus it's OK to not convert PAMTs
-	 * back to normal. But do the conversion anyway here
-	 * as suggested by the TDX spec.
-	 */
-	tdmrs_reset_pamt_all(&tdx_tdmr_list);
+	tdmrs_quirk_reset_pamt_all(&tdx_tdmr_list);
 err_free_pamts:
 	tdmrs_free_pamt_all(&tdx_tdmr_list);
 err_free_tdmrs:
@@ -1266,7 +1268,7 @@ static bool paddr_is_tdx_private(unsigned long phys)
 		return false;
 
 	/* Get page type from the TDX module */
-	sret = __seamcall_ret(TDH_PHYMEM_PAGE_RDMD, &args);
+	sret = __seamcall_dirty_cache(__seamcall_ret, TDH_PHYMEM_PAGE_RDMD, &args);
 
 	/*
 	 * The SEAMCALL will not return success unless there is a
@@ -1502,11 +1504,6 @@ static inline u64 tdx_tdr_pa(struct tdx_td *td)
 	return page_to_phys(td->tdr_page);
 }
 
-static inline u64 tdx_tdvpr_pa(struct tdx_vp *td)
-{
-	return page_to_phys(td->tdvpr_page);
-}
-
 /*
  * The TDX module exposes a CLFLUSH_BEFORE_ALLOC bit to specify whether
  * a CLFLUSH of pages is required before handing them to the TDX module.
@@ -1518,11 +1515,11 @@ static void tdx_clflush_page(struct page *page)
 	clflush_cache_range(page_to_virt(page), PAGE_SIZE);
 }
 
-noinstr __flatten u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
+noinstr u64 tdh_vp_enter(struct tdx_vp *td, struct tdx_module_args *args)
 {
-	args->rcx = tdx_tdvpr_pa(td);
+	args->rcx = td->tdvpr_pa;
 
-	return __seamcall_saved_ret(TDH_VP_ENTER, args);
+	return __seamcall_dirty_cache(__seamcall_saved_ret, TDH_VP_ENTER, args);
 }
 EXPORT_SYMBOL_GPL(tdh_vp_enter);
 
@@ -1581,7 +1578,7 @@ u64 tdh_vp_addcx(struct tdx_vp *vp, struct page *tdcx_page)
 {
 	struct tdx_module_args args = {
 		.rcx = page_to_phys(tdcx_page),
-		.rdx = tdx_tdvpr_pa(vp),
+		.rdx = vp->tdvpr_pa,
 	};
 
 	tdx_clflush_page(tdcx_page);
@@ -1650,7 +1647,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_create);
 u64 tdh_vp_create(struct tdx_td *td, struct tdx_vp *vp)
 {
 	struct tdx_module_args args = {
-		.rcx = tdx_tdvpr_pa(vp),
+		.rcx = vp->tdvpr_pa,
 		.rdx = tdx_tdr_pa(td),
 	};
 
@@ -1706,7 +1703,7 @@ EXPORT_SYMBOL_GPL(tdh_mr_finalize);
 u64 tdh_vp_flush(struct tdx_vp *vp)
 {
 	struct tdx_module_args args = {
-		.rcx = tdx_tdvpr_pa(vp),
+		.rcx = vp->tdvpr_pa,
 	};
 
 	return seamcall(TDH_VP_FLUSH, &args);
@@ -1752,7 +1749,7 @@ EXPORT_SYMBOL_GPL(tdh_mng_init);
 u64 tdh_vp_rd(struct tdx_vp *vp, u64 field, u64 *data)
 {
 	struct tdx_module_args args = {
-		.rcx = tdx_tdvpr_pa(vp),
+		.rcx = vp->tdvpr_pa,
 		.rdx = field,
 	};
 	u64 ret;
@@ -1769,7 +1766,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_rd);
 u64 tdh_vp_wr(struct tdx_vp *vp, u64 field, u64 data, u64 mask)
 {
 	struct tdx_module_args args = {
-		.rcx = tdx_tdvpr_pa(vp),
+		.rcx = vp->tdvpr_pa,
 		.rdx = field,
 		.r8 = data,
 		.r9 = mask,
@@ -1782,7 +1779,7 @@ EXPORT_SYMBOL_GPL(tdh_vp_wr);
 u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
 {
 	struct tdx_module_args args = {
-		.rcx = tdx_tdvpr_pa(vp),
+		.rcx = vp->tdvpr_pa,
 		.rdx = initial_rcx,
 		.r8 = x2apicid,
 	};
@@ -1870,3 +1867,22 @@ u64 tdh_phymem_page_wbinvd_hkid(u64 hkid, struct page *page)
 	return seamcall(TDH_PHYMEM_PAGE_WBINVD, &args);
 }
 EXPORT_SYMBOL_GPL(tdh_phymem_page_wbinvd_hkid);
+
+#ifdef CONFIG_KEXEC_CORE
+void tdx_cpu_flush_cache_for_kexec(void)
+{
+	lockdep_assert_preemption_disabled();
+
+	if (!this_cpu_read(cache_state_incoherent))
+		return;
+
+	/*
+	 * Private memory cachelines need to be clean at the time of
+	 * kexec. Write them back now, as the caller guarantees that
+	 * there will be no more SEAMCALLs on this CPU.
+	 */
+	wbinvd();
+	this_cpu_write(cache_state_incoherent, false);
+}
+EXPORT_SYMBOL_GPL(tdx_cpu_flush_cache_for_kexec);
+#endif