From fa1d117162aa820a8f73a31ccab85bde6c84725b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 16 Aug 2025 11:06:32 +0100 Subject: [PATCH 1/8] x86/cpu: Detect FreeBSD Bhyve hypervisor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Detect the Bhyve hypervisor and enable 15-bit MSI support if available. Detecting Bhyve used to be a purely cosmetic issue of the kernel printing 'Hypervisor detected: Bhyve' at boot time. But FreeBSD 15.0 will support¹ the 15-bit MSI enlightenment to support more than 255 vCPUs (http://david.woodhou.se/ExtDestId.pdf) which means there's now actually some functional reason to do so. ¹ https://github.com/freebsd/freebsd-src/commit/313a68ea20b4 [ bp: Massage, move tail comment ontop. ] Signed-off-by: David Woodhouse Signed-off-by: Borislav Petkov (AMD) Acked-by: Ahmed S. Darwish Link: https://lore.kernel.org/03802f6f7f5b5cf8c5e8adfe123c397ca8e21093.camel@infradead.org --- arch/x86/Kconfig | 9 +++++ arch/x86/include/asm/hypervisor.h | 2 + arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/bhyve.c | 66 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/hypervisor.c | 3 ++ 5 files changed, 81 insertions(+) create mode 100644 arch/x86/kernel/cpu/bhyve.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 52c8910ba2ef..2be5a50f00c2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -879,6 +879,15 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config BHYVE_GUEST + bool "Bhyve (BSD Hypervisor) Guest support" + depends on X86_64 + help + This option allows to run Linux to recognise when it is running as a + guest in the Bhyve hypervisor, and to support more than 255 vCPUs when + when doing so. More details about Bhyve can be found at https://bhyve.org + and https://wiki.freebsd.org/bhyve/. + config INTEL_TDX_GUEST bool "Intel TDX (Trust Domain Extensions) - Guest Support" depends on X86_64 && CPU_SUP_INTEL diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e41cbf2ec41d..9ad86a7d13f6 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -30,6 +30,7 @@ enum x86_hypervisor_type { X86_HYPER_KVM, X86_HYPER_JAILHOUSE, X86_HYPER_ACRN, + X86_HYPER_BHYVE, }; #ifdef CONFIG_HYPERVISOR_GUEST @@ -64,6 +65,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; extern const struct hypervisor_x86 x86_hyper_acrn; +extern const struct hypervisor_x86 x86_hyper_bhyve; extern struct hypervisor_x86 x86_hyper_xen_hvm; extern bool nopv; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1e26179ff18c..2f8a58ef690e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_X86_SGX) += sgx/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_BHYVE_GUEST) += bhyve.o obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/arch/x86/kernel/cpu/bhyve.c b/arch/x86/kernel/cpu/bhyve.c new file mode 100644 index 000000000000..f1a8ca3dd1ed --- /dev/null +++ b/arch/x86/kernel/cpu/bhyve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FreeBSD Bhyve guest enlightenments + * + * Copyright © 2025 Amazon.com, Inc. or its affiliates. + * + * Author: David Woodhouse + */ + +#include +#include +#include +#include + +static uint32_t bhyve_cpuid_base; +static uint32_t bhyve_cpuid_max; + +#define BHYVE_SIGNATURE "bhyve bhyve " + +#define CPUID_BHYVE_FEATURES 0x40000001 + +/* Features advertised in CPUID_BHYVE_FEATURES %eax */ + +/* MSI Extended Dest ID */ +#define CPUID_BHYVE_FEAT_EXT_DEST_ID (1UL << 0) + +static uint32_t __init bhyve_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return 0; + + bhyve_cpuid_base = cpuid_base_hypervisor(BHYVE_SIGNATURE, 0); + if (!bhyve_cpuid_base) + return 0; + + bhyve_cpuid_max = cpuid_eax(bhyve_cpuid_base); + return bhyve_cpuid_max; +} + +static uint32_t bhyve_features(void) +{ + unsigned int cpuid_leaf = bhyve_cpuid_base | CPUID_BHYVE_FEATURES; + + if (bhyve_cpuid_max < cpuid_leaf) + return 0; + + return cpuid_eax(cpuid_leaf); +} + +static bool __init bhyve_ext_dest_id(void) +{ + return !!(bhyve_features() & CPUID_BHYVE_FEAT_EXT_DEST_ID); +} + +static bool __init bhyve_x2apic_available(void) +{ + return true; +} + +const struct hypervisor_x86 x86_hyper_bhyve __refconst = { + .name = "Bhyve", + .detect = bhyve_detect, + .init.init_platform = x86_init_noop, + .init.x2apic_available = bhyve_x2apic_available, + .init.msi_ext_dest_id = bhyve_ext_dest_id, +}; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 553bfbfc3a1b..f3e9219845e8 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -45,6 +45,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_ACRN_GUEST &x86_hyper_acrn, #endif +#ifdef CONFIG_BHYVE_GUEST + &x86_hyper_bhyve, +#endif }; enum x86_hypervisor_type x86_hyper_type; From 70d1d98934e723b9e463283a542b88f4f009ae82 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 11 Aug 2025 14:33:45 -0700 Subject: [PATCH 2/8] x86/cpu: Rename and move CPU model entry for Diamond Rapids This model was added as INTEL_PANTHERCOVE_X (based on the name of the core) with a comment that the platform name is Diamond Rapids. It was also placed at the end of the file in a new section for family 19 processors. This is different from previous naming as Andrew Cooper noted. PeterZ agreed and posted a patch[1] to fix the name and move it in sequence with other Xeon servers. But without a commit description or sign-off the patch wasn't ever applied. Patch updated to cover one additional use of the #define by turbostat and to change the "Family 6" comment to also list 18 and 19 since new models in these families are mixed in with family 6. Originally-by: Peter Zijlstra Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Link: https://lore.kernel.org/all/20250214130205.GK14028@noisy.programming.kicks-ass.net/ # [1] --- arch/x86/include/asm/intel-family.h | 7 +++---- .../platform/x86/intel/speed_select_if/isst_if_common.c | 2 +- drivers/platform/x86/intel/tpmi_power_domains.c | 2 +- tools/power/x86/turbostat/turbostat.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index e345dbdf933e..f32a0eca2ae5 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -51,7 +51,7 @@ #define INTEL_PENTIUM_MMX IFM(5, 0x04) /* P55C */ #define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */ -/* Family 6 */ +/* Family 6, 18, 19 */ #define INTEL_PENTIUM_PRO IFM(6, 0x01) #define INTEL_PENTIUM_II_KLAMATH IFM(6, 0x03) #define INTEL_PENTIUM_III_DESCHUTES IFM(6, 0x05) @@ -126,6 +126,8 @@ #define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) +#define INTEL_DIAMONDRAPIDS_X IFM(19, 0x01) /* Panther Cove */ + #define INTEL_BARTLETTLAKE IFM(6, 0xD7) /* Raptor Cove */ /* "Hybrid" Processors (P-Core/E-Core) */ @@ -203,9 +205,6 @@ #define INTEL_P4_PRESCOTT_2M IFM(15, 0x04) #define INTEL_P4_CEDARMILL IFM(15, 0x06) /* Also Xeon Dempsey */ -/* Family 19 */ -#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ - /* * Intel CPU core types * diff --git a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c index 71e104a068e9..7449873c3d40 100644 --- a/drivers/platform/x86/intel/speed_select_if/isst_if_common.c +++ b/drivers/platform/x86/intel/speed_select_if/isst_if_common.c @@ -790,7 +790,7 @@ static const struct x86_cpu_id isst_cpu_ids[] = { X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_ICELAKE_D, 0), X86_MATCH_VFM(INTEL_ICELAKE_X, 0), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, SST_HPM_SUPPORTED), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, SST_HPM_SUPPORTED), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), X86_MATCH_VFM(INTEL_SKYLAKE_X, SST_MBOX_SUPPORTED), {} diff --git a/drivers/platform/x86/intel/tpmi_power_domains.c b/drivers/platform/x86/intel/tpmi_power_domains.c index 8641353b2e06..7d93119a4c30 100644 --- a/drivers/platform/x86/intel/tpmi_power_domains.c +++ b/drivers/platform/x86/intel/tpmi_power_domains.c @@ -85,7 +85,7 @@ static const struct x86_cpu_id tpmi_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, NULL), X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, NULL), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, NULL), - X86_MATCH_VFM(INTEL_PANTHERCOVE_X, NULL), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, tpmi_cpu_ids); diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 72a280e7a9d5..47eb2d4d13a5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1195,7 +1195,7 @@ static const struct platform_data turbostat_pdata[] = { { INTEL_EMERALDRAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_X, &spr_features }, { INTEL_GRANITERAPIDS_D, &spr_features }, - { INTEL_PANTHERCOVE_X, &dmr_features }, + { INTEL_DIAMONDRAPIDS_X, &dmr_features }, { INTEL_LAKEFIELD, &cnl_features }, { INTEL_ALDERLAKE, &adl_features }, { INTEL_ALDERLAKE_L, &adl_features }, From af507c6951180ae5e5edb71f56a227ffdcc54f78 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 21 Aug 2025 05:19:09 +0000 Subject: [PATCH 3/8] x86/cpu/cacheinfo: Simplify cacheinfo_amd_init_llc_id() using _cpuid4_info struct _cpuid4_info has the same layout as the CPUID leaf 0x8000001d. Use the encoded definition and amd_fill_cpuid4_info(), get_cache_id() helpers instead of open coding masks and shifts to calculate the llc_id. cacheinfo_amd_init_llc_id() is only called on AMD systems that support X86_FEATURE_TOPOEXT and amd_fill_cpuid4_info() uses the information from CPUID leaf 0x8000001d on all these systems which is consistent with the current open coded implementation. While at it, avoid reading cpu_data() every time get_cache_id() is called and instead pass the APIC ID necessary to return the _cpuid4_info.id from get_cache_id(). No functional changes intended. [ bp: do what Ahmed suggests: merge into one patch, make id4 ptr const. ] Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Acked-by: Ahmed S. Darwish Link: https://lore.kernel.org/20250821051910.7351-2-kprateek.nayak@amd.com --- arch/x86/kernel/cpu/cacheinfo.c | 48 +++++++++++++++------------------ 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index adfa7e8bb865..51a95b07831f 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -289,6 +289,22 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) return i; } +/* + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. + */ +static unsigned int get_cache_id(u32 apicid, const struct _cpuid4_info *id4) +{ + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + + return apicid >> index_msb; +} + /* * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. */ @@ -312,18 +328,11 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) * Newer families: LLC ID is calculated from the number * of threads sharing the L3 cache. */ - u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; + struct _cpuid4_info id4 = {}; - cpuid_count(0x8000001d, llc_index, &eax, &ebx, &ecx, &edx); - if (eax) - num_sharing_cache = ((eax >> 14) & 0xfff) + 1; - - if (num_sharing_cache) { - int index_msb = get_count_order(num_sharing_cache); - - c->topo.llc_id = c->topo.apicid >> index_msb; - } + if (!amd_fill_cpuid4_info(llc_index, &id4)) + c->topo.llc_id = get_cache_id(c->topo.apicid, &id4); } } @@ -598,27 +607,12 @@ int init_cache_level(unsigned int cpu) return 0; } -/* - * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input - * ECX as cache index. Then right shift apicid by the number's order to get - * cache id for this cache node. - */ -static void get_cache_id(int cpu, struct _cpuid4_info *id4) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - unsigned long num_threads_sharing; - int index_msb; - - num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - id4->id = c->topo.apicid >> index_msb; -} - int populate_cache_leaves(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); struct cacheinfo *ci = this_cpu_ci->info_list; u8 cpu_vendor = boot_cpu_data.x86_vendor; + u32 apicid = cpu_data(cpu).topo.apicid; struct amd_northbridge *nb = NULL; struct _cpuid4_info id4 = {}; int idx, ret; @@ -628,7 +622,7 @@ int populate_cache_leaves(unsigned int cpu) if (ret) return ret; - get_cache_id(cpu, &id4); + id4.id = get_cache_id(apicid, &id4); if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) nb = amd_init_l3_cache(idx); From d691c5f87f344a448b1a522284fa314e2bb403e2 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:16 +0000 Subject: [PATCH 4/8] x86/cpu/topology: Check for X86_FEATURE_XTOPOLOGY instead of passing has_xtopology cpu_parse_topology_ext() sets X86_FEATURE_XTOPOLOGY before returning true if any of the XTOPOLOGY leaf (0x80000026 / 0xb) could be parsed successfully. Instead of storing and passing around this return value using "has_xtopology" in parse_topology_amd(), check for X86_FEATURE_XTOPOLOGY directly in parse_8000_001e() to simplify the flow. No functional changes intended. Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/20250901170418.4314-1-kprateek.nayak@amd.com --- arch/x86/kernel/cpu/topology_amd.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index c79ebbb639cb..7ebd4a15c561 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -59,7 +59,7 @@ static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) +static bool parse_8000_001e(struct topo_scan *tscan) { struct { // eax @@ -85,7 +85,7 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) * If leaf 0xb/0x26 is available, then the APIC ID and the domain * shifts are set already. */ - if (!has_topoext) { + if (!cpu_feature_enabled(X86_FEATURE_XTOPOLOGY)) { tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* @@ -175,30 +175,27 @@ static void topoext_fixup(struct topo_scan *tscan) static void parse_topology_amd(struct topo_scan *tscan) { + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); + /* * Try to get SMT, CORE, TILE, and DIE shifts from extended * CPUID leaf 0x8000_0026 on supported processors first. If * extended CPUID leaf 0x8000_0026 is not supported, try to * get SMT and CORE shift from leaf 0xb. If either leaf is * available, cpu_parse_topology_ext() will return true. - */ - bool has_xtopology = cpu_parse_topology_ext(tscan); - - if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) - tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - - /* + * * If XTOPOLOGY leaves (0x26/0xb) are not available, try to * get the CORE shift from leaf 0x8000_0008 first. */ - if (!has_xtopology && !parse_8000_0008(tscan)) + if (!cpu_parse_topology_ext(tscan) && !parse_8000_0008(tscan)) return; /* * Prefer leaf 0x8000001e if available to get the SMT shift and * the initial APIC ID if XTOPOLOGY leaves are not available. */ - if (parse_8000_001e(tscan, has_xtopology)) + if (parse_8000_001e(tscan)) return; /* Try the NODEID MSR */ From bc6397cf0bc4f2b7a47cc6ac44086daf67c3c71c Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:17 +0000 Subject: [PATCH 5/8] x86/cpu/topology: Define AMD64_CPUID_EXT_FEAT MSR Add defines for the 0xc001_1005 MSR (Core::X86::Msr::CPUID_ExtFeatures) used to toggle the extended CPUID features, instead of using literal numbers. Also define and use the bits necessary for an old TOPOEXT fixup on AMD Family 0x15 processors. No functional changes intended. [ bp: Massage, rename MSR to adhere to the documentation name. ] Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/20250901170418.4314-1-kprateek.nayak@amd.com --- arch/x86/include/asm/msr-index.h | 5 +++++ arch/x86/kernel/cpu/topology_amd.c | 7 ++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..a734b5602149 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -631,6 +631,11 @@ #define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_CPUID_FN_7 0xc0011002 #define MSR_AMD64_CPUID_FN_1 0xc0011004 + +#define MSR_AMD64_CPUID_EXT_FEAT 0xc0011005 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT 54 +#define MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT BIT_ULL(MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) + #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_TW_CFG 0xc0011023 diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 7ebd4a15c561..6ac097e13106 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -163,11 +163,12 @@ static void topoext_fixup(struct topo_scan *tscan) c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) return; - if (msr_set_bit(0xc0011005, 54) <= 0) + if (msr_set_bit(MSR_AMD64_CPUID_EXT_FEAT, + MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT_BIT) <= 0) return; - rdmsrq(0xc0011005, msrval); - if (msrval & BIT_64(54)) { + rdmsrq(MSR_AMD64_CPUID_EXT_FEAT, msrval); + if (msrval & MSR_AMD64_CPUID_EXT_FEAT_TOPOEXT) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); } From d98b40c07552a3bdc72bfc5879748707ba7598ae Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Mon, 1 Sep 2025 17:04:18 +0000 Subject: [PATCH 6/8] Documentation/x86/topology: Detail CPUID leaves used for topology enumeration Add a new section describing the different CPUID leaves and fields used to parse topology on x86 systems. [ bp: Cleanups and simplifications ontop. ] Suggested-by: Borislav Petkov Signed-off-by: K Prateek Nayak Signed-off-by: Borislav Petkov (AMD) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/20250901170418.4314-1-kprateek.nayak@amd.com --- Documentation/arch/x86/topology.rst | 191 ++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst index c12837e61bda..86bec8ac2c4d 100644 --- a/Documentation/arch/x86/topology.rst +++ b/Documentation/arch/x86/topology.rst @@ -141,6 +141,197 @@ Thread-related topology information in the kernel: +System topology enumeration +=========================== + +The topology on x86 systems can be discovered using a combination of vendor +specific CPUID leaves which enumerate the processor topology and the cache +hierarchy. + +The CPUID leaves in their preferred order of parsing for each x86 vendor is as +follows: + +1) AMD + + 1) CPUID leaf 0x80000026 [Extended CPU Topology] (Core::X86::Cpuid::ExCpuTopology) + + The extended CPUID leaf 0x80000026 is the extension of the CPUID leaf 0xB + and provides the topology information of Core, Complex, CCD (Die), and + Socket in each level. + + Support for the leaf is discovered by checking if the maximum extended + CPUID level is >= 0x80000026 and then checking if `LogProcAtThisLevel` + in `EBX[15:0]` at a particular level (starting from 0) is non-zero. + + The `LevelType` in `ECX[15:8]` at the level provides the topology domain + the level describes - Core, Complex, CCD(Die), or the Socket. + + The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the + number of bits that need to be right-shifted from `ExtendedLocalApicId` + in `EDX[31:0]` in order to get a unique Topology ID for the topology + level. CPUs with the same Topology ID share the resources at that level. + + CPUID leaf 0x80000026 also provides more information regarding the power + and efficiency rankings, and about the core type on AMD processors with + heterogeneous characteristics. + + If CPUID leaf 0x80000026 is supported, further parsing is not required. + + 2) CPUID leaf 0x0000000B [Extended Topology Enumeration] (Core::X86::Cpuid::ExtTopEnum) + + The extended CPUID leaf 0x0000000B is the predecessor on the extended + CPUID leaf 0x80000026 and only describes the core, and the socket domains + of the processor topology. + + The support for the leaf is discovered by checking if the maximum supported + CPUID level is >= 0xB and then if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + The `LevelType` in `ECX[15:8]` at the level provides the topology domain + that the level describes - Thread, or Processor (Socket). + + The kernel uses the `CoreMaskWidth` from `EAX[4:0]` to discover the + number of bits that need to be right-shifted from the `ExtendedLocalApicId` + in `EDX[31:0]` to get a unique Topology ID for that topology level. CPUs + sharing the Topology ID share the resources at that level. + + If CPUID leaf 0xB is supported, further parsing is not required. + + + 3) CPUID leaf 0x80000008 ECX [Size Identifiers] (Core::X86::Cpuid::SizeId) + + If neither the CPUID leaf 0x80000026 nor 0xB is supported, the number of + CPUs on the package is detected using the Size Identifier leaf + 0x80000008 ECX. + + The support for the leaf is discovered by checking if the supported + extended CPUID level is >= 0x80000008. + + The shifts from the APIC ID for the Socket ID is calculated from the + `ApicIdSize` field in `ECX[15:12]` if it is non-zero. + + If `ApicIdSize` is reported to be zero, the shift is calculated as the + order of the `number of threads` calculated from `NC` field in + `ECX[7:0]` which describes the `number of threads - 1` on the package. + + Unless Extended APIC ID is supported, the APIC ID used to find the + Socket ID is from the `LocalApicId` field of CPUID leaf 0x00000001 + `EBX[31:24]`. + + The topology parsing continues to detect if Extended APIC ID is + supported or not. + + + 4) CPUID leaf 0x8000001E [Extended APIC ID, Core Identifiers, Node Identifiers] + (Core::X86::Cpuid::{ExtApicId,CoreId,NodeId}) + + The support for Extended APIC ID can be detected by checking for the + presence of `TopologyExtensions` in `ECX[22]` of CPUID leaf 0x80000001 + [Feature Identifiers] (Core::X86::Cpuid::FeatureExtIdEcx). + + If Topology Extensions is supported, the APIC ID from `ExtendedApicId` + from CPUID leaf 0x8000001E `EAX[31:0]` should be preferred over that from + `LocalApicId` field of CPUID leaf 0x00000001 `EBX[31:24]` for topology + enumeration. + + On processors of Family 0x17 and above that do not support CPUID leaf + 0x80000026 or CPUID leaf 0xB, the shifts from the APIC ID for the Core + ID is calculated using the order of `number of threads per core` + calculated using the `ThreadsPerCore` field in `EBX[15:8]` which + describes `number of threads per core - 1`. + + On Processors of Family 0x15, the Core ID from `EBX[7:0]` is used as the + `cu_id` (Compute Unit ID) to detect CPUs that share the compute units. + + + All AMD processors that support the `TopologyExtensions` feature store the + `NodeId` from the `ECX[7:0]` of CPUID leaf 0x8000001E + (Core::X86::Cpuid::NodeId) as the per-CPU `node_id`. On older processors, + the `node_id` was discovered using MSR_FAM10H_NODE_ID MSR (MSR + 0x0xc001_100c). The presence of the NODE_ID MSR was detected by checking + `ECX[19]` of CPUID leaf 0x80000001 [Feature Identifiers] + (Core::X86::Cpuid::FeatureExtIdEcx). + + +2) Intel + + On Intel platforms, the CPUID leaves that enumerate the processor + topology are as follows: + + 1) CPUID leaf 0x1F (V2 Extended Topology Enumeration Leaf) + + The CPUID leaf 0x1F is the extension of the CPUID leaf 0xB and provides + the topology information of Core, Module, Tile, Die, DieGrp, and Socket + in each level. + + The support for the leaf is discovered by checking if the supported + CPUID level is >= 0x1F and then `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + The `Domain Type` in `ECX[15:8]` of the sub-leaf provides the topology + domain that the level describes - Core, Module, Tile, Die, DieGrp, and + Socket. + + The kernel uses the value from `EAX[4:0]` to discover the number of + bits that need to be right shifted from the `x2APIC ID` in `EDX[31:0]` + to get a unique Topology ID for the topology level. CPUs with the same + Topology ID share the resources at that level. + + If CPUID leaf 0x1F is supported, further parsing is not required. + + + 2) CPUID leaf 0x0000000B (Extended Topology Enumeration Leaf) + + The extended CPUID leaf 0x0000000B is the predecessor of the V2 Extended + Topology Enumeration Leaf 0x1F and only describes the core, and the + socket domains of the processor topology. + + The support for the leaf is iscovered by checking if the supported CPUID + level is >= 0xB and then checking if `EBX[31:0]` at a particular level + (starting from 0) is non-zero. + + CPUID leaf 0x0000000B shares the same layout as CPUID leaf 0x1F and + should be enumerated in a similar manner. + + If CPUID leaf 0xB is supported, further parsing is not required. + + + 3) CPUID leaf 0x00000004 (Deterministic Cache Parameters Leaf) + + On Intel processors that support neither CPUID leaf 0x1F, nor CPUID leaf + 0xB, the shifts for the SMT domains is calculated using the number of + CPUs sharing the L1 cache. + + Processors that feature Hyper-Threading is detected using `EDX[28]` of + CPUID leaf 0x1 (Basic CPUID Information). + + The order of `Maximum number of addressable IDs for logical processors + sharing this cache` from `EAX[25:14]` of level-0 of CPUID 0x4 provides + the shifts from the APIC ID required to compute the Core ID. + + The APIC ID and Package information is computed using the data from + CPUID leaf 0x1. + + + 4) CPUID leaf 0x00000001 (Basic CPUID Information) + + The mask and shifts to derive the Physical Package (socket) ID is + computed using the `Maximum number of addressable IDs for logical + processors in this physical package` from `EBX[23:16]` of CPUID leaf + 0x1. + + The APIC ID on the legacy platforms is derived from the `Initial APIC + ID` field from `EBX[31:24]` of CPUID leaf 0x1. + + +3) Centaur and Zhaoxin + + Similar to Intel, Centaur and Zhaoxin use a combination of CPUID leaf + 0x00000004 (Deterministic Cache Parameters Leaf) and CPUID leaf 0x00000001 + (Basic CPUID Information) to derive the topology information. + + + System topology examples ======================== From 32278c677947ae2f042c9535674a7fff9a245dd3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Aug 2025 10:23:56 -0700 Subject: [PATCH 7/8] x86/umip: Check that the instruction opcode is at least two bytes When checking for a potential UMIP violation on #GP, verify the decoder found at least two opcode bytes to avoid false positives when the kernel encounters an unknown instruction that starts with 0f. Because the array of opcode.bytes is zero-initialized by insn_init(), peeking at bytes[1] will misinterpret garbage as a potential SLDT or STR instruction, and can incorrectly trigger emulation. E.g. if a VPALIGNR instruction 62 83 c5 05 0f 08 ff vpalignr xmm17{k5},xmm23,XMMWORD PTR [r8],0xff hits a #GP, the kernel emulates it as STR and squashes the #GP (and corrupts the userspace code stream). Arguably the check should look for exactly two bytes, but no three byte opcodes use '0f 00 xx' or '0f 01 xx' as an escape, i.e. it should be impossible to get a false positive if the first two opcode bytes match '0f 00' or '0f 01'. Go with a more conservative check with respect to the existing code to minimize the chances of breaking userspace, e.g. due to decoder weirdness. Analyzed by Nick Bray . Fixes: 1e5db223696a ("x86/umip: Add emulation code for UMIP instructions") Reported-by: Dan Snyder Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- arch/x86/kernel/umip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 5a4b21389b1d..406ac01ce16d 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -156,8 +156,8 @@ static int identify_insn(struct insn *insn) if (!insn->modrm.nbytes) return -EINVAL; - /* All the instructions of interest start with 0x0f. */ - if (insn->opcode.bytes[0] != 0xf) + /* The instructions of interest have 2-byte opcodes: 0F 00 or 0F 01. */ + if (insn->opcode.nbytes < 2 || insn->opcode.bytes[0] != 0xf) return -EINVAL; if (insn->opcode.bytes[1] == 0x1) { From 27b1fd62012dfe9d3eb8ecde344d7aa673695ecf Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 Aug 2025 10:23:57 -0700 Subject: [PATCH 8/8] x86/umip: Fix decoding of register forms of 0F 01 (SGDT and SIDT aliases) Filter out the register forms of 0F 01 when determining whether or not to emulate in response to a potential UMIP violation #GP, as SGDT and SIDT only accept memory operands. The register variants of 0F 01 are used to encode instructions for things like VMX and SGX, i.e. not checking the Mod field would cause the kernel to incorrectly emulate on #GP, e.g. due to a CPL violation on VMLAUNCH. Fixes: 1e5db223696a ("x86/umip: Add emulation code for UMIP instructions") Signed-off-by: Sean Christopherson Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- arch/x86/kernel/umip.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 406ac01ce16d..d432f3824f0c 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -163,8 +163,19 @@ static int identify_insn(struct insn *insn) if (insn->opcode.bytes[1] == 0x1) { switch (X86_MODRM_REG(insn->modrm.value)) { case 0: + /* The reg form of 0F 01 /0 encodes VMX instructions. */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SGDT; case 1: + /* + * The reg form of 0F 01 /1 encodes MONITOR/MWAIT, + * STAC/CLAC, and ENCLS. + */ + if (X86_MODRM_MOD(insn->modrm.value) == 3) + return -EINVAL; + return UMIP_INST_SIDT; case 4: return UMIP_INST_SMSW;