mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2026-05-22 03:44:51 -04:00
Leon [1] and Vinicius [2] noted a topology_span_sane() warning during
their testing starting from v6.16-rc1. Debug that followed pointed to
the tl->mask() for the NODE domain being incorrectly resolved to that of
the highest NUMA domain.
tl->mask() for NODE is set to the sd_numa_mask() which depends on the
global "sched_domains_curr_level" hack. "sched_domains_curr_level" is
set to the "tl->numa_level" during tl traversal in build_sched_domains()
calling sd_init() but was not reset before topology_span_sane().
Since "tl->numa_level" still reflected the old value from
build_sched_domains(), topology_span_sane() for the NODE domain trips
when the span of the last NUMA domain overlaps.
Instead of replicating the "sched_domains_curr_level" hack, get rid of
it entirely and instead, pass the entire "sched_domain_topology_level"
object to tl->cpumask() function to prevent such mishap in the future.
sd_numa_mask() now directly references "tl->numa_level" instead of
relying on the global "sched_domains_curr_level" hack to index into
sched_domains_numa_masks[].
The original warning was reproducible on the following NUMA topology
reported by Leon:
$ sudo numactl -H
available: 5 nodes (0-4)
node 0 cpus: 0 1
node 0 size: 2927 MB
node 0 free: 1603 MB
node 1 cpus: 2 3
node 1 size: 3023 MB
node 1 free: 3008 MB
node 2 cpus: 4 5
node 2 size: 3023 MB
node 2 free: 3007 MB
node 3 cpus: 6 7
node 3 size: 3023 MB
node 3 free: 3002 MB
node 4 cpus: 8 9
node 4 size: 3022 MB
node 4 free: 2718 MB
node distances:
node 0 1 2 3 4
0: 10 39 38 37 36
1: 39 10 38 37 36
2: 38 38 10 37 36
3: 37 37 37 10 36
4: 36 36 36 36 10
The above topology can be mimicked using the following QEMU cmd that was
used to reproduce the warning and test the fix:
sudo qemu-system-x86_64 -enable-kvm -cpu host \
-m 20G -smp cpus=10,sockets=10 -machine q35 \
-object memory-backend-ram,size=4G,id=m0 \
-object memory-backend-ram,size=4G,id=m1 \
-object memory-backend-ram,size=4G,id=m2 \
-object memory-backend-ram,size=4G,id=m3 \
-object memory-backend-ram,size=4G,id=m4 \
-numa node,cpus=0-1,memdev=m0,nodeid=0 \
-numa node,cpus=2-3,memdev=m1,nodeid=1 \
-numa node,cpus=4-5,memdev=m2,nodeid=2 \
-numa node,cpus=6-7,memdev=m3,nodeid=3 \
-numa node,cpus=8-9,memdev=m4,nodeid=4 \
-numa dist,src=0,dst=1,val=39 \
-numa dist,src=0,dst=2,val=38 \
-numa dist,src=0,dst=3,val=37 \
-numa dist,src=0,dst=4,val=36 \
-numa dist,src=1,dst=0,val=39 \
-numa dist,src=1,dst=2,val=38 \
-numa dist,src=1,dst=3,val=37 \
-numa dist,src=1,dst=4,val=36 \
-numa dist,src=2,dst=0,val=38 \
-numa dist,src=2,dst=1,val=38 \
-numa dist,src=2,dst=3,val=37 \
-numa dist,src=2,dst=4,val=36 \
-numa dist,src=3,dst=0,val=37 \
-numa dist,src=3,dst=1,val=37 \
-numa dist,src=3,dst=2,val=37 \
-numa dist,src=3,dst=4,val=36 \
-numa dist,src=4,dst=0,val=36 \
-numa dist,src=4,dst=1,val=36 \
-numa dist,src=4,dst=2,val=36 \
-numa dist,src=4,dst=3,val=36 \
...
[ prateek: Moved common functions to include/linux/sched/topology.h,
reuse the common bits for s390 and ppc, commit message ]
Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1]
Fixes: ccf74128d6 ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84, f55dac1daf
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reported-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Tested-by: Valentin Schneider <vschneid@redhat.com> # x86
Tested-by: Shrikanth Hegde <sshegde@linux.ibm.com> # powerpc
Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2]
179 lines
4.2 KiB
C
179 lines
4.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_POWERPC_TOPOLOGY_H
|
|
#define _ASM_POWERPC_TOPOLOGY_H
|
|
#ifdef __KERNEL__
|
|
|
|
|
|
struct device;
|
|
struct device_node;
|
|
struct drmem_lmb;
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
|
* If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that
|
|
* all zones on all nodes will be eligible for zone_reclaim().
|
|
*/
|
|
#define RECLAIM_DISTANCE 10
|
|
|
|
#include <asm/mmzone.h>
|
|
|
|
#define cpumask_of_node(node) ((node) == -1 ? \
|
|
cpu_all_mask : \
|
|
node_to_cpumask_map[node])
|
|
|
|
struct pci_bus;
|
|
#ifdef CONFIG_PCI
|
|
extern int pcibus_to_node(struct pci_bus *bus);
|
|
#else
|
|
static inline int pcibus_to_node(struct pci_bus *bus)
|
|
{
|
|
return -1;
|
|
}
|
|
#endif
|
|
|
|
#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \
|
|
cpu_all_mask : \
|
|
cpumask_of_node(pcibus_to_node(bus)))
|
|
|
|
int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc);
|
|
extern int __node_distance(int, int);
|
|
#define node_distance(a, b) __node_distance(a, b)
|
|
|
|
extern void __init dump_numa_cpu_topology(void);
|
|
|
|
extern int sysfs_add_device_to_node(struct device *dev, int nid);
|
|
extern void sysfs_remove_device_from_node(struct device *dev, int nid);
|
|
|
|
static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node)
|
|
{
|
|
numa_cpu_lookup_table[cpu] = node;
|
|
}
|
|
|
|
static inline int early_cpu_to_node(int cpu)
|
|
{
|
|
int nid;
|
|
|
|
nid = numa_cpu_lookup_table[cpu];
|
|
|
|
/*
|
|
* Fall back to node 0 if nid is unset (it should be, except bugs).
|
|
* This allows callers to safely do NODE_DATA(early_cpu_to_node(cpu)).
|
|
*/
|
|
return (nid < 0) ? 0 : nid;
|
|
}
|
|
|
|
int of_drconf_to_nid_single(struct drmem_lmb *lmb);
|
|
void update_numa_distance(struct device_node *node);
|
|
|
|
extern void map_cpu_to_node(int cpu, int node);
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
extern void unmap_cpu_from_node(unsigned long cpu);
|
|
#endif /* CONFIG_HOTPLUG_CPU */
|
|
|
|
#else
|
|
|
|
static inline int early_cpu_to_node(int cpu) { return 0; }
|
|
|
|
static inline void dump_numa_cpu_topology(void) {}
|
|
|
|
static inline int sysfs_add_device_to_node(struct device *dev, int nid)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline void sysfs_remove_device_from_node(struct device *dev,
|
|
int nid)
|
|
{
|
|
}
|
|
|
|
static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {}
|
|
|
|
static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb)
|
|
{
|
|
return first_online_node;
|
|
}
|
|
|
|
static inline void update_numa_distance(struct device_node *node) {}
|
|
|
|
#ifdef CONFIG_SMP
|
|
static inline void map_cpu_to_node(int cpu, int node) {}
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
static inline void unmap_cpu_from_node(unsigned long cpu) {}
|
|
#endif /* CONFIG_HOTPLUG_CPU */
|
|
#endif /* CONFIG_SMP */
|
|
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
#if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
|
|
void find_and_update_cpu_nid(int cpu);
|
|
extern int cpu_to_coregroup_id(int cpu);
|
|
#else
|
|
static inline void find_and_update_cpu_nid(int cpu) {}
|
|
static inline int cpu_to_coregroup_id(int cpu)
|
|
{
|
|
#ifdef CONFIG_SMP
|
|
return cpu_to_core_id(cpu);
|
|
#else
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
#endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
|
|
|
|
#include <asm-generic/topology.h>
|
|
|
|
#ifdef CONFIG_SMP
|
|
#include <asm/cputable.h>
|
|
|
|
struct cpumask *cpu_coregroup_mask(int cpu);
|
|
|
|
#ifdef CONFIG_PPC64
|
|
#include <asm/smp.h>
|
|
|
|
#define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu))
|
|
|
|
#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu))
|
|
#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu))
|
|
#define topology_core_id(cpu) (cpu_to_core_id(cpu))
|
|
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef CONFIG_HOTPLUG_SMT
|
|
#include <linux/cpu_smt.h>
|
|
#include <linux/cpumask.h>
|
|
#include <asm/cputhreads.h>
|
|
|
|
static inline bool topology_is_primary_thread(unsigned int cpu)
|
|
{
|
|
return cpu == cpu_first_thread_sibling(cpu);
|
|
}
|
|
#define topology_is_primary_thread topology_is_primary_thread
|
|
|
|
static inline bool topology_smt_thread_allowed(unsigned int cpu)
|
|
{
|
|
return cpu_thread_in_core(cpu) < cpu_smt_num_threads;
|
|
}
|
|
|
|
#define topology_is_core_online topology_is_core_online
|
|
static inline bool topology_is_core_online(unsigned int cpu)
|
|
{
|
|
int i, first_cpu = cpu_first_thread_sibling(cpu);
|
|
|
|
for (i = first_cpu; i < first_cpu + threads_per_core; ++i) {
|
|
if (cpu_online(i))
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
#endif /* __KERNEL__ */
|
|
#endif /* _ASM_POWERPC_TOPOLOGY_H */
|