Merge tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext updates from Tejun Heo:

 - Add a mechanism to count and report internal events. This significantly
   improves visibility into subtle corner conditions.

 - The default idle CPU selection logic is revamped and improved in
   multiple ways including being made topology aware.

 - sched_ext was disabling ttwu_queue for simplicity, which can be
   costly when hardware topology is more complex. Implement
   SCX_OPS_ALLOW_QUEUED_WAKEUP so that BPF schedulers can selectively
   enable ttwu_queue (a minimal sketch follows this list).

 - tools/sched_ext updates to improve compatibility, among other things.

 - Other misc updates and fixes.

 - sched_ext/for-6.14-fixes was pulled a few times to receive
   prerequisite fixes and resolve conflicts.
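
As a quick orientation, here is a hedged sketch (not code from this series; the
struct_ops name, the helper and its callers are illustrative) of how a BPF
scheduler opts back into the ttwu_queue optimization with the new ops flag and
reads the new event counters with scx_bpf_events():

    SEC(".struct_ops.link")
    struct sched_ext_ops example_ops = {
            /* let wakeups use the queued (remote) path again */
            .flags  = SCX_OPS_ALLOW_QUEUED_WAKEUP,
            .name   = "example",
    };

    /* illustrative: dump one of the new core event counters */
    static void dump_events(void)
    {
            struct scx_event_stats events;

            scx_bpf_events(&events, sizeof(events));
            bpf_printk("SCX_EV_ENQ_SLICE_DFL: %lld",
                       events.SCX_EV_ENQ_SLICE_DFL);
    }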

* tag 'sched_ext-for-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (42 commits)
  sched_ext: idle: Refactor scx_select_cpu_dfl()
  sched_ext: idle: Honor idle flags in the built-in idle selection policy
  sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()
  sched_ext: Add trace point to track sched_ext core events
  sched_ext: Change the event type from u64 to s64
  sched_ext: Documentation: add task lifecycle summary
  tools/sched_ext: Provide a compatible helper for scx_bpf_events()
  selftests/sched_ext: Add NUMA-aware scheduler test
  tools/sched_ext: Provide consistent access to scx flags
  sched_ext: idle: Fix scx_bpf_pick_any_cpu_node() behavior
  sched_ext: idle: Introduce scx_bpf_nr_node_ids()
  sched_ext: idle: Introduce node-aware idle cpu kfunc helpers
  sched_ext: idle: Per-node idle cpumasks
  sched_ext: idle: Introduce SCX_OPS_BUILTIN_IDLE_PER_NODE
  sched_ext: idle: Make idle static keys private
  sched/topology: Introduce for_each_node_numadist() iterator
  mm/numa: Introduce nearest_node_nodemask()
  nodemask: numa: reorganize inclusion path
  nodemask: add nodes_copy()
  tools/sched_ext: Sync with scx repo
  ...
Linus Torvalds, 2025-03-24 17:23:48 -07:00
25 changed files with 2148 additions and 783 deletions

View File

@@ -294,6 +294,42 @@ dispatching, and must be dispatched to with ``scx_bpf_dsq_insert()``. See
the function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c``
for more information.
Task Lifecycle
--------------
The following pseudo-code summarizes the entire lifecycle of a task managed
by a sched_ext scheduler:
.. code-block:: c
   ops.init_task();             /* A new task is created */
   ops.enable();                /* Enable BPF scheduling for the task */

   while (task in SCHED_EXT) {
           if (task can migrate)
                   ops.select_cpu(); /* Called on wakeup (optimization) */

           ops.runnable();      /* Task becomes ready to run */

           while (task is runnable) {
                   if (task is not in a DSQ) {
                           ops.enqueue(); /* Task can be added to a DSQ */

                           /* A CPU becomes available */

                           ops.dispatch(); /* Task is moved to a local DSQ */
                   }

                   ops.running();  /* Task starts running on its assigned CPU */
                   ops.tick();     /* Called every 1/HZ seconds */
                   ops.stopping(); /* Task stops running (time slice expires or wait) */
           }

           ops.quiescent();     /* Task releases its assigned CPU (wait) */
   }

   ops.disable();               /* Disable BPF scheduling for the task */
   ops.exit_task();             /* Task is destroyed */
Where to Look
=============

View File

@@ -21196,8 +21196,7 @@ S: Maintained
W: https://github.com/sched-ext/scx
T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
F: include/linux/sched/ext.h
F: kernel/sched/ext.h
F: kernel/sched/ext.c
F: kernel/sched/ext*
F: tools/sched_ext/
F: tools/testing/selftests/sched_ext

View File

@@ -94,7 +94,6 @@
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/nodemask_types.h>
#include <linux/numa.h>
#include <linux/random.h>
extern nodemask_t _unused_nodemask_arg_;
@@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}
#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_copy(nodemask_t *dstp,
const nodemask_t *srcp, unsigned int nbits)
{
bitmap_copy(dstp->bits, srcp->bits, nbits);
}
#define nodes_complement(dst, src) \
__nodes_complement(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_complement(nodemask_t *dstp,
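
A minimal usage sketch of the new nodes_copy() helper (not part of this diff;
the caller and the choice of source mask are assumptions):

    /* Take a private, modifiable copy of the online-node mask. */
    nodemask_t unvisited;

    nodes_copy(unvisited, node_states[N_ONLINE]);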

View File

@@ -3,7 +3,16 @@
#define __LINUX_NODEMASK_TYPES_H
#include <linux/bitops.h>
#include <linux/numa.h>
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT CONFIG_NODES_SHIFT
#else
#define NODES_SHIFT 0
#endif
#define MAX_NUMNODES (1 << NODES_SHIFT)
#define NUMA_NO_NODE (-1)
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

View File

@@ -3,16 +3,8 @@
#define _LINUX_NUMA_H
#include <linux/init.h>
#include <linux/types.h>
#include <linux/nodemask.h>
#ifdef CONFIG_NODES_SHIFT
#define NODES_SHIFT CONFIG_NODES_SHIFT
#else
#define NODES_SHIFT 0
#endif
#define MAX_NUMNODES (1 << NODES_SHIFT)
#define NUMA_NO_NODE (-1)
#define NUMA_NO_MEMBLK (-1)
static inline bool numa_valid_node(int nid)
@@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid);
/* Generic implementation available */
int numa_nearest_node(int node, unsigned int state);
int nearest_node_nodemask(int node, nodemask_t *mask);
#ifndef memory_add_physaddr_to_nid
int memory_add_physaddr_to_nid(u64 start);
#endif
@@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state)
return NUMA_NO_NODE;
}
static inline int nearest_node_nodemask(int node, nodemask_t *mask)
{
return NUMA_NO_NODE;
}
static inline int memory_add_physaddr_to_nid(u64 start)
{
return 0;

View File

@@ -146,6 +146,7 @@ struct sched_ext_entity {
u32 weight;
s32 sticky_cpu;
s32 holding_cpu;
s32 selected_cpu;
u32 kf_mask; /* see scx_kf_mask above */
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
atomic_long_t ops_state;

View File

@@ -261,6 +261,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
}
#endif /* CONFIG_NUMA */
/**
* for_each_node_numadist() - iterate over nodes in increasing distance
* order, starting from a given node
* @node: the iteration variable and the starting node.
* @unvisited: a nodemask to keep track of the unvisited nodes.
*
* This macro iterates over NUMA node IDs in increasing distance from the
* starting @node and yields MAX_NUMNODES when all the nodes have been
* visited.
*
* Note that by the time the loop completes, the @unvisited nodemask will
* be fully cleared, unless the loop exits early.
*
* The difference between for_each_node() and for_each_node_numadist() is
* that the former iterates over nodes in numerical order, whereas the
* latter iterates over nodes in increasing order of distance.
*
* The complexity of this iterator is O(N^2), where N represents the
* number of nodes, as each iteration involves scanning all nodes to
* find the one with the shortest distance.
*
* Requires rcu_lock to be held.
*/
#define for_each_node_numadist(node, unvisited) \
for (int __start = (node), \
(node) = nearest_node_nodemask((__start), &(unvisited)); \
(node) < MAX_NUMNODES; \
node_clear((node), (unvisited)), \
(node) = nearest_node_nodemask((__start), &(unvisited)))
/**
* for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
* from a given node.
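
A hedged usage sketch of the new iterator (not part of the patch; the starting
node and the per-node work are placeholders):

    nodemask_t unvisited = node_states[N_ONLINE];
    int node = numa_node_id();      /* start from the local node */

    rcu_read_lock();
    for_each_node_numadist(node, unvisited) {
            /* nodes are visited nearest-first; e.g. probe per-node state here */
    }
    rcu_read_unlock();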

View File

@@ -26,6 +26,25 @@ TRACE_EVENT(sched_ext_dump,
)
);
TRACE_EVENT(sched_ext_event,
TP_PROTO(const char *name, __s64 delta),
TP_ARGS(name, delta),
TP_STRUCT__entry(
__string(name, name)
__field( __s64, delta )
),
TP_fast_assign(
__assign_str(name);
__entry->delta = delta;
),
TP_printk("name %s delta %lld",
__get_str(name), __entry->delta
)
);
#endif /* _TRACE_SCHED_EXT_H */
/* This part must be outside protection */
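
For orientation, emitting the new tracepoint boils down to something like the
following sketch; the in-kernel call sites live in kernel/sched/ext.c (whose
diff is suppressed below), and the event name and delta here are illustrative:

    /* illustrative only: TRACE_EVENT(sched_ext_event) generates this helper */
    trace_sched_ext_event("SCX_EV_ENQ_SLICE_DFL", 1);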

View File

@@ -61,6 +61,7 @@
#ifdef CONFIG_SCHED_CLASS_EXT
# include "ext.c"
# include "ext_idle.c"
#endif
#include "syscalls.c"

View File

@@ -3922,13 +3922,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
* The BPF scheduler may depend on select_task_rq() being invoked during
* wakeups. In addition, @p may end up executing on a different CPU
* regardless of what happens in the wakeup path making the ttwu_queue
* optimization less meaningful. Skip if on SCX.
*/
if (task_on_scx(p))
/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
if (!scx_allow_ttwu_queue(p))
return false;
/*

File diff suppressed because it is too large.

View File

@@ -8,6 +8,8 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
{
return !scx_enabled() ||
static_branch_likely(&scx_ops_allow_queued_wakeup) ||
p->sched_class != &ext_sched_class;
}
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */

kernel/sched/ext_idle.c (new file, 1171 lines)

File diff suppressed because it is too large.

kernel/sched/ext_idle.h (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
*
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
* Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
*/
#ifndef _KERNEL_SCHED_EXT_IDLE_H
#define _KERNEL_SCHED_EXT_IDLE_H
struct sched_ext_ops;
#ifdef CONFIG_SMP
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
void scx_idle_init_masks(void);
bool scx_idle_test_and_clear_cpu(int cpu);
s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
#else /* !CONFIG_SMP */
static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
static inline void scx_idle_init_masks(void) {}
static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
{
return -EBUSY;
}
#endif /* CONFIG_SMP */
s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
void scx_idle_enable(struct sched_ext_ops *ops);
void scx_idle_disable(void);
int scx_idle_init(void);
#endif /* _KERNEL_SCHED_EXT_IDLE_H */
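
A hedged sketch of how these helpers compose from the scheduler core (not taken
from the patch; p, prev_cpu and the fallback policy are assumptions):

    s32 cpu;

    /* Prefer an idle CPU on the previous CPU's node; a negative value (-EBUSY) means none found. */
    cpu = scx_pick_idle_cpu(p->cpus_ptr, cpu_to_node(prev_cpu), 0);
    if (cpu < 0)
            cpu = prev_cpu;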

View File

@@ -196,6 +196,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
/**
* nearest_node_nodemask - Find the node in @mask at the nearest distance
* from @node.
*
* @node: a valid node ID to start the search from.
* @mask: a pointer to a nodemask representing the allowed nodes.
*
* This function iterates over all nodes in @mask, calculates the distance
* from the starting @node, and returns the node ID that is closest to
* @node, or MAX_NUMNODES if no node is found.
*
* Note that @node must be a valid node ID usable with node_distance(),
* providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
* or unexpected behavior.
*/
int nearest_node_nodemask(int node, nodemask_t *mask)
{
int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
for_each_node_mask(n, *mask) {
dist = node_distance(node, n);
if (dist < min_dist) {
min_dist = dist;
min_node = n;
}
}
return min_node;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
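
For illustration (not part of the patch), the expected behavior of the new
helper on a trivial mask; the node IDs are hypothetical:

    nodemask_t mask = nodemask_of_node(1);

    /* Node 1 is the only candidate, so it is also the nearest. */
    int nearest = nearest_node_nodemask(0, &mask);      /* == 1 */

    /* An empty mask yields MAX_NUMNODES. */
    nodes_clear(mask);
    nearest = nearest_node_nodemask(0, &mask);           /* == MAX_NUMNODES */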

View File

@@ -7,6 +7,13 @@
#ifndef __SCX_COMMON_BPF_H
#define __SCX_COMMON_BPF_H
/*
* The generated kfunc prototypes in vmlinux.h are missing address space
* attributes which cause build failures. For now, suppress the generated
* prototypes. See https://github.com/sched-ext/scx/issues/1111.
*/
#define BPF_NO_KFUNC_PROTOTYPES
#ifdef LSP
#define __bpf__
#include "../vmlinux.h"
@@ -18,6 +25,7 @@
#include <bpf/bpf_tracing.h>
#include <asm-generic/errno.h>
#include "user_exit_info.h"
#include "enum_defs.autogen.h"
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
@@ -62,21 +70,28 @@ void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym
u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
u32 scx_bpf_nr_node_ids(void) __ksym __weak;
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
int scx_bpf_cpu_node(s32 cpu) __ksym __weak;
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak;
const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
/*
* Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
@@ -84,6 +99,9 @@ u64 scx_bpf_now(void) __ksym __weak;
*/
#define BPF_FOR_EACH_ITER (&___it)
#define scx_read_event(e, name) \
(bpf_core_field_exists((e)->name) ? (e)->name : 0)
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
@@ -584,6 +602,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
__u.__val; \
})
#define READ_ONCE_ARENA(type, x) \
({ \
union { type __val; char __c[1]; } __u = \
{ .__c = { 0 } }; \
__read_once_size((void *)&(x), __u.__c, sizeof(x)); \
__u.__val; \
})
#define WRITE_ONCE_ARENA(type, x, val) \
({ \
union { type __val; char __c[1]; } __u = \
{ .__val = (val) }; \
__write_once_size((void *)&(x), __u.__c, sizeof(x)); \
__u.__val; \
})
/*
* log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
* @v: The value for which we're computing the base 2 logarithm.
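
A hedged usage sketch of the new arena-aware accessors (not from the patch;
struct node_ctx, nr_queued and bump_nr_queued are hypothetical, and the pointer
is assumed to reference BPF arena memory):

    struct node_ctx { u64 nr_queued; };

    /* @nodep is assumed to point into BPF arena memory. */
    static void bump_nr_queued(struct node_ctx *nodep)
    {
            u64 cnt = READ_ONCE_ARENA(u64, nodep->nr_queued);

            WRITE_ONCE_ARENA(u64, nodep->nr_queued, cnt + 1);
    }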

View File

@@ -16,6 +16,7 @@
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include "enum_defs.autogen.h"
typedef uint8_t u8;
typedef uint16_t u16;

View File

@@ -125,11 +125,106 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
false; \
})
/**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
* in a compatible way. We will preserve this __COMPAT helper until v6.16.
*
* @enq_flags: enqueue flags from ops.enqueue()
*
* Return: True if SCX_ENQ_CPU_SELECTED is turned on in @enq_flags
*/
static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
{
#ifdef HAVE_SCX_ENQ_CPU_SELECTED
/*
* This is the case where the BPF code was compiled against a vmlinux.h
* in which the enum SCX_ENQ_CPU_SELECTED exists.
*/
/*
* We should temporarily suspend the macro expansion of
* 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED' being
* rewritten to '__SCX_ENQ_CPU_SELECTED' when 'SCX_ENQ_CPU_SELECTED'
* is defined in 'scripts/gen_enums.py'.
*/
#pragma push_macro("SCX_ENQ_CPU_SELECTED")
#undef SCX_ENQ_CPU_SELECTED
u64 flag;
/*
* When the kernel did not have SCX_ENQ_CPU_SELECTED,
* select_task_rq_scx() was never skipped. Thus, this case should be
* treated as if the CPU has already been selected.
*/
if (!bpf_core_enum_value_exists(enum scx_enq_flags,
SCX_ENQ_CPU_SELECTED))
return true;
flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED);
return enq_flags & flag;
/*
* Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'.
*/
#pragma pop_macro("SCX_ENQ_CPU_SELECTED")
#else
/*
* This is the case where the BPF code was compiled against a vmlinux.h
* in which the enum SCX_ENQ_CPU_SELECTED does NOT exist.
*/
return true;
#endif /* HAVE_SCX_ENQ_CPU_SELECTED */
}
#define scx_bpf_now() \
(bpf_ksym_exists(scx_bpf_now) ? \
scx_bpf_now() : \
bpf_ktime_get_ns())
/*
* v6.15: Introduce event counters.
*
* Preserve the following macro until v6.17.
*/
#define __COMPAT_scx_bpf_events(events, size) \
(bpf_ksym_exists(scx_bpf_events) ? \
scx_bpf_events(events, size) : ({}))
/*
* v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle
* cpumasks.
*
* Preserve the following __COMPAT_scx_*_node macros until v6.17.
*/
#define __COMPAT_scx_bpf_nr_node_ids() \
(bpf_ksym_exists(scx_bpf_nr_node_ids) ? \
scx_bpf_nr_node_ids() : 1U)
#define __COMPAT_scx_bpf_cpu_node(cpu) \
(bpf_ksym_exists(scx_bpf_cpu_node) ? \
scx_bpf_cpu_node(cpu) : 0)
#define __COMPAT_scx_bpf_get_idle_cpumask_node(node) \
(bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ? \
scx_bpf_get_idle_cpumask_node(node) : \
scx_bpf_get_idle_cpumask()) \
#define __COMPAT_scx_bpf_get_idle_smtmask_node(node) \
(bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ? \
scx_bpf_get_idle_smtmask_node(node) : \
scx_bpf_get_idle_smtmask())
#define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) \
(bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ? \
scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) : \
scx_bpf_pick_idle_cpu(cpus_allowed, flags))
#define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) \
(bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ? \
scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \
scx_bpf_pick_any_cpu(cpus_allowed, flags))
/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

View File

@@ -106,8 +106,20 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
return false;
}
#define SCX_OPS_SWITCH_PARTIAL \
__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
#define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name)
#define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE)
#define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST)
#define SCX_OPS_ENQ_EXITING SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING)
#define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL)
#define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED)
#define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP)
#define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE)
#define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name)
#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE)
#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE)
static inline long scx_hotplug_seq(void)
{
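
A hedged usage sketch of the new flag helpers from a userspace loader (not part
of the patch; skel and example_ops come from a hypothetical generated skeleton):

    /*
     * __COMPAT_ENUM_OR_ZERO() resolves a flag to 0 on kernels that do not
     * know it, so the OR below degrades gracefully on older kernels.
     */
    skel->struct_ops.example_ops->flags |= SCX_OPS_BUILTIN_IDLE_PER_NODE |
                                           SCX_OPS_ALLOW_QUEUED_WAKEUP;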

View File

@@ -0,0 +1,120 @@
/*
* WARNING: This file is autogenerated from gen_enum_defs.py [1].
*
* [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py
*/
#ifndef __ENUM_DEFS_AUTOGEN_H__
#define __ENUM_DEFS_AUTOGEN_H__
#define HAVE_SCX_DSP_DFL_MAX_BATCH
#define HAVE_SCX_DSP_MAX_LOOPS
#define HAVE_SCX_WATCHDOG_MAX_TIMEOUT
#define HAVE_SCX_EXIT_BT_LEN
#define HAVE_SCX_EXIT_MSG_LEN
#define HAVE_SCX_EXIT_DUMP_DFL_LEN
#define HAVE_SCX_CPUPERF_ONE
#define HAVE_SCX_OPS_TASK_ITER_BATCH
#define HAVE_SCX_CPU_PREEMPT_RT
#define HAVE_SCX_CPU_PREEMPT_DL
#define HAVE_SCX_CPU_PREEMPT_STOP
#define HAVE_SCX_CPU_PREEMPT_UNKNOWN
#define HAVE_SCX_DEQ_SLEEP
#define HAVE_SCX_DEQ_CORE_SCHED_EXEC
#define HAVE_SCX_DSQ_FLAG_BUILTIN
#define HAVE_SCX_DSQ_FLAG_LOCAL_ON
#define HAVE_SCX_DSQ_INVALID
#define HAVE_SCX_DSQ_GLOBAL
#define HAVE_SCX_DSQ_LOCAL
#define HAVE_SCX_DSQ_LOCAL_ON
#define HAVE_SCX_DSQ_LOCAL_CPU_MASK
#define HAVE_SCX_DSQ_ITER_REV
#define HAVE___SCX_DSQ_ITER_HAS_SLICE
#define HAVE___SCX_DSQ_ITER_HAS_VTIME
#define HAVE___SCX_DSQ_ITER_USER_FLAGS
#define HAVE___SCX_DSQ_ITER_ALL_FLAGS
#define HAVE_SCX_DSQ_LNODE_ITER_CURSOR
#define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT
#define HAVE_SCX_ENQ_WAKEUP
#define HAVE_SCX_ENQ_HEAD
#define HAVE_SCX_ENQ_CPU_SELECTED
#define HAVE_SCX_ENQ_PREEMPT
#define HAVE_SCX_ENQ_REENQ
#define HAVE_SCX_ENQ_LAST
#define HAVE___SCX_ENQ_INTERNAL_MASK
#define HAVE_SCX_ENQ_CLEAR_OPSS
#define HAVE_SCX_ENQ_DSQ_PRIQ
#define HAVE_SCX_TASK_DSQ_ON_PRIQ
#define HAVE_SCX_TASK_QUEUED
#define HAVE_SCX_TASK_RESET_RUNNABLE_AT
#define HAVE_SCX_TASK_DEQD_FOR_SLEEP
#define HAVE_SCX_TASK_STATE_SHIFT
#define HAVE_SCX_TASK_STATE_BITS
#define HAVE_SCX_TASK_STATE_MASK
#define HAVE_SCX_TASK_CURSOR
#define HAVE_SCX_ECODE_RSN_HOTPLUG
#define HAVE_SCX_ECODE_ACT_RESTART
#define HAVE_SCX_EXIT_NONE
#define HAVE_SCX_EXIT_DONE
#define HAVE_SCX_EXIT_UNREG
#define HAVE_SCX_EXIT_UNREG_BPF
#define HAVE_SCX_EXIT_UNREG_KERN
#define HAVE_SCX_EXIT_SYSRQ
#define HAVE_SCX_EXIT_ERROR
#define HAVE_SCX_EXIT_ERROR_BPF
#define HAVE_SCX_EXIT_ERROR_STALL
#define HAVE_SCX_KF_UNLOCKED
#define HAVE_SCX_KF_CPU_RELEASE
#define HAVE_SCX_KF_DISPATCH
#define HAVE_SCX_KF_ENQUEUE
#define HAVE_SCX_KF_SELECT_CPU
#define HAVE_SCX_KF_REST
#define HAVE___SCX_KF_RQ_LOCKED
#define HAVE___SCX_KF_TERMINAL
#define HAVE_SCX_KICK_IDLE
#define HAVE_SCX_KICK_PREEMPT
#define HAVE_SCX_KICK_WAIT
#define HAVE_SCX_OPI_BEGIN
#define HAVE_SCX_OPI_NORMAL_BEGIN
#define HAVE_SCX_OPI_NORMAL_END
#define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN
#define HAVE_SCX_OPI_CPU_HOTPLUG_END
#define HAVE_SCX_OPI_END
#define HAVE_SCX_OPS_ENABLING
#define HAVE_SCX_OPS_ENABLED
#define HAVE_SCX_OPS_DISABLING
#define HAVE_SCX_OPS_DISABLED
#define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE
#define HAVE_SCX_OPS_ENQ_LAST
#define HAVE_SCX_OPS_ENQ_EXITING
#define HAVE_SCX_OPS_SWITCH_PARTIAL
#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT
#define HAVE_SCX_OPS_ALL_FLAGS
#define HAVE_SCX_OPSS_NONE
#define HAVE_SCX_OPSS_QUEUEING
#define HAVE_SCX_OPSS_QUEUED
#define HAVE_SCX_OPSS_DISPATCHING
#define HAVE_SCX_OPSS_QSEQ_SHIFT
#define HAVE_SCX_PICK_IDLE_CORE
#define HAVE_SCX_OPS_NAME_LEN
#define HAVE_SCX_SLICE_DFL
#define HAVE_SCX_SLICE_INF
#define HAVE_SCX_RQ_ONLINE
#define HAVE_SCX_RQ_CAN_STOP_TICK
#define HAVE_SCX_RQ_BAL_PENDING
#define HAVE_SCX_RQ_BAL_KEEP
#define HAVE_SCX_RQ_BYPASSING
#define HAVE_SCX_RQ_IN_WAKEUP
#define HAVE_SCX_RQ_IN_BALANCE
#define HAVE_SCX_TASK_NONE
#define HAVE_SCX_TASK_INIT
#define HAVE_SCX_TASK_READY
#define HAVE_SCX_TASK_ENABLED
#define HAVE_SCX_TASK_NR_STATES
#define HAVE_SCX_TG_ONLINE
#define HAVE_SCX_TG_INITED
#define HAVE_SCX_WAKE_FORK
#define HAVE_SCX_WAKE_TTWU
#define HAVE_SCX_WAKE_SYNC
#endif /* __ENUM_DEFS_AUTOGEN_H__ */

View File

@@ -10,6 +10,7 @@
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <assert.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
@@ -60,14 +61,22 @@ int main(int argc, char **argv)
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
break;
case 'c':
skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
case 'c': {
u32 central_cpu = strtoul(optarg, NULL, 0);
if (central_cpu >= skel->rodata->nr_cpu_ids) {
fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids);
return -1;
}
skel->rodata->central_cpu = (s32)central_cpu;
break;
}
case 'v':
verbose = true;
break;
@@ -96,7 +105,7 @@ int main(int argc, char **argv)
*/
cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
CPU_ZERO(cpuset);
CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset);
CPU_SET(skel->rodata->central_cpu, cpuset);
SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset),
"Failed to affinitize to central CPU %d (max %d)",

View File

@@ -231,7 +231,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
}
/* if select_cpu() wasn't called, try direct dispatch */
if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
@@ -763,6 +763,8 @@ static void dump_shared_dsq(void)
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
struct scx_event_stats events;
bpf_rcu_read_lock();
dispatch_highpri(true);
bpf_rcu_read_unlock();
@@ -772,6 +774,25 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
if (print_shared_dsq)
dump_shared_dsq();
__COMPAT_scx_bpf_events(&events, sizeof(events));
bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
bpf_printk("%35s: %lld", "SCX_EV_ENQ_SLICE_DFL",
scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL));
bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
scx_read_event(&events, SCX_EV_BYPASS_DURATION));
bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
return 0;
}

View File

@@ -172,6 +172,7 @@ auto-test-targets := \
maximal \
maybe_null \
minimal \
numa \
prog_run \
reload_loop \
select_cpu_dfl \

View File

@@ -0,0 +1,100 @@
// SPDX-License-Identifier: GPL-2.0
/*
* A scheduler that validates the behavior of the NUMA-aware
* functionalities.
*
* The scheduler creates a separate DSQ for each NUMA node, ensuring tasks
* are exclusively processed by CPUs within their respective nodes. Idle
* CPUs are selected only within the same node, so task migration can only
* occur between CPUs belonging to the same node.
*
* Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
*/
#include <scx/common.bpf.h>
char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei);
const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE;
static bool is_cpu_idle(s32 cpu, int node)
{
const struct cpumask *idle_cpumask;
bool idle;
idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node);
idle = bpf_cpumask_test_cpu(cpu, idle_cpumask);
scx_bpf_put_cpumask(idle_cpumask);
return idle;
}
s32 BPF_STRUCT_OPS(numa_select_cpu,
struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
s32 cpu;
/*
* We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here,
* since it already tries to pick an idle CPU within the node
* first, but let's use both functions for better testing coverage.
*/
cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
__COMPAT_SCX_PICK_IDLE_IN_NODE);
if (cpu < 0)
cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node,
__COMPAT_SCX_PICK_IDLE_IN_NODE);
if (is_cpu_idle(cpu, node))
scx_bpf_error("CPU %d should be marked as busy", cpu);
if (__COMPAT_scx_bpf_cpu_node(cpu) != node)
scx_bpf_error("CPU %d should be in node %d", cpu, node);
return cpu;
}
void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags)
{
int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags);
}
void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev)
{
int node = __COMPAT_scx_bpf_cpu_node(cpu);
scx_bpf_dsq_move_to_local(node);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init)
{
int node, err;
bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
err = scx_bpf_create_dsq(node, node);
if (err)
return err;
}
return 0;
}
void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
SEC(".struct_ops.link")
struct sched_ext_ops numa_ops = {
.select_cpu = (void *)numa_select_cpu,
.enqueue = (void *)numa_enqueue,
.dispatch = (void *)numa_dispatch,
.init = (void *)numa_init,
.exit = (void *)numa_exit,
.name = "numa",
};

View File

@@ -0,0 +1,59 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
*/
#include <bpf/bpf.h>
#include <scx/common.h>
#include <sys/wait.h>
#include <unistd.h>
#include "numa.bpf.skel.h"
#include "scx_test.h"
static enum scx_test_status setup(void **ctx)
{
struct numa *skel;
skel = numa__open();
SCX_FAIL_IF(!skel, "Failed to open");
SCX_ENUM_INIT(skel);
skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE;
skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE;
SCX_FAIL_IF(numa__load(skel), "Failed to load skel");
*ctx = skel;
return SCX_TEST_PASS;
}
static enum scx_test_status run(void *ctx)
{
struct numa *skel = ctx;
struct bpf_link *link;
link = bpf_map__attach_struct_ops(skel->maps.numa_ops);
SCX_FAIL_IF(!link, "Failed to attach scheduler");
/* Just sleeping is fine, plenty of scheduling events happening */
sleep(1);
SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
bpf_link__destroy(link);
return SCX_TEST_PASS;
}
static void cleanup(void *ctx)
{
struct numa *skel = ctx;
numa__destroy(skel);
}
struct scx_test numa = {
.name = "numa",
.description = "Verify NUMA-aware functionalities",
.setup = setup,
.run = run,
.cleanup = cleanup,
};
REGISTER_SCX_TEST(&numa)